mirror of https://github.com/explosion/spaCy.git
Tidy up gold
This commit is contained in:
parent
6a0483b7aa
commit
a6135336f5
|
@ -54,7 +54,8 @@ def merge_sents(sents):
|
||||||
m_deps[3].extend(head + i for head in heads)
|
m_deps[3].extend(head + i for head in heads)
|
||||||
m_deps[4].extend(labels)
|
m_deps[4].extend(labels)
|
||||||
m_deps[5].extend(ner)
|
m_deps[5].extend(ner)
|
||||||
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
|
m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
|
||||||
|
for b in brackets)
|
||||||
i += len(ids)
|
i += len(ids)
|
||||||
return [(m_deps, m_brackets)]
|
return [(m_deps, m_brackets)]
|
||||||
|
|
||||||
|
@ -80,6 +81,8 @@ def align(cand_words, gold_words):
|
||||||
|
|
||||||
|
|
||||||
punct_re = re.compile(r'\W')
|
punct_re = re.compile(r'\W')
|
||||||
|
|
||||||
|
|
||||||
def _min_edit_path(cand_words, gold_words):
|
def _min_edit_path(cand_words, gold_words):
|
||||||
cdef:
|
cdef:
|
||||||
Pool mem
|
Pool mem
|
||||||
|
@ -98,9 +101,9 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
mem = Pool()
|
mem = Pool()
|
||||||
n_cand = len(cand_words)
|
n_cand = len(cand_words)
|
||||||
n_gold = len(gold_words)
|
n_gold = len(gold_words)
|
||||||
# Levenshtein distance, except we need the history, and we may want different
|
# Levenshtein distance, except we need the history, and we may want
|
||||||
# costs.
|
# different costs. Mark operations with a string, and score the history
|
||||||
# Mark operations with a string, and score the history using _edit_cost.
|
# using _edit_cost.
|
||||||
previous_row = []
|
previous_row = []
|
||||||
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||||
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||||
|
@ -144,9 +147,9 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
|
|
||||||
|
|
||||||
def minibatch(items, size=8):
|
def minibatch(items, size=8):
|
||||||
'''Iterate over batches of items. `size` may be an iterator,
|
"""Iterate over batches of items. `size` may be an iterator,
|
||||||
so that batch-size can vary on each step.
|
so that batch-size can vary on each step.
|
||||||
'''
|
"""
|
||||||
if isinstance(size, int):
|
if isinstance(size, int):
|
||||||
size_ = itertools.repeat(8)
|
size_ = itertools.repeat(8)
|
||||||
else:
|
else:
|
||||||
|
@ -168,6 +171,7 @@ class GoldCorpus(object):
|
||||||
|
|
||||||
train_path (unicode or Path): File or directory of training data.
|
train_path (unicode or Path): File or directory of training data.
|
||||||
dev_path (unicode or Path): File or directory of development data.
|
dev_path (unicode or Path): File or directory of development data.
|
||||||
|
RETURNS (GoldCorpus): The newly created object.
|
||||||
"""
|
"""
|
||||||
self.train_path = util.ensure_path(train_path)
|
self.train_path = util.ensure_path(train_path)
|
||||||
self.dev_path = util.ensure_path(dev_path)
|
self.dev_path = util.ensure_path(dev_path)
|
||||||
|
@ -222,7 +226,6 @@ class GoldCorpus(object):
|
||||||
|
|
||||||
def dev_docs(self, nlp, gold_preproc=False):
|
def dev_docs(self, nlp, gold_preproc=False):
|
||||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||||
#gold_docs = nlp.preprocess_gold(gold_docs)
|
|
||||||
yield from gold_docs
|
yield from gold_docs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -233,7 +236,6 @@ class GoldCorpus(object):
|
||||||
raw_text = None
|
raw_text = None
|
||||||
else:
|
else:
|
||||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||||
|
|
||||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||||
gold_preproc, noise_level=noise_level)
|
gold_preproc, noise_level=noise_level)
|
||||||
golds = cls._make_golds(docs, paragraph_tuples)
|
golds = cls._make_golds(docs, paragraph_tuples)
|
||||||
|
@ -248,17 +250,20 @@ class GoldCorpus(object):
|
||||||
raw_text = add_noise(raw_text, noise_level)
|
raw_text = add_noise(raw_text, noise_level)
|
||||||
return [nlp.make_doc(raw_text)]
|
return [nlp.make_doc(raw_text)]
|
||||||
else:
|
else:
|
||||||
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
|
return [Doc(nlp.vocab,
|
||||||
|
words=add_noise(sent_tuples[1], noise_level))
|
||||||
for (sent_tuples, brackets) in paragraph_tuples]
|
for (sent_tuples, brackets) in paragraph_tuples]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _make_golds(cls, docs, paragraph_tuples):
|
def _make_golds(cls, docs, paragraph_tuples):
|
||||||
assert len(docs) == len(paragraph_tuples)
|
assert len(docs) == len(paragraph_tuples)
|
||||||
if len(docs) == 1:
|
if len(docs) == 1:
|
||||||
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
|
return [GoldParse.from_annot_tuples(docs[0],
|
||||||
|
paragraph_tuples[0][0])]
|
||||||
else:
|
else:
|
||||||
return [GoldParse.from_annot_tuples(doc, sent_tuples)
|
return [GoldParse.from_annot_tuples(doc, sent_tuples)
|
||||||
for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
|
for doc, (sent_tuples, brackets)
|
||||||
|
in zip(docs, paragraph_tuples)]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def walk_corpus(path):
|
def walk_corpus(path):
|
||||||
|
@ -330,9 +335,9 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
for i, token in enumerate(sent['tokens']):
|
for i, token in enumerate(sent['tokens']):
|
||||||
words.append(token['orth'])
|
words.append(token['orth'])
|
||||||
ids.append(i)
|
ids.append(i)
|
||||||
tags.append(token.get('tag','-'))
|
tags.append(token.get('tag', '-'))
|
||||||
heads.append(token.get('head',0) + i)
|
heads.append(token.get('head', 0) + i)
|
||||||
labels.append(token.get('dep',''))
|
labels.append(token.get('dep', ''))
|
||||||
# Ensure ROOT label is case-insensitive
|
# Ensure ROOT label is case-insensitive
|
||||||
if labels[-1].lower() == 'root':
|
if labels[-1].lower() == 'root':
|
||||||
labels[-1] = 'ROOT'
|
labels[-1] = 'ROOT'
|
||||||
|
@ -382,19 +387,21 @@ cdef class GoldParse:
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
|
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
|
||||||
_, words, tags, heads, deps, entities = annot_tuples
|
_, words, tags, heads, deps, entities = annot_tuples
|
||||||
return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
|
return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
|
||||||
make_projective=make_projective)
|
entities=entities, make_projective=make_projective)
|
||||||
|
|
||||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
def __init__(self, doc, annot_tuples=None, words=None, tags=None,
|
||||||
deps=None, entities=None, make_projective=False,
|
heads=None, deps=None, entities=None, make_projective=False,
|
||||||
cats=None):
|
cats=None):
|
||||||
"""Create a GoldParse.
|
"""Create a GoldParse.
|
||||||
|
|
||||||
doc (Doc): The document the annotations refer to.
|
doc (Doc): The document the annotations refer to.
|
||||||
words (iterable): A sequence of unicode word strings.
|
words (iterable): A sequence of unicode word strings.
|
||||||
tags (iterable): A sequence of strings, representing tag annotations.
|
tags (iterable): A sequence of strings, representing tag annotations.
|
||||||
heads (iterable): A sequence of integers, representing syntactic head offsets.
|
heads (iterable): A sequence of integers, representing syntactic
|
||||||
deps (iterable): A sequence of strings, representing the syntactic relation types.
|
head offsets.
|
||||||
|
deps (iterable): A sequence of strings, representing the syntactic
|
||||||
|
relation types.
|
||||||
entities (iterable): A sequence of named entity annotations, either as
|
entities (iterable): A sequence of named entity annotations, either as
|
||||||
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
||||||
representing the entity positions.
|
representing the entity positions.
|
||||||
|
@ -404,9 +411,10 @@ cdef class GoldParse:
|
||||||
document (usually a sentence). Unlike entity annotations, label
|
document (usually a sentence). Unlike entity annotations, label
|
||||||
annotations can overlap, i.e. a single word can be covered by
|
annotations can overlap, i.e. a single word can be covered by
|
||||||
multiple labelled spans. The TextCategorizer component expects
|
multiple labelled spans. The TextCategorizer component expects
|
||||||
true examples of a label to have the value 1.0, and negative examples
|
true examples of a label to have the value 1.0, and negative
|
||||||
of a label to have the value 0.0. Labels not in the dictionary are
|
examples of a label to have the value 0.0. Labels not in the
|
||||||
treated as missing -- the gradient for those labels will be zero.
|
dictionary are treated as missing - the gradient for those labels
|
||||||
|
will be zero.
|
||||||
RETURNS (GoldParse): The newly constructed object.
|
RETURNS (GoldParse): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
if words is None:
|
if words is None:
|
||||||
|
@ -470,11 +478,11 @@ cdef class GoldParse:
|
||||||
self.ner[i] = entities[gold_i]
|
self.ner[i] = entities[gold_i]
|
||||||
|
|
||||||
cycle = nonproj.contains_cycle(self.heads)
|
cycle = nonproj.contains_cycle(self.heads)
|
||||||
if cycle != None:
|
if cycle is not None:
|
||||||
raise Exception("Cycle found: %s" % cycle)
|
raise Exception("Cycle found: %s" % cycle)
|
||||||
|
|
||||||
if make_projective:
|
if make_projective:
|
||||||
proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
|
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
|
||||||
self.heads = proj_heads
|
self.heads = proj_heads
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
|
@ -497,20 +505,19 @@ cdef class GoldParse:
|
||||||
|
|
||||||
|
|
||||||
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
||||||
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
"""Encode labelled spans into per-token tags, using the
|
||||||
scheme (BILUO).
|
Begin/In/Last/Unit/Out scheme (BILUO).
|
||||||
|
|
||||||
doc (Doc): The document that the entity offsets refer to. The output tags
|
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||||
will refer to the token boundaries within the document.
|
will refer to the token boundaries within the document.
|
||||||
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
|
entities (iterable): A sequence of `(start, end, label)` triples. `start`
|
||||||
`end` should be character-offset integers denoting the slice into the
|
and `end` should be character-offset integers denoting the slice into
|
||||||
original string.
|
the original string.
|
||||||
|
|
||||||
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||||
string will be of the form either "", "O" or "{action}-{label}", where
|
string will be of the form either "", "O" or "{action}-{label}", where
|
||||||
action is one of "B", "I", "L", "U". The string "-" is used where the
|
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||||
entity offsets don't align with the tokenization in the `Doc` object. The
|
entity offsets don't align with the tokenization in the `Doc` object.
|
||||||
training algorithm will view these as missing values. "O" denotes a
|
The training algorithm will view these as missing values. "O" denotes a
|
||||||
non-entity token. "B" denotes the beginning of a multi-token entity,
|
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||||
|
|
Loading…
Reference in New Issue