mirror of https://github.com/explosion/spaCy.git
Add GoldCorpus class, to manage data streaming
This commit is contained in:
parent
180e5afede
commit
4803b3b69e
|
@ -5,10 +5,12 @@ from __future__ import unicode_literals, print_function
|
||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
import ujson
|
import ujson
|
||||||
|
import random
|
||||||
|
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .util import ensure_path
|
from .util import ensure_path
|
||||||
from . import util
|
from . import util
|
||||||
|
from .tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
def tags_to_entities(tags):
|
def tags_to_entities(tags):
|
||||||
|
@ -139,8 +141,89 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
return prev_costs[n_gold], previous_row[-1]
|
return prev_costs[n_gold], previous_row[-1]
|
||||||
|
|
||||||
|
|
||||||
def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
|
class GoldCorpus(object):
    """An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing, NER.

    The corpus is described by a training path and a development path.
    Each path is walked once, at construction time, to collect the
    ``.json`` files below it. The files themselves are only read when one
    of the tuple/doc generators is consumed, so the data is streamed
    rather than held in memory.
    """

    def __init__(self, train_path, dev_path):
        """Locate the training and development files.

        train_path: File or directory containing the training data.
        dev_path: File or directory containing the development data.
        """
        self.train_path = util.ensure_path(train_path)
        self.dev_path = util.ensure_path(dev_path)
        self.train_locs = self.walk_corpus(self.train_path)
        # Walk the development path. Walking train_path a second time here
        # would silently make the dev set identical to the training set.
        self.dev_locs = self.walk_corpus(self.dev_path)

    @staticmethod
    def _iter_tuples(locs):
        """Yield every item produced by read_json_file for each location."""
        # Explicit loops rather than `yield from`: the module imports
        # unicode_literals/print_function from __future__, and `yield from`
        # does not parse on Python 2.
        for loc in locs:
            for item in read_json_file(loc):
                yield item

    @property
    def train_tuples(self):
        """Stream the annotation tuples of the training files.

        Each access returns a fresh generator that re-reads the files.
        """
        return self._iter_tuples(self.train_locs)

    @property
    def dev_tuples(self):
        """Stream the annotation tuples of the development files.

        Each access returns a fresh generator that re-reads the files.
        """
        return self._iter_tuples(self.dev_locs)

    def count_train(self):
        """Return the number of items yielded by `train_tuples`.

        This reads every training file once.
        """
        return sum(1 for _ in self.train_tuples)

    def train_docs(self, nlp, shuffle=0):
        """Yield (doc, gold) pairs built from the training data.

        nlp: Object providing `make_doc` and `vocab`, used to create docs.
        shuffle: If truthy, the list of training files is shuffled in place
            and the stream is passed through `util.itershuffle` with a
            buffer size of `shuffle * 5000`.
        """
        if shuffle:
            # In-place: the new file order persists on the instance.
            random.shuffle(self.train_locs)
        gold_docs = self.iter_gold_docs(nlp, self.train_tuples)
        if shuffle:
            gold_docs = util.itershuffle(gold_docs, bufsize=shuffle*5000)
        for doc_and_gold in gold_docs:
            yield doc_and_gold

    def dev_docs(self, nlp):
        """Yield (doc, gold) pairs built from the development data.

        nlp: Object providing `make_doc` and `vocab`, used to create docs.
        """
        for doc_and_gold in self.iter_gold_docs(nlp, self.dev_tuples):
            yield doc_and_gold

    @classmethod
    def iter_gold_docs(cls, nlp, tuples):
        """Convert (raw_text, paragraph_tuples) items into (doc, gold) pairs.

        nlp: Object providing `make_doc` and `vocab`.
        tuples: Iterable of (raw_text, paragraph_tuples) pairs.
        """
        for raw_text, paragraph_tuples in tuples:
            docs = cls._make_docs(nlp, raw_text, paragraph_tuples)
            golds = cls._make_golds(docs, paragraph_tuples)
            # zip() stops at the shorter sequence, so surplus golds (or
            # docs) are dropped without error.
            for doc, gold in zip(docs, golds):
                yield doc, gold

    @classmethod
    def _make_docs(cls, nlp, raw_text, paragraph_tuples):
        """Return the list of docs for one paragraph.

        With raw text available, a single doc is made from it via
        `nlp.make_doc`. Otherwise one Doc per sentence is built from the
        sentence's word list.
        """
        if raw_text is not None:
            return [nlp.make_doc(raw_text)]
        # sent_tuples[0] is the annotation list whose second entry holds
        # the words.
        return [Doc(nlp.vocab, words=sent_tuples[0][1])
                for sent_tuples in paragraph_tuples]

    @classmethod
    def _make_golds(cls, docs, paragraph_tuples):
        """Return the GoldParse objects for one paragraph's docs."""
        if len(docs) == 1:
            # NOTE(review): every sentence's annotations are paired with
            # the same single doc here, and iter_gold_docs then zips the
            # result against a one-element list, so only the first gold is
            # used. Confirm this is intended for multi-sentence paragraphs
            # that come with raw text.
            only_doc = docs[0]
            return [GoldParse.from_annot_tuples(only_doc, sent_tuples[0])
                    for sent_tuples in paragraph_tuples]
        return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
                for doc, sent_tuples in zip(docs, paragraph_tuples)]

    @staticmethod
    def walk_corpus(path):
        """Collect the ``.json`` files at or below `path`.

        path: A pathlib-style path to a file or directory.

        Entries whose name starts with '.' are skipped, and hidden
        directories are not descended into. Returns a list of paths, in
        breadth-first order.
        """
        locs = []
        paths = [path]
        seen = set()
        # `paths` doubles as a work queue: directories append their
        # children while the list is being iterated, and the loop picks
        # them up on later iterations.
        for path in paths:
            key = str(path)
            if key in seen:
                continue
            seen.add(key)
            # `path.name` instead of `path.parts[-1]`: identical for
            # ordinary paths, but also defined for Path('.'), whose
            # `parts` tuple is empty.
            name = path.name
            if name.startswith('.'):
                continue
            elif path.is_dir():
                paths.extend(path.iterdir())
            elif name.endswith('.json'):
                locs.append(path)
        return locs
|
||||||
|
|
||||||
|
|
||||||
|
def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
loc = ensure_path(loc)
|
loc = ensure_path(loc)
|
||||||
if loc.is_dir():
|
if loc.is_dir():
|
||||||
for filename in loc.iterdir():
|
for filename in loc.iterdir():
|
||||||
|
@ -173,8 +256,6 @@ def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
|
||||||
if labels[-1].lower() == 'root':
|
if labels[-1].lower() == 'root':
|
||||||
labels[-1] = 'ROOT'
|
labels[-1] = 'ROOT'
|
||||||
ner.append(token.get('ner', '-'))
|
ner.append(token.get('ner', '-'))
|
||||||
if make_supertags:
|
|
||||||
tags[-1] = '-'.join((tags[-1], labels[-1], ner[-1]))
|
|
||||||
sents.append([
|
sents.append([
|
||||||
[ids, words, tags, heads, labels, ner],
|
[ids, words, tags, heads, labels, ner],
|
||||||
sent.get('brackets', [])])
|
sent.get('brackets', [])])
|
||||||
|
|
Loading…
Reference in New Issue