Add GoldCorpus class, to manage data streaming

2017-05-21 09:06:17 -05:00 · 2017-05-21 09:06:17 -05:00 · 4803b3b69e
parent 180e5afede
commit 4803b3b69e
1 changed files with 85 additions and 4 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -5,10 +5,12 @@ from __future__ import unicode_literals, print_function
 import io
 import re
 import ujson
 import random
 from .syntax import nonproj
 from .util import ensure_path
 from . import util
 from .tokens import Doc
 def tags_to_entities(tags):
@ -139,8 +141,89 @@ def _min_edit_path(cand_words, gold_words):
    return prev_costs[n_gold], previous_row[-1]
-def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
+class GoldCorpus(object):
-    make_supertags = util.env_opt('make_supertags', make_supertags)
+    '''An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing, NER.'''
    def __init__(self, train_path, dev_path):
        self.train_path = util.ensure_path(train_path)
        self.dev_path = util.ensure_path(dev_path)
        self.train_locs = self.walk_corpus(self.train_path)
        self.dev_locs = self.walk_corpus(self.train_path)
    @property
    def train_tuples(self):
        for loc in self.train_locs:
            gold_tuples = read_json_file(loc)
            yield from gold_tuples
    @property
    def dev_tuples(self):
        for loc in self.dev_locs:
            gold_tuples = read_json_file(loc)
            yield from gold_tuples
    def count_train(self):
        n = 0
        for _ in self.train_tuples:
            n += 1
        return n
    def train_docs(self, nlp, shuffle=0):
        if shuffle:
            random.shuffle(self.train_locs)
        gold_docs = self.iter_gold_docs(nlp, self.train_tuples)
        if shuffle:
            gold_docs = util.itershuffle(gold_docs, bufsize=shuffle*5000)
        yield from gold_docs
    def dev_docs(self, nlp):
        yield from self.iter_gold_docs(nlp, self.dev_tuples)
    @classmethod
    def iter_gold_docs(cls, nlp, tuples):
        for raw_text, paragraph_tuples in tuples:
            docs = cls._make_docs(nlp, raw_text, paragraph_tuples)
            golds = cls._make_golds(docs, paragraph_tuples)
            for doc, gold in zip(docs, golds):
                yield doc, gold
    @classmethod
    def _make_docs(cls, nlp, raw_text, paragraph_tuples):
        if raw_text is not None:
            return [nlp.make_doc(raw_text)]
        else:
            return [
                Doc(nlp.vocab, words=sent_tuples[0][1])
                for sent_tuples in paragraph_tuples]
    @classmethod
    def _make_golds(cls, docs, paragraph_tuples):
        if len(docs) == 1:
            return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
                    for sent_tuples in paragraph_tuples]
        else:
            return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
    @staticmethod
    def walk_corpus(path):
        locs = []
        paths = [path]
        seen = set()
        for path in paths:
            if str(path) in seen:
                continue
            seen.add(str(path))
            if path.parts[-1].startswith('.'):
                continue
            elif path.is_dir():
                paths.extend(path.iterdir())
            elif path.parts[-1].endswith('.json'):
                locs.append(path)
        return locs
 def read_json_file(loc, docs_filter=None, limit=None):
    loc = ensure_path(loc)
    if loc.is_dir():
        for filename in loc.iterdir():
@ -173,8 +256,6 @@ def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
                        if labels[-1].lower() == 'root':
                            labels[-1] = 'ROOT'
                        ner.append(token.get('ner', '-'))
                        if make_supertags:
                            tags[-1] = '-'.join((tags[-1], labels[-1], ner[-1]))
                    sents.append([
                        [ids, words, tags, heads, labels, ner],
                        sent.get('brackets', [])])