diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 45b95b379..bc34290f4 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -142,9 +142,14 @@ def _min_edit_path(cand_words, gold_words): class GoldCorpus(object): - '''An annotated corpus, using the JSON file format. Manages - annotations for tagging, dependency parsing, NER.''' + """An annotated corpus, using the JSON file format. Manages + annotations for tagging, dependency parsing and NER.""" def __init__(self, train_path, dev_path): + """Create a GoldCorpus. + + train_path (unicode or Path): File or directory of training data. + dev_path (unicode or Path): File or directory of development data. + """ self.train_path = util.ensure_path(train_path) self.dev_path = util.ensure_path(dev_path) self.train_locs = self.walk_corpus(self.train_path) diff --git a/spacy/language.py b/spacy/language.py index 58cee80ac..37f7ae207 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -236,6 +236,12 @@ class Language(object): doc.tensor = None def preprocess_gold(self, docs_golds): + """Can be called before training to pre-process gold data. By default, + it handles nonprojectivity and adds missing tags to the tag map. + + docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. + YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. 
+ """ for proc in self.pipeline: if hasattr(proc, 'preprocess_gold'): docs_golds = proc.preprocess_gold(docs_golds) diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index 900a42553..443ee9a67 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -23,7 +23,8 @@ "Lexeme": "lexeme", "Vocab": "vocab", "StringStore": "stringstore", - "GoldParse": "goldparse" + "GoldParse": "goldparse", + "GoldCorpus": "goldcorpus" }, "Other": { "Annotation Specs": "annotation", @@ -135,6 +136,11 @@ "tag": "class" }, + "goldcorpus": { + "title": "GoldCorpus", + "tag": "class" + }, + "annotation": { "title": "Annotation Specifications" }, diff --git a/website/docs/api/goldcorpus.jade b/website/docs/api/goldcorpus.jade new file mode 100644 index 000000000..bfff92ad5 --- /dev/null +++ b/website/docs/api/goldcorpus.jade @@ -0,0 +1,23 @@ +//- 💫 DOCS > API > GOLDCORPUS + +include ../../_includes/_mixins + +p + | An annotated corpus, using the JSON file format. Manages annotations for + | tagging, dependency parsing and NER. + ++h(2, "init") GoldCorpus.__init__ + +tag method + +p Create a #[code GoldCorpus]. + ++table(["Name", "Type", "Description"]) + +row + +cell #[code train_path] + +cell unicode or #[code Path] + +cell File or directory of training data. + + +row + +cell #[code dev_path] + +cell unicode or #[code Path] + +cell File or directory of development data. diff --git a/website/docs/api/goldparse.jade b/website/docs/api/goldparse.jade index f39558b35..7818912c3 100644 --- a/website/docs/api/goldparse.jade +++ b/website/docs/api/goldparse.jade @@ -7,7 +7,7 @@ p Collection for training annotations. +h(2, "init") GoldParse.__init__ +tag method -p Create a GoldParse. +p Create a #[code GoldParse]. 
+table(["Name", "Type", "Description"]) +row diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 7f6e0829d..455165bca 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -82,6 +82,41 @@ p +cell #[code Doc] +cell A container for accessing the annotations. ++h(2, "pipe") Language.pipe + +tag method + +p + | Process texts as a stream, and yield #[code Doc] objects in order. + | Supports GIL-free multi-threading. + ++aside-code("Example"). + texts = [u'One document.', u'...', u'Lots of documents'] + for doc in nlp.pipe(texts, batch_size=50, n_threads=4): + assert doc.is_parsed + ++table(["Name", "Type", "Description"]) + +row + +cell #[code texts] + +cell - + +cell A sequence of unicode objects. + + +row + +cell #[code n_threads] + +cell int + +cell + | The number of worker threads to use. If #[code -1], OpenMP will + | decide how many to use at run time. Default is #[code 2]. + + +row + +cell #[code batch_size] + +cell int + +cell The number of texts to buffer. + + +footrow + +cell yields + +cell #[code Doc] + +cell Documents in the order of the original text. + +h(2, "update") Language.update +tag method @@ -172,40 +207,23 @@ p +cell - +cell Config parameters. -+h(2, "pipe") Language.pipe - +tag method ++h(2, "preprocess_gold") Language.preprocess_gold p - | Process texts as a stream, and yield #[code Doc] objects in order. - | Supports GIL-free multi-threading. + | Can be called before training to pre-process gold data. By default, it + | handles nonprojectivity and adds missing tags to the tag map. -+aside-code("Example"). - texts = [u'One document.', u'...', u'Lots of documents'] - for doc in nlp.pipe(texts, batch_size=50, n_threads=4): - assert doc.is_parsed +table(["Name", "Type", "Description"]) +row - +cell #[code texts] - +cell - - +cell A sequence of unicode objects. - - +row - +cell #[code n_threads] - +cell int - +cell - | The number of worker threads to use. 
If #[code -1], OpenMP will - | decide how many to use at run time. Default is #[code 2]. - - +row - +cell #[code batch_size] - +cell int - +cell The number of texts to buffer. + +cell #[code docs_golds] + +cell iterable + +cell Tuples of #[code Doc] and #[code GoldParse] objects. +footrow +cell yields - +cell #[code Doc] - +cell Documents in the order of the original text. + +cell tuple + +cell Tuples of preprocessed #[code Doc] and #[code GoldParse] objects. +h(2, "to_disk") Language.to_disk +tag method