mirror of https://github.com/explosion/spaCy.git
Update API docs with changes in spacy.gold and spacy.language
This commit is contained in:
parent
b5fb43fdd8
commit
54f04a9fe0
|
@ -142,9 +142,14 @@ def _min_edit_path(cand_words, gold_words):
|
|||
|
||||
|
||||
class GoldCorpus(object):
|
||||
'''An annotated corpus, using the JSON file format. Manages
|
||||
annotations for tagging, dependency parsing, NER.'''
|
||||
"""An annotated corpus, using the JSON file format. Manages
|
||||
annotations for tagging, dependency parsing and NER."""
|
||||
def __init__(self, train_path, dev_path):
|
||||
"""Create a GoldCorpus.
|
||||
|
||||
train_path (unicode or Path): File or directory of training data.
|
||||
dev_path (unicode or Path): File or directory of development data.
|
||||
"""
|
||||
self.train_path = util.ensure_path(train_path)
|
||||
self.dev_path = util.ensure_path(dev_path)
|
||||
self.train_locs = self.walk_corpus(self.train_path)
|
||||
|
|
|
@ -236,6 +236,12 @@ class Language(object):
|
|||
doc.tensor = None
|
||||
|
||||
def preprocess_gold(self, docs_golds):
|
||||
"""Can be called before training to pre-process gold data. By default,
|
||||
it handles nonprojectivity and adds missing tags to the tag map.
|
||||
|
||||
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
|
||||
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
|
||||
"""
|
||||
for proc in self.pipeline:
|
||||
if hasattr(proc, 'preprocess_gold'):
|
||||
docs_golds = proc.preprocess_gold(docs_golds)
|
||||
|
|
|
@ -23,7 +23,8 @@
|
|||
"Lexeme": "lexeme",
|
||||
"Vocab": "vocab",
|
||||
"StringStore": "stringstore",
|
||||
"GoldParse": "goldparse"
|
||||
"GoldParse": "goldparse",
|
||||
"GoldCorpus": "goldcorpus"
|
||||
},
|
||||
"Other": {
|
||||
"Annotation Specs": "annotation",
|
||||
|
@ -135,6 +136,11 @@
|
|||
"tag": "class"
|
||||
},
|
||||
|
||||
"goldcorpus": {
|
||||
"title": "GoldCorpus",
|
||||
"tag": "class"
|
||||
},
|
||||
|
||||
"annotation": {
|
||||
"title": "Annotation Specifications"
|
||||
},
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
//- 💫 DOCS > API > GOLDCORPUS
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p
|
||||
| An annotated corpus, using the JSON file format. Manages annotations for
|
||||
| tagging, dependency parsing and NER.
|
||||
|
||||
+h(2, "init") GoldCorpus.__init__
|
||||
+tag method
|
||||
|
||||
p Create a #[code GoldCorpus].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code train_path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell File or directory of training data.
|
||||
|
||||
+row
|
||||
+cell #[code dev_path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell File or directory of development data.
|
|
@ -7,7 +7,7 @@ p Collection for training annotations.
|
|||
+h(2, "init") GoldParse.__init__
|
||||
+tag method
|
||||
|
||||
p Create a GoldParse.
|
||||
p Create a #[code GoldParse].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
|
|
@ -82,6 +82,41 @@ p
|
|||
+cell #[code Doc]
|
||||
+cell A container for accessing the annotations.
|
||||
|
||||
+h(2, "pipe") Language.pipe
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Process texts as a stream, and yield #[code Doc] objects in order.
|
||||
| Supports GIL-free multi-threading.
|
||||
|
||||
+aside-code("Example").
|
||||
texts = [u'One document.', u'...', u'Lots of documents']
|
||||
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||
assert doc.is_parsed
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code texts]
|
||||
+cell -
|
||||
+cell A sequence of unicode objects.
|
||||
|
||||
+row
|
||||
+cell #[code n_threads]
|
||||
+cell int
|
||||
+cell
|
||||
| The number of worker threads to use. If #[code -1], OpenMP will
|
||||
| decide how many to use at run time. Default is #[code 2].
|
||||
|
||||
+row
|
||||
+cell #[code batch_size]
|
||||
+cell int
|
||||
+cell The number of texts to buffer.
|
||||
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents in the order of the original texts.
|
||||
|
||||
+h(2, "update") Language.update
|
||||
+tag method
|
||||
|
||||
|
@ -172,40 +207,23 @@ p
|
|||
+cell -
|
||||
+cell Config parameters.
|
||||
|
||||
+h(2, "pipe") Language.pipe
|
||||
+tag method
|
||||
+h(2, "preprocess_gold") Language.preprocess_gold
|
||||
|
||||
p
|
||||
| Process texts as a stream, and yield #[code Doc] objects in order.
|
||||
| Supports GIL-free multi-threading.
|
||||
| Can be called before training to pre-process gold data. By default, it
|
||||
| handles nonprojectivity and adds missing tags to the tag map.
|
||||
|
||||
+aside-code("Example").
|
||||
texts = [u'One document.', u'...', u'Lots of documents']
|
||||
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||
assert doc.is_parsed
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code texts]
|
||||
+cell -
|
||||
+cell A sequence of unicode objects.
|
||||
|
||||
+row
|
||||
+cell #[code n_threads]
|
||||
+cell int
|
||||
+cell
|
||||
| The number of worker threads to use. If #[code -1], OpenMP will
|
||||
| decide how many to use at run time. Default is #[code 2].
|
||||
|
||||
+row
|
||||
+cell #[code batch_size]
|
||||
+cell int
|
||||
+cell The number of texts to buffer.
|
||||
+cell #[code docs_golds]
|
||||
+cell iterable
|
||||
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
|
||||
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents in the order of the original text.
|
||||
+cell tuple
|
||||
+cell Tuples of preprocessed #[code Doc] and #[code GoldParse] objects.
|
||||
|
||||
+h(2, "to_disk") Language.to_disk
|
||||
+tag method
|
||||
|
|
Loading…
Reference in New Issue