mirror of https://github.com/explosion/spaCy.git

commit c9ec24b257: Merge branch 'master' of https://github.com/explosion/spaCy
@@ -62,10 +62,10 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
     for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
         for doc, gold in epoch:
             trainer.update(doc, gold)
-        dev_scores = trainer.evaluate(dev_data) if dev_data else []
+        dev_scores = trainer.evaluate(dev_data).scores if dev_data else {}
         print_progress(itn, trainer.nlp.parser.model.nr_weight,
                        trainer.nlp.parser.model.nr_active_feat,
-                       **dev_scores.scores)
+                       **dev_scores)
 
 
 def evaluate(Language, gold_tuples, output_path):
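The change above works because ** unpacking requires a mapping: evaluate() is expected to return a Scorer-like object whose .scores attribute is a dict, and the no-dev-data fallback must therefore be {} rather than []. A minimal, self-contained sketch of that pattern; the Scorer class, metric names, and numbers below are illustrative stand-ins, not spaCy's actual API:

    class Scorer(object):
        """Hypothetical stand-in for the object trainer.evaluate() returns."""
        @property
        def scores(self):
            # A plain dict of metric names to values, so it supports ** unpacking.
            return {'uas': 91.2, 'las': 89.5, 'tags_acc': 96.8}

    def print_progress(itn, nr_weight, nr_active_feat, **scores):
        # Accepts arbitrary metric keywords; an empty dict is also fine.
        print(itn, nr_weight, nr_active_feat, scores)

    for dev_data in (['some-docs'], None):
        # With no dev data, {} keeps the ** call valid; a list would raise TypeError.
        dev_scores = Scorer().scores if dev_data else {}
        print_progress(0, 100000, 5000, **dev_scores)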
@@ -98,6 +98,17 @@ p
     |  so that Python functions can be used to help you generalise and combine
     |  the data as you require.
 
++infobox("For languages with non-latin characters")
+    |  In order for the tokenizer to split suffixes, prefixes and infixes, spaCy
+    |  needs to know the language's character set. If the language you're adding
+    |  uses non-latin characters, you might need to add the required character
+    |  classes to the global
+    |  #[+src(gh("spacy", "spacy/language_data/punctuation.py")) punctuation.py].
+    |  spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library]
+    |  to keep this simple and readable. If the language requires very specific
+    |  punctuation rules, you should consider overwriting the default regular
+    |  expressions with your own in the language's #[code Defaults].
+
 +h(3, "stop-words") Stop words
 
 p
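As a companion to the added infobox, a sketch of the kind of character-class rule it describes. The Cyrillic example and variable names below are illustrative assumptions, not the actual contents of punctuation.py:

    # The third-party `regex` package understands Unicode properties such as
    # \p{Cyrillic}, which keeps character classes for non-latin scripts readable.
    import regex as re

    ALPHA = r'\p{Cyrillic}'   # hypothetical class for the language being added
    QUOTES = "«»\"'"

    # Suffix rule: split a closing quote off a word written in the target script.
    suffix_re = re.compile(u'(?<=[{a}])[{q}]$'.format(a=ALPHA, q=QUOTES))
    print(suffix_re.search(u'слово»'))  # matches the trailing »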