diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3900c7f39..8557019c6 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -62,10 +62,10 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)
-            dev_scores = trainer.evaluate(dev_data) if dev_data else []
+            dev_scores = trainer.evaluate(dev_data).scores if dev_data else {}
             print_progress(itn, trainer.nlp.parser.model.nr_weight,
                            trainer.nlp.parser.model.nr_active_feat,
-                           **dev_scores.scores)
+                           **dev_scores)
 
 
 def evaluate(Language, gold_tuples, output_path):
diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index 0c98cc5ca..50b626b99 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -98,6 +98,17 @@ p
     | so that Python functions can be used to help you generalise and combine
     | the data as you require.
 
++infobox("For languages with non-Latin characters")
+    | In order for the tokenizer to split suffixes, prefixes and infixes, spaCy
+    | needs to know the language's character set. If the language you're adding
+    | uses non-Latin characters, you might need to add the required character
+    | classes to the global
+    | #[+src(gh("spacy", "spacy/language_data/punctuation.py")) punctuation.py].
+    | spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library]
+    | to keep this simple and readable. If the language requires very specific
+    | punctuation rules, you should consider overriding the default regular
+    | expressions with your own in the language's #[code Defaults].
+
 +h(3, "stop-words") Stop words
 
 p
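
The infobox added above points contributors at the regex library for character classes. As a rough illustration of why it helps, here is a minimal Python sketch; the variable names and the Bengali example are assumptions for illustration, not the actual contents of punctuation.py:

    # Illustrative sketch only, not spaCy's punctuation.py. The `regex`
    # library accepts Unicode script properties, so a non-Latin alphabet
    # can be named instead of spelled out as code-point ranges.
    import regex as re

    BENGALI = r'\p{Bengali}'  # any character in the Bengali script

    # Split a full stop or danda that follows a Bengali letter.
    suffix_re = re.compile(r'(?<=[{a}])[।.]'.format(a=BENGALI))

    print(bool(suffix_re.search('আমি।')))  # True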
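
The infobox also suggests overriding the default expressions in a language's Defaults. A sketch of what that could look like, assuming the spaCy 1.x convention where Language.Defaults exposes prefixes/suffixes/infixes tuples; the Bengali subclass and the rules shown are hypothetical examples, not spaCy's shipped code:

    # Hypothetical sketch: replacing the default suffix rules for one
    # language instead of editing the global punctuation.py.
    from spacy.language import Language

    TOKENIZER_SUFFIXES = (
        r'(?<=[0-9])\+',       # split "+" off "10+"
        r'(?<=\p{Bengali})।',  # split the danda after Bengali text
    )

    class Bengali(Language):
        lang = 'bn'

        class Defaults(Language.Defaults):
            suffixes = TOKENIZER_SUFFIXES  # overrides the global defaults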