mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
664cfc29bc
|
@ -36,12 +36,13 @@ from ..compat import json_dumps
|
|||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
version=("Model version", "option", "V", str),
|
||||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
||||
"overwritten.", "option", "m", Path))
|
||||
"overwritten.", "option", "m", Path),
|
||||
verbose=("Display more information for debug", "option", None, bool))
|
||||
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||
parser_multitasks='', entity_multitasks='',
|
||||
use_gpu=-1, vectors=None, no_tagger=False,
|
||||
no_parser=False, no_entities=False, gold_preproc=False,
|
||||
version="0.0.0", meta_path=None):
|
||||
version="0.0.0", meta_path=None, verbose=False):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
|
@ -143,7 +144,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
gold_preproc=gold_preproc))
|
||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||
start_time = timer()
|
||||
scorer = nlp_loaded.evaluate(dev_docs)
|
||||
scorer = nlp_loaded.evaluate(dev_docs, verbose)
|
||||
end_time = timer()
|
||||
if use_gpu < 0:
|
||||
gpu_wps = None
|
||||
|
|
|
@ -30,7 +30,7 @@ def tags_to_entities(tags):
|
|||
continue
|
||||
elif tag.startswith('I'):
|
||||
if start is None:
|
||||
raise ValueError(Errors.E067.format(tags=tags[:i]))
|
||||
raise ValueError(Errors.E067.format(tags=tags[:i+1]))
|
||||
continue
|
||||
if tag.startswith('U'):
|
||||
entities.append((tag[2:], i, i))
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -78,5 +78,11 @@ for orth in [
|
|||
"s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
# Sentences ending in "i." (as in "... peka i.") or "m." (as in "...än 2000 m.")
|
||||
# should be tokenized as two separate tokens.
|
||||
for orth in ["i", "m"]:
|
||||
_exc[orth + "."] = [
|
||||
{ORTH: orth, LEMMA: orth, NORM: orth},
|
||||
{ORTH: ".", TAG: PUNCT}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
|
|
|
@ -6,7 +6,8 @@ import pytest
|
|||
|
||||
SV_TOKEN_EXCEPTION_TESTS = [
|
||||
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
|
||||
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
|
||||
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
|
||||
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -260,7 +260,7 @@ p
|
|||
+code(false, "bash", "$", false, false, true).
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter]
|
||||
[--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser]
|
||||
[--no-entities] [--gold-preproc]
|
||||
[--no-entities] [--gold-preproc] [--verbose]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
|
@ -344,6 +344,11 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+row
|
||||
+cell #[code --verbose]
|
||||
+cell flag
|
||||
+cell Show more detailed messages during training.
|
||||
|
||||
+row("foot")
|
||||
+cell creates
|
||||
+cell model, pickle
|
||||
|
|
Loading…
Reference in New Issue