From 4ec76232880066e36ad9b613f934dd7dc66404ea Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 4 Nov 2019 20:31:26 +0100 Subject: [PATCH] Fix conllu script (#4579) * force extensions to avoid clash between example scripts * fix arg order and default file encoding * add example config for conllu script * newline * move extension definitions to main function * few more encodings fixes --- bin/ud/ud_train.py | 22 +++++++++------------- examples/training/conllu-config.json | 1 + examples/training/conllu.py | 22 ++++++++++------------ 3 files changed, 20 insertions(+), 25 deletions(-) create mode 100644 examples/training/conllu-config.json diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index 945bf57eb..2784d7c3c 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -7,7 +7,6 @@ from __future__ import unicode_literals import plac from pathlib import Path import re -import sys import json import spacy @@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words from spacy.syntax.nonproj import projectivize from spacy.matcher import Matcher from spacy import displacy -from collections import defaultdict, Counter -from timeit import default_timer as timer +from collections import defaultdict -import itertools import random -import numpy.random from spacy import lang from spacy.lang import zh @@ -323,10 +319,6 @@ def get_token_conllu(token, i): return "\n".join(lines) -Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True) -Token.set_extension("begins_fused", default=False, force=True) -Token.set_extension("inside_fused", default=False, force=True) - ################## # Initialization # @@ -459,13 +451,13 @@ class TreebankPaths(object): @plac.annotations( ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), + parses_dir=("Directory to write the development parses", "positional", None, Path), corpus=( - "UD corpus to train and evaluate on, e.g. en, es_ancora, etc", + "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", "positional", None, str, ), - parses_dir=("Directory to write the development parses", "positional", None, Path), config=("Path to json formatted config file", "option", "C", Path), limit=("Size limit", "option", "n", int), gpu_device=("Use GPU", "option", "g", int), @@ -490,6 +482,10 @@ def main( # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm + Token.set_extension("get_conllu_lines", method=get_token_conllu) + Token.set_extension("begins_fused", default=False) + Token.set_extension("inside_fused", default=False) + spacy.util.fix_random_seed() lang.zh.Chinese.Defaults.use_jieba = False lang.ja.Japanese.Defaults.use_janome = False @@ -506,8 +502,8 @@ def main( docs, golds = read_data( nlp, - paths.train.conllu.open(), - paths.train.text.open(), + paths.train.conllu.open(encoding="utf8"), + paths.train.text.open(encoding="utf8"), max_doc_length=config.max_doc_length, limit=limit, ) diff --git a/examples/training/conllu-config.json b/examples/training/conllu-config.json new file mode 100644 index 000000000..9a11dd96b --- /dev/null +++ b/examples/training/conllu-config.json @@ -0,0 +1 @@ +{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0} diff --git a/examples/training/conllu.py b/examples/training/conllu.py index dfc790456..d9ee721ec 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -13,8 +13,7 @@ import spacy.util from spacy.tokens import Token, Doc from spacy.gold import GoldParse from spacy.syntax.nonproj import projectivize -from collections import defaultdict, Counter -from timeit import default_timer as timer +from collections import defaultdict from spacy.matcher import Matcher import itertools @@ -290,11 +289,6 @@ def get_token_conllu(token, i): return "\n".join(lines) -Token.set_extension("get_conllu_lines", method=get_token_conllu) -Token.set_extension("begins_fused", default=False) -Token.set_extension("inside_fused", default=False) - - ################## # Initialization # ################## @@ -381,20 +375,24 @@ class TreebankPaths(object): @plac.annotations( ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), + parses_dir=("Directory to write the development parses", "positional", None, Path), + config=("Path to json formatted config file", "positional", None, Config.load), corpus=( - "UD corpus to train and evaluate on, e.g. en, es_ancora, etc", + "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", "positional", None, str, ), - parses_dir=("Directory to write the development parses", "positional", None, Path), - config=("Path to json formatted config file", "positional", None, Config.load), limit=("Size limit", "option", "n", int), ) def main(ud_dir, parses_dir, config, corpus, limit=0): # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm + Token.set_extension("get_conllu_lines", method=get_token_conllu) + Token.set_extension("begins_fused", default=False) + Token.set_extension("inside_fused", default=False) + paths = TreebankPaths(ud_dir, corpus) if not (parses_dir / corpus).exists(): (parses_dir / corpus).mkdir() @@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): docs, golds = read_data( nlp, - paths.train.conllu.open(), - paths.train.text.open(), + paths.train.conllu.open(encoding="utf8"), + paths.train.text.open(encoding="utf8"), max_doc_length=config.max_doc_length, limit=limit, )