From 33ba9ff464f52994b4eda37983426556cb33752b Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 29 Oct 2019 13:16:55 +0100 Subject: [PATCH] set encodings explicitly to utf8 (#4551) --- bin/ud/ud_run_test.py | 2 +- bin/ud/ud_train.py | 10 +++++----- spacy/util.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/ud/ud_run_test.py b/bin/ud/ud_run_test.py index de01cf350..7cb270d84 100644 --- a/bin/ud/ud_run_test.py +++ b/bin/ud/ud_run_test.py @@ -84,7 +84,7 @@ def read_conllu(file_): def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): if text_loc.parts[-1].endswith(".conllu"): docs = [] - with text_loc.open() as file_: + with text_loc.open(encoding="utf8") as file_: for conllu_doc in read_conllu(file_): for conllu_sent in conllu_doc: words = [line[1] for line in conllu_sent] diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index 5d4f20d6e..945bf57eb 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -203,7 +203,7 @@ def golds_to_gold_tuples(docs, golds): def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): if text_loc.parts[-1].endswith(".conllu"): docs = [] - with text_loc.open() as file_: + with text_loc.open(encoding="utf8") as file_: for conllu_doc in read_conllu(file_): for conllu_sent in conllu_doc: words = [line[1] for line in conllu_sent] @@ -378,7 +378,7 @@ def _load_pretrained_tok2vec(nlp, loc): """Load pretrained weights for the 'token-to-vector' part of the component models, which is typically a CNN. See 'spacy pretrain'. Experimental. 
""" - with Path(loc).open("rb") as file_: + with Path(loc).open("rb") as file_: weights_data = file_.read() loaded = [] for name, component in nlp.pipeline: @@ -519,8 +519,8 @@ def main( for i in range(config.nr_epoch): docs, golds = read_data( nlp, - paths.train.conllu.open(), - paths.train.text.open(), + paths.train.conllu.open(encoding="utf8"), + paths.train.text.open(encoding="utf8"), max_doc_length=config.max_doc_length, limit=limit, oracle_segments=use_oracle_segments, @@ -560,7 +560,7 @@ def main( def _render_parses(i, to_render): to_render[0].user_data["title"] = "Batch %d" % i - with Path("/tmp/parses.html").open("w") as file_: + with Path("/tmp/parses.html").open("w", encoding="utf8") as file_: html = displacy.render(to_render[:5], style="dep", page=True) file_.write(html) diff --git a/spacy/util.py b/spacy/util.py index ffc25fb9d..74e4cc1c6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -421,7 +421,7 @@ def env_opt(name, default=None): def read_regex(path): path = ensure_path(path) - with path.open() as file_: + with path.open(encoding="utf8") as file_: entries = file_.read().split("\n") expression = "|".join( ["^" + re.escape(piece) for piece in entries if piece.strip()]