diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index 5e5f94622..22364bb78 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -264,50 +264,51 @@ def test_pretraining_tagger(): pretrain(filled, tmp_dir) -def test_pretraining_training(): - """Test that training can use a pretrained Tok2Vec model""" - config = Config().from_str(pretrain_string_internal) - nlp = util.load_model_from_config(config, auto_fill=True, validate=False) - filled = nlp.config - pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) - filled = pretrain_config.merge(filled) - train_config = util.load_config(DEFAULT_CONFIG_PATH) - filled = train_config.merge(filled) - with make_tempdir() as tmp_dir: - pretrain_dir = tmp_dir / "pretrain" - pretrain_dir.mkdir() - file_path = write_sample_jsonl(pretrain_dir) - filled["paths"]["raw_text"] = file_path - filled["pretraining"]["component"] = "tagger" - filled["pretraining"]["layer"] = "tok2vec" - train_dir = tmp_dir / "train" - train_dir.mkdir() - train_path, dev_path = write_sample_training(train_dir) - filled["paths"]["train"] = train_path - filled["paths"]["dev"] = dev_path - filled = filled.interpolate() - P = filled["pretraining"] - nlp_base = init_nlp(filled) - model_base = ( - nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") - ) - embed_base = None - for node in model_base.walk(): - if node.name == "hashembed": - embed_base = node - pretrain(filled, pretrain_dir) - pretrained_model = Path(pretrain_dir / "model3.bin") - assert pretrained_model.exists() - filled["initialize"]["init_tok2vec"] = str(pretrained_model) - nlp = init_nlp(filled) - model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") - embed = None - for node in model.walk(): - if node.name == "hashembed": - embed = node - # ensure that the tok2vec weights are actually changed by the pretraining - assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E"))) - train(nlp, train_dir) +# Try to debug segfault on windows +#def test_pretraining_training(): +# """Test that training can use a pretrained Tok2Vec model""" +# config = Config().from_str(pretrain_string_internal) +# nlp = util.load_model_from_config(config, auto_fill=True, validate=False) +# filled = nlp.config +# pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) +# filled = pretrain_config.merge(filled) +# train_config = util.load_config(DEFAULT_CONFIG_PATH) +# filled = train_config.merge(filled) +# with make_tempdir() as tmp_dir: +# pretrain_dir = tmp_dir / "pretrain" +# pretrain_dir.mkdir() +# file_path = write_sample_jsonl(pretrain_dir) +# filled["paths"]["raw_text"] = file_path +# filled["pretraining"]["component"] = "tagger" +# filled["pretraining"]["layer"] = "tok2vec" +# train_dir = tmp_dir / "train" +# train_dir.mkdir() +# train_path, dev_path = write_sample_training(train_dir) +# filled["paths"]["train"] = train_path +# filled["paths"]["dev"] = dev_path +# filled = filled.interpolate() +# P = filled["pretraining"] +# nlp_base = init_nlp(filled) +# model_base = ( +# nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") +# ) +# embed_base = None +# for node in model_base.walk(): +# if node.name == "hashembed": +# embed_base = node +# pretrain(filled, pretrain_dir) +# pretrained_model = Path(pretrain_dir / "model3.bin") +# assert pretrained_model.exists() +# filled["initialize"]["init_tok2vec"] = str(pretrained_model) +# nlp = init_nlp(filled) +# model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") +# embed = None +# for node in model.walk(): +# if node.name == "hashembed": +# embed = node +# # ensure that the tok2vec weights are actually changed by the pretraining +# assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E"))) +# train(nlp, train_dir) def write_sample_jsonl(tmp_dir):