Try to debug segfault

Matthew Honnibal 2024-12-10 19:27:16 +01:00
parent 18f23b5ad7
commit 1a4d21ccd5
1 changed file with 45 additions and 44 deletions

@@ -264,50 +264,51 @@ def test_pretraining_tagger():
         pretrain(filled, tmp_dir)
 
 
-def test_pretraining_training():
-    """Test that training can use a pretrained Tok2Vec model"""
-    config = Config().from_str(pretrain_string_internal)
-    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
-    filled = nlp.config
-    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
-    filled = pretrain_config.merge(filled)
-    train_config = util.load_config(DEFAULT_CONFIG_PATH)
-    filled = train_config.merge(filled)
-    with make_tempdir() as tmp_dir:
-        pretrain_dir = tmp_dir / "pretrain"
-        pretrain_dir.mkdir()
-        file_path = write_sample_jsonl(pretrain_dir)
-        filled["paths"]["raw_text"] = file_path
-        filled["pretraining"]["component"] = "tagger"
-        filled["pretraining"]["layer"] = "tok2vec"
-        train_dir = tmp_dir / "train"
-        train_dir.mkdir()
-        train_path, dev_path = write_sample_training(train_dir)
-        filled["paths"]["train"] = train_path
-        filled["paths"]["dev"] = dev_path
-        filled = filled.interpolate()
-        P = filled["pretraining"]
-        nlp_base = init_nlp(filled)
-        model_base = (
-            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-        )
-        embed_base = None
-        for node in model_base.walk():
-            if node.name == "hashembed":
-                embed_base = node
-        pretrain(filled, pretrain_dir)
-        pretrained_model = Path(pretrain_dir / "model3.bin")
-        assert pretrained_model.exists()
-        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
-        nlp = init_nlp(filled)
-        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-        embed = None
-        for node in model.walk():
-            if node.name == "hashembed":
-                embed = node
-        # ensure that the tok2vec weights are actually changed by the pretraining
-        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
-        train(nlp, train_dir)
+# Try to debug segfault on windows
+#def test_pretraining_training():
+#    """Test that training can use a pretrained Tok2Vec model"""
+#    config = Config().from_str(pretrain_string_internal)
+#    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+#    filled = nlp.config
+#    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+#    filled = pretrain_config.merge(filled)
+#    train_config = util.load_config(DEFAULT_CONFIG_PATH)
+#    filled = train_config.merge(filled)
+#    with make_tempdir() as tmp_dir:
+#        pretrain_dir = tmp_dir / "pretrain"
+#        pretrain_dir.mkdir()
+#        file_path = write_sample_jsonl(pretrain_dir)
+#        filled["paths"]["raw_text"] = file_path
+#        filled["pretraining"]["component"] = "tagger"
+#        filled["pretraining"]["layer"] = "tok2vec"
+#        train_dir = tmp_dir / "train"
+#        train_dir.mkdir()
+#        train_path, dev_path = write_sample_training(train_dir)
+#        filled["paths"]["train"] = train_path
+#        filled["paths"]["dev"] = dev_path
+#        filled = filled.interpolate()
+#        P = filled["pretraining"]
+#        nlp_base = init_nlp(filled)
+#        model_base = (
+#            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+#        )
+#        embed_base = None
+#        for node in model_base.walk():
+#            if node.name == "hashembed":
+#                embed_base = node
+#        pretrain(filled, pretrain_dir)
+#        pretrained_model = Path(pretrain_dir / "model3.bin")
+#        assert pretrained_model.exists()
+#        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
+#        nlp = init_nlp(filled)
+#        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+#        embed = None
+#        for node in model.walk():
+#            if node.name == "hashembed":
+#                embed = node
+#        # ensure that the tok2vec weights are actually changed by the pretraining
+#        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
+#        train(nlp, train_dir)
 
 
 def write_sample_jsonl(tmp_dir):
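
Not part of this commit, but for context: the conventional way to disable a single test on one platform, rather than commenting its body out, is pytest's skipif marker. A minimal sketch, assuming the test runs under pytest; the reason string and the elided test body are placeholders, not taken from this commit:

import sys

import pytest


# Hypothetical alternative to commenting the test out: skip it only on Windows.
@pytest.mark.skipif(sys.platform == "win32", reason="segfaults on Windows")
def test_pretraining_training():
    """Test that training can use a pretrained Tok2Vec model"""
    ...  # original test body would go here unchanged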