train: don't save per-worker checkpoints if we're not doing distributed training
Saves disk space
parent a36f2efb8c
commit d4b35d7ae6

1 changed file: train.py
@@ -216,7 +216,6 @@ def train(args, model, opt, train_iters, train_iterations, field, rank=0, world_
     if world_size > 1:
         torch.distributed.barrier()
-    torch.save(opt.state_dict(), os.path.join(args.log_dir, f'iteration_{iteration}_rank_{rank}_optim.pth'))
     if world_size > 1:
         torch.distributed.barrier()

     # lr update
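For context, a minimal sketch of the guard the commit title describes: only write the per-rank optimizer checkpoint when training is actually distributed. The helper name maybe_save_worker_optim and its argument names are assumptions for illustration, not code from this repository; the hunk above simply drops the per-rank save at this call site.

import os
import torch
import torch.distributed as dist

def maybe_save_worker_optim(opt, log_dir, iteration, rank, world_size):
    # Only distributed runs need a per-rank optimizer snapshot; skipping it
    # in single-process training avoids an extra file per iteration, which
    # is the disk-space saving mentioned in the commit message.
    if world_size > 1:
        dist.barrier()  # make sure every worker reaches the save point
        torch.save(
            opt.state_dict(),
            os.path.join(log_dir, f'iteration_{iteration}_rank_{rank}_optim.pth'),
        )
        dist.barrier()  # keep workers in sync before resuming training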