2023-06-14 15:48:41 +00:00
import re
2018-11-30 19:16:14 +00:00
from pathlib import Path
2023-06-14 15:48:41 +00:00
from typing import Optional
2020-07-11 11:03:53 +00:00
import typer
2023-06-14 15:48:41 +00:00
from wasabi import msg
2018-11-15 21:17:16 +00:00
2020-09-28 13:09:59 +00:00
from . . training . pretrain import pretrain
from . . util import load_config
2023-06-14 15:48:41 +00:00
from . _util import (
Arg ,
Opt ,
app ,
import_code ,
parse_config_overrides ,
setup_gpu ,
show_validation_error ,
)
2018-11-15 21:17:16 +00:00
2020-07-11 17:17:59 +00:00
@app.command(
    "pretrain",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def pretrain_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
    # fmt: on
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Two objective types
    are available, vector-based and character-based.

    In the vector-based objective, we load word vectors that have been trained
    using a word2vec-style distributional similarity algorithm, and train a
    component like a CNN, BiLSTM, etc to predict vectors which match the
    pretrained ones. The weights are saved to a directory after each epoch. You
    can then pass a path to one of these pretrained weights files to the
    'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. Ideally,
    this is done by using the same config file for both commands.

    DOCS: https://spacy.io/api/cli#pretrain
    """
    # Extra, unknown CLI arguments are collected by the context and parsed
    # into config overrides.
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
    setup_gpu(use_gpu)
    msg.info(f"Loading config from: {config_path}")

    # Load without interpolation first so the raw version can be written to
    # disk below; the interpolated copy is what gets passed to pretrain().
    with show_validation_error(config_path):
        uninterpolated = load_config(
            config_path,
            overrides=overrides,
            interpolate=False,
        )
    resolved = uninterpolated.interpolate()
    if not resolved.get("pretraining"):
        # TODO: What's the solution here? How do we handle optional blocks?
        msg.fail("The [pretraining] block in your config is empty", exits=1)

    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        msg.good(f"Created output directory: {output_dir}")

    # Save non-interpolated config
    saved_config_path = output_dir / "config.cfg"
    uninterpolated.to_disk(saved_config_path)
    msg.good("Saved config file in the output directory")

    pretrain(
        resolved,
        output_dir,
        resume_path=resume_path,
        epoch_resume=epoch_resume,
        use_gpu=use_gpu,
        silent=False,
        skip_last=skip_last,
    )
    msg.good("Successfully finished pretrain")
2018-11-15 22:44:07 +00:00
2018-11-28 17:04:58 +00:00
2020-09-13 12:05:05 +00:00
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
    """Validate the pretrain CLI arguments before any work is started.

    Reports unusable arguments via msg.fail (called with an exit code) and
    prints a warning when the output directory already contains entries.

    config_path (Path): Path to the config file. "-" is accepted without an
        existence check.
    output_dir (Path): Directory the weights are written to.
    resume_path (Optional[Path]): Weights file to resume from, if any.
    epoch_resume (Optional[int]): Epoch to resume counting from, if given.
    """
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)

    # any() stops at the first entry instead of listing the whole directory.
    if output_dir.exists() and any(output_dir.iterdir()):
        if resume_path:
            msg.warn(
                "Output directory is not empty.",
                "If you're resuming a run in this directory, the old weights "
                "for the consecutive epochs will be overwritten with the new ones.",
            )
        else:
            msg.warn(
                "Output directory is not empty.",
                "It is better to use an empty directory or refer to a new output path, "
                "then the new directory will be created for you.",
            )

    if resume_path is None:
        return

    if resume_path.is_dir():
        # This is necessary because Windows gives a Permission Denied when we
        # try to open the directory later, which is confusing. See #7878
        msg.fail(
            # f-string so the offending path is actually shown in the message.
            f"--resume-path should be a weights file, but {resume_path} is a directory.",
            exits=1,
        )

    # Files that keep the default "model<digits>.bin" name need no explicit
    # epoch; renamed files must come with a valid --epoch-resume value.
    has_default_name = re.search(r"model\d+\.bin", str(resume_path))
    if has_default_name:
        return
    # Compare against None: 0 is a valid epoch and must not count as "missing".
    if epoch_resume is None:
        msg.fail(
            "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
            exits=1,
        )
    elif epoch_resume < 0:
        msg.fail(
            f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
            exits=1,
        )