diff --git a/examples/wenetspeech/asr1/local/train.sh b/examples/wenetspeech/asr1/local/train.sh new file mode 100755 index 00000000..df84ee62 --- /dev/null +++ b/examples/wenetspeech/asr1/local/train.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +profiler_options= +benchmark_batch_size=0 +benchmark_max_step=0 + +# seed may break model convergence +seed=0 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +if [ ${seed} != 0 ]; then + export FLAGS_cudnn_deterministic=True + echo "using seed $seed & FLAGS_cudnn_deterministic=True ..." +fi + +if [ $# -lt 2 ] && [ $# -gt 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)" + exit -1 +fi + +config_path=$1 +ckpt_name=$2 +ips=$3 + +if [ ! $ips ];then + ips_config= +else + ips_config="--ips="${ips} +fi +echo ${ips_config} + +mkdir -p exp + +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--seed ${seed} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--profiler-options "${profiler_options}" \ +--benchmark-batch-size ${benchmark_batch_size} \ +--benchmark-max-step ${benchmark_max_step} +else +#NCCL_SOCKET_IFNAME=eth0 +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--seed ${seed} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--profiler-options "${profiler_options}" \ +--benchmark-batch-size ${benchmark_batch_size} \ +--benchmark-max-step ${benchmark_max_step} +fi + + +if [ ${seed} != 0 ]; then + unset FLAGS_cudnn_deterministic +fi + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + +exit 0 diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh index 9995bc63..f2f29246 100644 --- a/examples/wenetspeech/asr1/run.sh +++ b/examples/wenetspeech/asr1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/conformer.yaml +ips= #xxx.xxx.xxx, xxx.xxx.xxx.xxx decode_conf_path=conf/tuning/decode.yaml average_checkpoint=true avg_num=10 @@ -26,7 +27,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then