From 404708c64006dcff731204f9d9cbf7e616cdd7dc Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Wed, 28 Sep 2022 11:15:06 +0000 Subject: [PATCH] fix s2t gpu training hang --- examples/aishell/asr0/local/train.sh | 4 ++++ examples/aishell/asr1/local/train.sh | 4 ++++ examples/librispeech/asr0/local/train.sh | 4 ++++ examples/librispeech/asr1/local/train.sh | 4 ++++ examples/librispeech/asr2/local/train.sh | 4 ++++ examples/timit/asr1/local/train.sh | 4 ++++ examples/tiny/asr0/local/train.sh | 4 ++++ examples/tiny/asr1/local/train.sh | 4 ++++ examples/wenetspeech/asr1/local/train.sh | 4 ++++ 9 files changed, 36 insertions(+) diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh index 256b30d22..2b71b7f76 100755 --- a/examples/aishell/asr0/local/train.sh +++ b/examples/aishell/asr0/local/train.sh @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +# default memeory allocator strategy may case gpu training hang +# for no OOM raised when memory exhaused +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh index f514de303..bfa8dd97d 100755 --- a/examples/aishell/asr1/local/train.sh +++ b/examples/aishell/asr1/local/train.sh @@ -35,6 +35,10 @@ echo ${ips_config} mkdir -p exp +# default memeory allocator strategy may case gpu training hang +# for no OOM raised when memory exhaused +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh index 71659e28d..bb41fd554 100755 --- a/examples/librispeech/asr0/local/train.sh +++ b/examples/librispeech/asr0/local/train.sh @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +# default memeory allocator strategy may case gpu training hang +# for no OOM raised when memory exhaused +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh index f729ed22c..e274b9133 100755 --- a/examples/librispeech/asr1/local/train.sh +++ b/examples/librispeech/asr1/local/train.sh @@ -29,6 +29,10 @@ fi # export FLAGS_cudnn_exhaustive_search=true # export FLAGS_conv_workspace_size_limit=4000 +# default memeory allocator strategy may case gpu training hang +# for no OOM raised when memory exhaused +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh index 1f414ad41..c2f2d4b65 100755 --- a/examples/librispeech/asr2/local/train.sh +++ b/examples/librispeech/asr2/local/train.sh @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +# default memeory allocator strategy may case gpu training hang +# for no OOM raised when memory exhaused +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/timit/asr1/local/train.sh b/examples/timit/asr1/local/train.sh index 661407582..1088c7ffa 100755 --- a/examples/timit/asr1/local/train.sh +++ b/examples/timit/asr1/local/train.sh @@ -19,6 +19,10 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +# default memeory allocator strategy may case gpu training hang +# for no OOM raised when memory exhaused +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh index 8b67902fe..e233a0c0a 100755 --- a/examples/tiny/asr0/local/train.sh +++ b/examples/tiny/asr0/local/train.sh @@ -32,6 +32,10 @@ fi mkdir -p exp +# default memeory allocator strategy may case gpu training hang +# for no OOM raised when memory exhaused +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh index 459f2e218..fbfb41f6f 100755 --- a/examples/tiny/asr1/local/train.sh +++ b/examples/tiny/asr1/local/train.sh @@ -34,6 +34,10 @@ fi mkdir -p exp +# default memeory allocator strategy may case gpu training hang +# for no OOM raised when memory exhaused +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ diff --git a/examples/wenetspeech/asr1/local/train.sh b/examples/wenetspeech/asr1/local/train.sh index 01af00b61..6813d270c 100755 --- a/examples/wenetspeech/asr1/local/train.sh +++ b/examples/wenetspeech/asr1/local/train.sh @@ -35,6 +35,10 @@ echo ${ips_config} mkdir -p exp +# default memeory allocator strategy may case gpu training hang +# for no OOM raised when memory exhaused +export FLAGS_allocator_strategy=naive_best_fit + if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \