From 404708c64006dcff731204f9d9cbf7e616cdd7dc Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Wed, 28 Sep 2022 11:15:06 +0000
Subject: [PATCH] fix s2t gpu training hang

---
 examples/aishell/asr0/local/train.sh     | 4 ++++
 examples/aishell/asr1/local/train.sh     | 4 ++++
 examples/librispeech/asr0/local/train.sh | 4 ++++
 examples/librispeech/asr1/local/train.sh | 4 ++++
 examples/librispeech/asr2/local/train.sh | 4 ++++
 examples/timit/asr1/local/train.sh       | 4 ++++
 examples/tiny/asr0/local/train.sh        | 4 ++++
 examples/tiny/asr1/local/train.sh        | 4 ++++
 examples/wenetspeech/asr1/local/train.sh | 4 ++++
 9 files changed, 36 insertions(+)

diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh
index 256b30d22..2b71b7f76 100755
--- a/examples/aishell/asr0/local/train.sh
+++ b/examples/aishell/asr0/local/train.sh
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# The default memory allocator strategy may cause GPU training to hang,
+# because no OOM error is raised when memory is exhausted.
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh
index f514de303..bfa8dd97d 100755
--- a/examples/aishell/asr1/local/train.sh
+++ b/examples/aishell/asr1/local/train.sh
@@ -35,6 +35,10 @@ echo ${ips_config}
 
 mkdir -p exp
 
+# The default memory allocator strategy may cause GPU training to hang,
+# because no OOM error is raised when memory is exhausted.
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh
index 71659e28d..bb41fd554 100755
--- a/examples/librispeech/asr0/local/train.sh
+++ b/examples/librispeech/asr0/local/train.sh
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# The default memory allocator strategy may cause GPU training to hang,
+# because no OOM error is raised when memory is exhausted.
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh
index f729ed22c..e274b9133 100755
--- a/examples/librispeech/asr1/local/train.sh
+++ b/examples/librispeech/asr1/local/train.sh
@@ -29,6 +29,10 @@ fi
 # export FLAGS_cudnn_exhaustive_search=true
 # export FLAGS_conv_workspace_size_limit=4000
 
+# The default memory allocator strategy may cause GPU training to hang,
+# because no OOM error is raised when memory is exhausted.
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh
index 1f414ad41..c2f2d4b65 100755
--- a/examples/librispeech/asr2/local/train.sh
+++ b/examples/librispeech/asr2/local/train.sh
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# The default memory allocator strategy may cause GPU training to hang,
+# because no OOM error is raised when memory is exhausted.
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/timit/asr1/local/train.sh b/examples/timit/asr1/local/train.sh
index 661407582..1088c7ffa 100755
--- a/examples/timit/asr1/local/train.sh
+++ b/examples/timit/asr1/local/train.sh
@@ -19,6 +19,10 @@ if [ ${seed} != 0  ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# The default memory allocator strategy may cause GPU training to hang,
+# because no OOM error is raised when memory is exhausted.
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh
index 8b67902fe..e233a0c0a 100755
--- a/examples/tiny/asr0/local/train.sh
+++ b/examples/tiny/asr0/local/train.sh
@@ -32,6 +32,10 @@ fi
 
 mkdir -p exp
 
+# The default memory allocator strategy may cause GPU training to hang,
+# because no OOM error is raised when memory is exhausted.
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh
index 459f2e218..fbfb41f6f 100755
--- a/examples/tiny/asr1/local/train.sh
+++ b/examples/tiny/asr1/local/train.sh
@@ -34,6 +34,10 @@ fi
 
 mkdir -p exp
 
+# The default memory allocator strategy may cause GPU training to hang,
+# because no OOM error is raised when memory is exhausted.
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/wenetspeech/asr1/local/train.sh b/examples/wenetspeech/asr1/local/train.sh
index 01af00b61..6813d270c 100755
--- a/examples/wenetspeech/asr1/local/train.sh
+++ b/examples/wenetspeech/asr1/local/train.sh
@@ -35,6 +35,10 @@ echo ${ips_config}
 
 mkdir -p exp
 
+# The default memory allocator strategy may cause GPU training to hang,
+# because no OOM error is raised when memory is exhausted.
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \