From f795d6f03464041766e7be465c08efb6869e0658 Mon Sep 17 00:00:00 2001
From: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Date: Fri, 26 Aug 2022 12:52:19 +0800
Subject: [PATCH] add barrier (#2309)

Add tests/test_tipc/barrier.sh and invoke it from the pwgan benchmark
script immediately before the timed training command is run.

barrier.sh does nothing unless PADDLE_TRAINERS_NUM is greater than 1
(it defaults to 1). Otherwise it runs
`$PYTHON -m paddle.distributed.launch run_check` under `timeout` and
repeats the command, printing "Retry barrier ......", until it exits
successfully. PYTHON defaults to "python"; the timeout defaults to 10m
and can be overridden with the first positional argument.

---
 tests/benchmark/pwgan/run_benchmark.sh |  1 +
 tests/test_tipc/barrier.sh             | 10 ++++++++++
 2 files changed, 11 insertions(+)
 create mode 100644 tests/test_tipc/barrier.sh

diff --git a/tests/benchmark/pwgan/run_benchmark.sh b/tests/benchmark/pwgan/run_benchmark.sh
index b9cc154fe..9cc070fa1 100755
--- a/tests/benchmark/pwgan/run_benchmark.sh
+++ b/tests/benchmark/pwgan/run_benchmark.sh
@@ -43,6 +43,7 @@ function _train(){
         log_parse_file="mylog/workerlog.0" ;;
     *) echo "choose run_mode(sp or mp)"; exit 1;
     esac
+    bash tests/test_tipc/barrier.sh
     # 以下不用修改
     timeout 15m ${train_cmd} > ${log_file} 2>&1
     if [ $? -ne 0 ];then
diff --git a/tests/test_tipc/barrier.sh b/tests/test_tipc/barrier.sh
new file mode 100644
index 000000000..d29634cc4
--- /dev/null
+++ b/tests/test_tipc/barrier.sh
@@ -0,0 +1,10 @@
+set -ex
+NNODES=${PADDLE_TRAINERS_NUM:-"1"}
+PYTHON=${PYTHON:-"python"}
+TIMEOUT=${1:-"10m"}
+
+if [[ "$NNODES" -gt 1 ]]; then
+    while ! timeout "$TIMEOUT" "$PYTHON" -m paddle.distributed.launch run_check; do
+        echo "Retry barrier ......"
+    done
+fi