add barrier (#2309)

pull/2311/head
sneaxiy 2 years ago committed by GitHub
parent 7cc1d66863
commit f795d6f034
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -43,6 +43,7 @@ function _train(){
log_parse_file="mylog/workerlog.0" ;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
bash tests/test_tipc/barrier.sh
# 以下不用修改
timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Multi-node start-up gate: when more than one trainer node is configured,
# keep running `$PYTHON -m paddle.distributed.launch run_check` until it
# exits successfully.
# NOTE(review): presumably run_check only succeeds once every node is
# reachable, which is what makes this act as a barrier -- confirm.
#
# Usage: bash barrier.sh [TIMEOUT]
#   TIMEOUT              per-attempt limit handed to timeout(1); default "10m"
# Environment:
#   PADDLE_TRAINERS_NUM  number of nodes; the check is skipped unless > 1
#   PYTHON               interpreter used to launch the check; default "python"
set -ex
NNODES=${PADDLE_TRAINERS_NUM:-"1"}
PYTHON=${PYTHON:-"python"}
TIMEOUT=${1:-"10m"}

# Run the check repeatedly until it succeeds.
# Globals:  PYTHON, TIMEOUT (read)
# Outputs:  "Retry barrier ......" on stdout after each failed attempt
# Returns:  0 once the check passes; 125/126/127 when timeout(1) reports
#           that the command could not be started at all
run_barrier() {
  local status
  while true; do
    status=0
    # `|| status=$?` captures the exit code without tripping `set -e`.
    timeout "$TIMEOUT" "$PYTHON" -m paddle.distributed.launch run_check \
      || status=$?
    case "$status" in
      0)
        return 0
        ;;
      125 | 126 | 127)
        # timeout(1) itself failed, or the command is not executable / not
        # found. Retrying cannot fix that and would only spin forever.
        printf 'barrier: cannot run "%s" (exit %s), giving up\n' \
          "$PYTHON" "$status" >&2
        return "$status"
        ;;
    esac
    # Any other failure (including 124 = attempt timed out) is retried.
    echo "Retry barrier ......"
  done
}

if [[ "$NNODES" -gt 1 ]]; then
  run_barrier
fi
Loading…
Cancel
Save