From 429221dc0379eb0435f5e3e6194d7191ab571831 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 29 Jun 2022 03:30:04 +0000 Subject: [PATCH] adopt multi machine training --- examples/wenetspeech/asr1/conf/conformer.yaml | 2 +- examples/wenetspeech/asr1/local/train.sh | 3 +-- paddlespeech/audio/streamdata/shardlists.py | 2 ++ paddlespeech/audio/streamdata/utils.py | 10 +++++++--- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml index f46d4bd95..013c3e0c4 100644 --- a/examples/wenetspeech/asr1/conf/conformer.yaml +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -67,7 +67,7 @@ maxlen_out: 150 # if output length(number of tokens) > maxlen-out, data is auto resample_rate: 16000 shuffle_size: 1500 sort_size: 1000 -num_workers: 0 +num_workers: 8 prefetch_factor: 10 dist_sampler: True num_encs: 1 diff --git a/examples/wenetspeech/asr1/local/train.sh b/examples/wenetspeech/asr1/local/train.sh index df84ee625..01af00b61 100755 --- a/examples/wenetspeech/asr1/local/train.sh +++ b/examples/wenetspeech/asr1/local/train.sh @@ -45,8 +45,7 @@ python3 -u ${BIN_DIR}/train.py \ --benchmark-batch-size ${benchmark_batch_size} \ --benchmark-max-step ${benchmark_max_step} else -#NCCL_SOCKET_IFNAME=eth0 -python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \ +NCCL_SOCKET_IFNAME=eth0 python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ --seed ${seed} \ --config ${config_path} \ diff --git a/paddlespeech/audio/streamdata/shardlists.py b/paddlespeech/audio/streamdata/shardlists.py index 3d1801cc7..cfaf9a64b 100644 --- a/paddlespeech/audio/streamdata/shardlists.py +++ b/paddlespeech/audio/streamdata/shardlists.py @@ -65,6 +65,7 @@ class SimpleShardList(IterableDataset): def split_by_node(src, group=None): rank, world_size, worker, num_workers = 
utils.paddle_worker_info(group=group) + logger.info(f"world_size:{world_size}, rank:{rank}") if world_size > 1: for s in islice(src, rank, None, world_size): yield s @@ -83,6 +84,7 @@ def single_node_only(src, group=None): def split_by_worker(src): rank, world_size, worker, num_workers = utils.paddle_worker_info() + logger.info(f"num_workers:{num_workers}, worker:{worker}") if num_workers > 1: for s in islice(src, worker, None, num_workers): yield s diff --git a/paddlespeech/audio/streamdata/utils.py b/paddlespeech/audio/streamdata/utils.py index 83a42badb..c7294f2bf 100644 --- a/paddlespeech/audio/streamdata/utils.py +++ b/paddlespeech/audio/streamdata/utils.py @@ -16,6 +16,9 @@ import re import sys from typing import Any, Callable, Iterator, Optional, Union +from ..utils.log import Logger + +logger = Logger(__name__) def make_seed(*args): seed = 0 @@ -112,13 +115,14 @@ def paddle_worker_info(group=None): num_workers = int(os.environ["NUM_WORKERS"]) else: try: - import paddle.io.get_worker_info + from paddle.io import get_worker_info worker_info = paddle.io.get_worker_info() if worker_info is not None: worker = worker_info.id num_workers = worker_info.num_workers - except ModuleNotFoundError: - pass + except ModuleNotFoundError as E: + logger.info(f"not found {E}") + exit(-1) return rank, world_size, worker, num_workers