diff --git a/cloud/pcloud_train.sh b/cloud/pcloud_train.sh index 26e537c27..d0c47dece 100644 --- a/cloud/pcloud_train.sh +++ b/cloud/pcloud_train.sh @@ -15,6 +15,8 @@ python ./cloud/split_data.py \ --in_manifest_path=${DEV_MANIFEST} \ --out_manifest_path='/local.manifest.dev' +mkdir ./logs + python -u train.py \ --batch_size=${BATCH_SIZE} \ --trainer_count=${NUM_GPU} \ @@ -35,10 +37,10 @@ python -u train.py \ --train_manifest='/local.manifest.train' \ --dev_manifest='/local.manifest.dev' \ --mean_std_path='data/librispeech/mean_std.npz' \ ---vocab_path='data/librispeech/eng_vocab.txt' \ +--vocab_path='data/librispeech/vocab.txt' \ --output_model_dir='./checkpoints' \ --output_model_dir=${MODEL_PATH} \ --augment_conf_path='conf/augmentation.config' \ --specgram_type='linear' \ --shuffle_method='batch_shuffle_clipped' \ -2>&1 | tee ./log/train.log +2>&1 | tee ./logs/train.log diff --git a/examples/librispeech/run_train.sh b/examples/librispeech/run_train.sh index 9aa5e0d16..1d18f29ef 100644 --- a/examples/librispeech/run_train.sh +++ b/examples/librispeech/run_train.sh @@ -17,6 +17,7 @@ python -u train.py \ --learning_rate=5e-4 \ --max_duration=27.0 \ --min_duration=0.0 \ +--test_off=False \ --use_sortagrad=True \ --use_gru=False \ --use_gpu=True \ diff --git a/examples/tiny/run_train.sh b/examples/tiny/run_train.sh index 3c2b8a1e0..957aa63bc 100644 --- a/examples/tiny/run_train.sh +++ b/examples/tiny/run_train.sh @@ -17,6 +17,7 @@ python -u train.py \ --learning_rate=1e-5 \ --max_duration=27.0 \ --min_duration=0.0 \ +--test_off=False \ --use_sortagrad=True \ --use_gru=False \ --use_gpu=True \ diff --git a/model_utils/model.py b/model_utils/model.py index eb59268da..67a41bd11 100644 --- a/model_utils/model.py +++ b/model_utils/model.py @@ -60,7 +60,8 @@ class DeepSpeech2Model(object): num_passes, output_model_dir, is_local=True, - num_iterations_print=100): + num_iterations_print=100, + test_off=False): """Train the model. :param train_batch_reader: Train data reader. @@ -83,6 +84,8 @@ class DeepSpeech2Model(object): :type is_local: bool :param output_model_dir: Directory for saving the model (every pass). :type output_model_dir: basestring + :param test_off: Turn off testing. + :type test_off: bool """ # prepare model output directory if not os.path.exists(output_model_dir): @@ -120,14 +123,19 @@ class DeepSpeech2Model(object): start_time = time.time() cost_sum, cost_counter = 0.0, 0 if isinstance(event, paddle.event.EndPass): - result = trainer.test( - reader=dev_batch_reader, feeding=feeding_dict) + if test_off: + print("\n------- Time: %d sec, Pass: %d" % + (time.time() - start_time, event.pass_id)) + else: + result = trainer.test( + reader=dev_batch_reader, feeding=feeding_dict) + print("\n------- Time: %d sec, Pass: %d, " + "ValidationCost: %s" % + (time.time() - start_time, event.pass_id, 0)) output_model_path = os.path.join( output_model_dir, "params.pass-%d.tar.gz" % event.pass_id) with gzip.open(output_model_path, 'w') as f: self._parameters.to_tar(f) - print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" % - (time.time() - start_time, event.pass_id, result.cost)) # run train trainer.train( diff --git a/tools/profile.sh b/tools/profile.sh new file mode 100644 index 000000000..19abe7ede --- /dev/null +++ b/tools/profile.sh @@ -0,0 +1,30 @@ +#! /usr/bin/env bash + +BATCH_SIZE_PER_GPU=64 +MIN_DURATION=6.0 +MAX_DURATION=7.0 + +function join_by { local IFS="$1"; shift; echo "$*"; } + +for NUM_GPUS in 16 8 4 2 1 +do + DEVICES=$(join_by , $(seq 0 $(($NUM_GPUS-1)))) + BATCH_SIZE=$(($BATCH_SIZE_PER_GPU * $NUM_GPUS)) + + CUDA_VISIBLE_DEVICES=$DEVICES \ + python train.py \ + --batch_size=$BATCH_SIZE \ + --num_passes=1 \ + --test_off=True \ + --trainer_count=$NUM_GPUS \ + --min_duration=$MIN_DURATION \ + --max_duration=$MAX_DURATION > tmp.log 2>&1 + + if [ $? -ne 0 ];then + exit 1 + fi + + cat tmp.log | grep "Time" | awk '{print "GPU Num: " "'"$NUM_GPUS"'" " Time: "$3}' + + rm tmp.log +done diff --git a/train.py b/train.py index 406484a18..445f3d765 100644 --- a/train.py +++ b/train.py @@ -25,6 +25,7 @@ add_arg('num_iter_print', int, 100, "Every # iterations for printing " add_arg('learning_rate', float, 5e-4, "Learning rate.") add_arg('max_duration', float, 27.0, "Longest audio duration allowed.") add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.") +add_arg('test_off', bool, False, "Turn off testing.") add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.") add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") @@ -111,7 +112,8 @@ def train(): num_passes=args.num_passes, num_iterations_print=args.num_iter_print, output_model_dir=args.output_model_dir, - is_local=args.is_local) + is_local=args.is_local, + test_off=args.test_off) def main():