diff --git a/cloud/pcloud_train.sh b/cloud/pcloud_train.sh
index 26e537c27..d0c47dece 100644
--- a/cloud/pcloud_train.sh
+++ b/cloud/pcloud_train.sh
@@ -15,6 +15,8 @@ python ./cloud/split_data.py \
 --in_manifest_path=${DEV_MANIFEST} \
 --out_manifest_path='/local.manifest.dev'
 
+mkdir -p ./logs  # -p: do not fail when the directory already exists
+
 python -u train.py \
 --batch_size=${BATCH_SIZE} \
 --trainer_count=${NUM_GPU} \
@@ -35,10 +37,10 @@ python -u train.py \
 --train_manifest='/local.manifest.train' \
 --dev_manifest='/local.manifest.dev' \
 --mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
+--vocab_path='data/librispeech/vocab.txt' \
 --output_model_dir='./checkpoints' \
 --output_model_dir=${MODEL_PATH} \
 --augment_conf_path='conf/augmentation.config' \
 --specgram_type='linear' \
 --shuffle_method='batch_shuffle_clipped' \
-2>&1 | tee ./log/train.log
+2>&1 | tee ./logs/train.log
diff --git a/examples/librispeech/run_train.sh b/examples/librispeech/run_train.sh
index 9aa5e0d16..1d18f29ef 100644
--- a/examples/librispeech/run_train.sh
+++ b/examples/librispeech/run_train.sh
@@ -17,6 +17,7 @@ python -u train.py \
 --learning_rate=5e-4 \
 --max_duration=27.0 \
 --min_duration=0.0 \
+--test_off=False \
 --use_sortagrad=True \
 --use_gru=False \
 --use_gpu=True \
diff --git a/examples/tiny/run_train.sh b/examples/tiny/run_train.sh
index 3c2b8a1e0..957aa63bc 100644
--- a/examples/tiny/run_train.sh
+++ b/examples/tiny/run_train.sh
@@ -17,6 +17,7 @@ python -u train.py \
 --learning_rate=1e-5 \
 --max_duration=27.0 \
 --min_duration=0.0 \
+--test_off=False \
 --use_sortagrad=True \
 --use_gru=False \
 --use_gpu=True \
diff --git a/model_utils/model.py b/model_utils/model.py
index eb59268da..67a41bd11 100644
--- a/model_utils/model.py
+++ b/model_utils/model.py
@@ -60,7 +60,8 @@ class DeepSpeech2Model(object):
               num_passes,
               output_model_dir,
               is_local=True,
-              num_iterations_print=100):
+              num_iterations_print=100,
+              test_off=False):
         """Train the model.
 
         :param train_batch_reader: Train data reader.
@@ -83,6 +84,8 @@ class DeepSpeech2Model(object):
         :type is_local: bool
         :param output_model_dir: Directory for saving the model (every pass).
         :type output_model_dir: basestring
+        :param test_off: Skip the dev-set test at the end of each pass.
+        :type test_off: bool
         """
         # prepare model output directory
         if not os.path.exists(output_model_dir):
@@ -120,14 +123,21 @@ class DeepSpeech2Model(object):
                 start_time = time.time()
                 cost_sum, cost_counter = 0.0, 0
             if isinstance(event, paddle.event.EndPass):
-                result = trainer.test(
-                    reader=dev_batch_reader, feeding=feeding_dict)
+                # With test_off the dev-set test is skipped and only the
+                # elapsed time is reported; otherwise report the real cost.
+                if test_off:
+                    print("\n------- Time: %d sec,  Pass: %d" %
+                          (time.time() - start_time, event.pass_id))
+                else:
+                    result = trainer.test(
+                        reader=dev_batch_reader, feeding=feeding_dict)
+                    print("\n------- Time: %d sec,  Pass: %d, "
+                          "ValidationCost: %s" %
+                          (time.time() - start_time, event.pass_id,
+                           result.cost))
                 output_model_path = os.path.join(
                     output_model_dir, "params.pass-%d.tar.gz" % event.pass_id)
                 with gzip.open(output_model_path, 'w') as f:
                     self._parameters.to_tar(f)
-                print("\n------- Time: %d sec,  Pass: %d, ValidationCost: %s" %
-                      (time.time() - start_time, event.pass_id, result.cost))
 
         # run train
         trainer.train(
diff --git a/tools/profile.sh b/tools/profile.sh
new file mode 100644
index 000000000..19abe7ede
--- /dev/null
+++ b/tools/profile.sh
@@ -0,0 +1,38 @@
+#! /usr/bin/env bash
+#
+# Profile a single training pass for several GPU counts and print the
+# elapsed time of each run. Testing is switched off via --test_off=True.
+
+BATCH_SIZE_PER_GPU=64
+MIN_DURATION=6.0
+MAX_DURATION=7.0
+
+# Join the arguments after $1 using $1 as the separator.
+function join_by { local IFS="$1"; shift; echo "$*"; }
+
+for NUM_GPUS in 16 8 4 2 1
+do
+  DEVICES=$(join_by , $(seq 0 $(($NUM_GPUS-1))))
+  BATCH_SIZE=$(($BATCH_SIZE_PER_GPU * $NUM_GPUS))
+
+  CUDA_VISIBLE_DEVICES=$DEVICES \
+  python train.py \
+  --batch_size=$BATCH_SIZE \
+  --num_passes=1 \
+  --test_off=True \
+  --trainer_count=$NUM_GPUS \
+  --min_duration=$MIN_DURATION \
+  --max_duration=$MAX_DURATION > tmp.log 2>&1
+
+  if [ $? -ne 0 ];then
+      # Show the captured output so the failure can be diagnosed, and
+      # do not leave the temporary log behind.
+      cat tmp.log >&2
+      rm -f tmp.log
+      exit 1
+  fi
+
+  grep "Time" tmp.log | awk -v n="$NUM_GPUS" '{print "GPU Num: " n "\tTime: " $3}'
+
+  rm tmp.log
+done
diff --git a/train.py b/train.py
index 406484a18..445f3d765 100644
--- a/train.py
+++ b/train.py
@@ -25,6 +25,7 @@ add_arg('num_iter_print',   int,    100,    "Every # iterations for printing "
 add_arg('learning_rate',    float,  5e-4,   "Learning rate.")
 add_arg('max_duration',     float,  27.0,   "Longest audio duration allowed.")
 add_arg('min_duration',     float,  0.0,    "Shortest audio duration allowed.")
+add_arg('test_off',         bool,   False,  "Turn off testing.")
 add_arg('use_sortagrad',    bool,   True,   "Use SortaGrad or not.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
@@ -111,7 +112,8 @@ def train():
         num_passes=args.num_passes,
         num_iterations_print=args.num_iter_print,
         output_model_dir=args.output_model_dir,
-        is_local=args.is_local)
+        is_local=args.is_local,
+        test_off=args.test_off)
 
 
 def main():