From 467e8235776371156682963d86c25c3f924cf5f9 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 18 May 2021 04:57:41 +0000
Subject: [PATCH] parallel data scripts; more mask test; need pybind11 repo

Changes in this patch:

- .gitignore: the ignore pattern `*.log` becomes `*log`.
- deepspeech/exps/deepspeech2/model.py: drop the trailing comma in the
  train DataLoader call.
- examples/aishell/{s0,s1}/local/data.sh: run the three format_data.py
  jobs (train/dev/test) as background jobs instead of sequentially.
  Each job's PID is recorded and waited on individually, because a bare
  `wait` returns 0 regardless of how the background jobs exited and an
  `exit 1` inside `{ ...; } &` only terminates that background subshell.
  Without the per-PID wait a failed job would not stop the script.
- examples/aishell/s1/local/data.sh: whitespace-only lines are emptied.
- examples/tiny/s1/run.sh: mode change to executable.
- requirements.txt: add pybind11 and the missing trailing newline.
- tests/mask_test.py: additionally compare make_pad_mask against the
  logical negation of make_non_pad_mask.

---
Note: line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. If hunk context does not match
the target tree, try `git apply --ignore-whitespace`. The `index` lines
are carried over unchanged from the original commit.

 .gitignore                           |  2 +-
 deepspeech/exps/deepspeech2/model.py |  2 +-
 examples/aishell/s0/local/data.sh    | 31 ++++++++++++++---------
 examples/aishell/s1/local/data.sh    | 37 +++++++++++++++++-----------
 examples/tiny/s1/run.sh              |  0
 requirements.txt                     |  3 ++-
 tests/mask_test.py                   |  2 ++
 7 files changed, 49 insertions(+), 28 deletions(-)
 mode change 100644 => 100755 examples/tiny/s1/run.sh

diff --git a/.gitignore b/.gitignore
index 9b225057f..6fa377222 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 .DS_Store
 *.pyc
 .vscode
-*.log
+*log
 *.pdmodel
 *.pdiparams*
 *.zip
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index c1fe82250..18c05c137 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -170,7 +170,7 @@ class DeepSpeech2Trainer(Trainer):
             train_dataset,
             batch_sampler=batch_sampler,
             collate_fn=collate_fn,
-            num_workers=config.data.num_workers, )
+            num_workers=config.data.num_workers)
         self.valid_loader = DataLoader(
             dev_dataset,
             batch_size=config.data.batch_size,
diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh
index f2a5dfc36..12fee1766 100755
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -66,19 +66,28 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
+    # jobs run in the background; their PIDs are kept so a failure can be detected
+    pids=""
     for dataset in train dev test; do
+        {
         python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
-        --cmvn_path "data/mean_std.json" \
-        --unit_type "char" \
-        --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${dataset}.raw" \
-        --output_path="data/manifest.${dataset}"
-    done
+            --feat_type "raw" \
+            --cmvn_path "data/mean_std.json" \
+            --unit_type "char" \
+            --vocab_path="data/vocab.txt" \
+            --manifest_path="data/manifest.${dataset}.raw" \
+            --output_path="data/manifest.${dataset}"
 
-    if [ $? -ne 0 ]; then
-        echo "Formt mnaifest failed. Terminated."
-        exit 1
-    fi
+        if [ $? -ne 0 ]; then
+            echo "Formt mnaifest failed. Terminated."
+            exit 1
+        fi
+        } &
+        pids="${pids} $!"
+    done
+    # a bare 'wait' returns 0 even if a job failed; wait per PID to propagate the failure
+    for pid in ${pids}; do
+        wait ${pid} || exit 1
+    done
 fi
 
 echo "Aishell data preparation done."
diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh
index cdc352d1d..c6abce3b4 100755
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/s1/local/data.sh
@@ -14,7 +14,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
     python3 ${TARGET_DIR}/aishell/aishell.py \
     --manifest_prefix="data/manifest" \
     --target_dir="${TARGET_DIR}/aishell"
-    
+
     if [ $? -ne 0 ]; then
         echo "Prepare Aishell failed. Terminated."
         exit 1
@@ -33,7 +33,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --count_threshold=0 \
     --vocab_path="data/vocab.txt" \
     --manifest_paths "data/manifest.train.raw"
-    
+
     if [ $? -ne 0 ]; then
         echo "Build vocabulary failed. Terminated."
         exit 1
@@ -56,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     --num_samples=-1 \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
-    
+
     if [ $? -ne 0 ]; then
         echo "Compute mean and stddev failed. Terminated."
         exit 1
@@ -67,19 +67,28 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
+    # jobs run in the background; their PIDs are kept so a failure can be detected
+    pids=""
     for dataset in train dev test; do
+        {
         python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
-        --cmvn_path "data/mean_std.json" \
-        --unit_type "char" \
-        --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${dataset}.raw" \
-        --output_path="data/manifest.${dataset}"
+            --feat_type "raw" \
+            --cmvn_path "data/mean_std.json" \
+            --unit_type "char" \
+            --vocab_path="data/vocab.txt" \
+            --manifest_path="data/manifest.${dataset}.raw" \
+            --output_path="data/manifest.${dataset}"
+
+        if [ $? -ne 0 ]; then
+            echo "Formt mnaifest failed. Terminated."
+            exit 1
+        fi
+        } &
+        pids="${pids} $!"
     done
-
-    if [ $? -ne 0 ]; then
-        echo "Formt mnaifest failed. Terminated."
-        exit 1
-    fi
+    # a bare 'wait' returns 0 even if a job failed; wait per PID to propagate the failure
+    for pid in ${pids}; do
+        wait ${pid} || exit 1
+    done
 fi
 
 echo "Aishell data preparation done."
diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh
old mode 100644
new mode 100755
diff --git a/requirements.txt b/requirements.txt
index fc24e50e7..315bb69ec 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ SoundFile==0.9.0.post1
 sox
 tensorboardX
 typeguard
-yacs
\ No newline at end of file
+yacs
+pybind11
diff --git a/tests/mask_test.py b/tests/mask_test.py
index c4a843e32..cdad892f5 100644
--- a/tests/mask_test.py
+++ b/tests/mask_test.py
@@ -48,7 +48,9 @@ class TestU2Model(unittest.TestCase):
 
     def test_make_pad_mask(self):
         res = make_pad_mask(self.lengths)
+        res1 = make_non_pad_mask(self.lengths).logical_not()
         self.assertSequenceEqual(res.numpy().tolist(), self.pad_masks.tolist())
+        self.assertSequenceEqual(res.numpy().tolist(), res1.tolist())
 
 
 if __name__ == '__main__':