diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 460c24304..b3364881b 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -18,6 +18,7 @@ from deepspeech.exps.u2.model import U2Trainer from deepspeech.io.dataset import ManifestDataset from deepspeech.models.u2 import U2Model + _C = CfgNode() _C.data = ManifestDataset.params() diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py index 3b1fbe7c6..4f07d7464 100644 --- a/deepspeech/frontend/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -148,6 +148,8 @@ class FeatureNormalizer(object): batch_size=64, eps=1e-20): """Compute mean and std from randomly sampled instances.""" + paddle.set_device('cpu') + collate_func = CollateFunc(featurize_func) dataset = AudioDataset(manifest_path, num_samples, self._rng) data_loader = DataLoader( diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py index 6b93d089d..39a9654bc 100644 --- a/deepspeech/models/u2.py +++ b/deepspeech/models/u2.py @@ -61,12 +61,14 @@ class U2BaseModel(nn.Module): def params(cls, config: Optional[CfgNode]=None) -> CfgNode: # network architecture default = CfgNode() + # allow adding new keys when merging from a config file (merge_from_file) + default.set_new_allowed(True) default.cmvn_file = "" default.cmvn_file_type = "npz" default.input_dim = 0 default.output_dim = 0 # encoder related - default.encoder = 'conformer' + default.encoder = 'transformer' default.encoder_conf = CfgNode( dict( output_size=256, # dimension of attention @@ -78,11 +80,12 @@ class U2BaseModel(nn.Module): attention_dropout_rate=0.0, input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 normalize_before=True, - cnn_module_kernel=15, - use_cnn_module=True, - activation_type='swish', - pos_enc_layer_type='rel_pos', - selfattention_layer_type='rel_selfattn', )) + # use_cnn_module=True, + # cnn_module_kernel=15, + # activation_type='swish', + # pos_enc_layer_type='rel_pos', + #
selfattention_layer_type='rel_selfattn', + )) # decoder related default.decoder = 'transformer' default.decoder_conf = CfgNode( diff --git a/examples/librispeech/s1/conf/chunk_confermer.yaml b/examples/librispeech/s1/conf/chunk_confermer.yaml index 43e7802b6..3ee31e1b2 100644 --- a/examples/librispeech/s1/conf/chunk_confermer.yaml +++ b/examples/librispeech/s1/conf/chunk_confermer.yaml @@ -1,11 +1,11 @@ # https://yaml.org/type/float.html data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test vocab_filepath: data/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_200' + spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" augmentation_config: conf/augmentation.json batch_size: 4 diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index 721cc3f37..265e6e0b6 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -1,11 +1,11 @@ # https://yaml.org/type/float.html data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test vocab_filepath: data/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_200' + spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" augmentation_config: conf/augmentation.json batch_size: 64 diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 576d2ca0c..1981b946f 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -1,8 +1,8 @@ # https://yaml.org/type/float.html data: - train_manifest: data/manifest.tiny - dev_manifest: 
data/manifest.tiny - test_manifest: data/manifest.tiny + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index 8113a791f..8ef9e12f1 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -1,11 +1,11 @@ # https://yaml.org/type/float.html data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - vocab_filepath: data/vocab.txt + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + vocab_filepath: data/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_200' + spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" augmentation_config: conf/augmentation.json batch_size: 64 diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index 3c5cfc6fd..a83c3414d 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -27,8 +27,20 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi + for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do + mv data/manifest.${set} data/manifest.${set}.raw + done + for set in train-clean-100 train-clean-360 train-other-500; do - cat data/manifest.${set} >> data/manifest.train.raw + cat data/manifest.${set}.raw >> data/manifest.train.raw + done + + for set in dev-clean dev-other; do + cat data/manifest.${set}.raw >> data/manifest.dev.raw + done + + for set in test-clean test-other; do + cat data/manifest.${set}.raw >> data/manifest.test.raw done fi @@ -73,20 +85,24 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab 
size - python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ - --cmvn_path "data/mean_std.json" \ - --unit_type "spm" \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ - --manifest_path="data/manifest.train.raw" \ - --output_path="data/manifest.train" - - - if [ $? -ne 0 ]; then - echo "Formt mnaifest failed. Terminated." - exit 1 - fi + for set in train dev test dev-clean dev-other test-clean test-other; do + { + python3 ${MAIN_ROOT}/utils/format_data.py \ + --feat_type "raw" \ + --cmvn_path "data/mean_std.json" \ + --unit_type "spm" \ + --spm_model_prefix ${bpeprefix} \ + --vocab_path="data/vocab.txt" \ + --manifest_path="data/manifest.${set}.raw" \ + --output_path="data/manifest.${set}" + + if [ $? -ne 0 ]; then + echo "Formt mnaifest failed. Terminated." + exit 1 + fi + }& + done + wait fi echo "LibriSpeech Data preparation done." diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh index c166baecb..7cfc5686f 100755 --- a/examples/librispeech/s1/run.sh +++ b/examples/librispeech/s1/run.sh @@ -4,7 +4,8 @@ source path.sh stage=0 stop_stage=100 -ckpt=conformer +conf_path=conf/transformer.yaml +ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') avg_num=30 avg_ckpt=avg_${avg_num} @@ -17,7 +18,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh conf/conformer.yaml ${ckpt} + CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -27,10 +28,10 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=7 ./local/test.sh conf/conformer.yaml exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh conf/conformer.yaml exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh index d70863ed6..ffb1958c0 100644 --- a/examples/tiny/s1/run.sh +++ b/examples/tiny/s1/run.sh @@ -4,7 +4,8 @@ source path.sh stage=0 stop_stage=100 -ckpt=conformer +conf_path=conf/transformer.yaml +ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') avg_num=1 avg_ckpt=avg_${avg_num} @@ -17,7 +18,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=0 ./local/train.sh conf/conformer.yaml ${ckpt} + CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -27,10 +28,10 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/conformer.yaml exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh conf/conformer.yaml exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi