diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index 688f0cba9..80280f5cc 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -10,9 +10,9 @@ data: min_input_len: 0.0 max_input_len: 27.0 # second min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 + max_output_len: .inf + min_output_input_ratio: 0.00 + max_output_input_ratio: .inf specgram_type: linear target_sample_rate: 16000 max_freq: None @@ -21,7 +21,7 @@ data: window_ms: 20.0 delta_delta: False dither: 1.0 - use_dB_normalization: True + use_dB_normalization: True target_dB: -20 random_seed: 0 keep_transcription_text: False @@ -41,7 +41,7 @@ training: lr: 1e-3 lr_decay: 0.83 weight_decay: 1e-06 - global_grad_clip: 5.0 + global_grad_clip: 3.0 log_interval: 100 decoding: diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index 9c3ddcfac..921f1f49a 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -17,12 +17,12 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then --manifest_prefix="data/manifest" \ --target_dir="${TARGET_DIR}/librispeech" \ --full_download="True" - + if [ $? -ne 0 ]; then echo "Prepare LibriSpeech failed. Terminated." exit 1 fi - + for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do mv data/manifest.${set} data/manifest.${set}.raw done @@ -48,7 +48,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --count_threshold=0 \ --vocab_path="data/vocab.txt" \ --manifest_paths="data/manifest.train.raw" - + if [ $? -ne 0 ]; then echo "Build vocabulary failed. Terminated." 
exit 1 @@ -61,16 +61,16 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --num_samples=-1 \ + --num_samples=2000 \ --specgram_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ --stride_ms=10.0 \ --window_ms=20.0 \ - --use_dB_normalization=False \ + --use_dB_normalization=True \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" - if [ $? -ne 0 ]; then echo "Compute mean and stddev failed. Terminated." exit 1 diff --git a/examples/ngram_lm/README.md b/examples/ngram_lm/README.md new file mode 100644 index 000000000..698d7c290 --- /dev/null +++ b/examples/ngram_lm/README.md @@ -0,0 +1,7 @@ +# Ngram LM + +Train a Chinese character n-gram LM with [kenlm](https://github.com/kpu/kenlm). + +``` +bash run.sh +``` diff --git a/examples/spm/README.md b/examples/spm/README.md index 8b24b28e5..3109d3ffb 100644 --- a/examples/spm/README.md +++ b/examples/spm/README.md @@ -1,4 +1,6 @@ -# SPM demo +# [SentencePiece Model](https://github.com/google/sentencepiece) + +Train a `spm` model for an English tokenizer. ``` bash run.sh