diff --git a/README.md b/README.md
index ec2d0f302..c2058fd70 100644
--- a/README.md
+++ b/README.md
@@ -247,7 +247,7 @@ PaddleSpeech Text-to-Speech mainly contains three modules: *Text Frontend*, *Aco
- TransformerTTS |
+ Transformer TTS |
transformer-ljspeech
|
@@ -267,7 +267,7 @@ PaddleSpeech Text-to-Speech mainly contains three modules: *Text Frontend*, *Aco
- Vocoder |
+ Vocoder |
WaveFlow |
LJSpeech |
@@ -282,20 +282,33 @@ PaddleSpeech Text-to-Speech mainly contains three modules: *Text Frontend*, *Aco
|
- Voice Cloning |
- GE2E |
- AISHELL-3, etc. |
-
- ge2e
- |
+ Multi Band MelGAN |
+ CSMSC |
+
+ Multi Band MelGAN-csmsc
+ |
+
+
+ Voice Cloning |
+ GE2E |
+ AISHELL-3, etc. |
+
+ ge2e
+ |
- GE2E + Tactron2 |
- AISHELL-3 |
-
- ge2e-tactron2-aishell3
- |
-
+ GE2E + Tactron2 |
+ AISHELL-3 |
+
+ ge2e-tactron2-aishell3
+ |
+
+
+ GE2E + FastSpeech2 |
+ AISHELL-3 |
+
+ ge2e-fastspeech2-aishell3
+ |
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index ca04f6a74..2e3d8106e 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -4,13 +4,13 @@
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :-----------
-[Ds2 Online Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/s0)
-[Ds2 Offline Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/s0)
-[Conformer Online Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1)
-[Conformer Offline Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1)
-[Conformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1)
-[Transformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1)
-[Transformer Librispeech S2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2)
+[Ds2 Online Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [Ds2 Online Aishell S0 Example](../../examples/aishell/asr0)
+[Ds2 Offline Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0)
+[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1)
+[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1)
+[Conformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1)
+[Transformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1)
+[Transformer Librispeech ASR2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2)
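For reference, each entry in the table above links to a self-contained tarball. A minimal sketch of fetching and unpacking one of them (using the Ds2 Offline Aishell URL from the table; any of the listed URLs works the same way):

```bash
# download one of the released acoustic models listed above and unpack it
wget https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz
tar xzvf aishell.s0.ds2.offline.cer6p65.release.tar.gz
```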
### Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml
index 8fb4e6e8d..bdfa42199 100644
--- a/examples/aishell/asr0/conf/deepspeech2.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2.yaml
@@ -14,7 +14,7 @@ collator:
batch_size: 64 # one gpu
mean_std_filepath: data/mean_std.json
unit_type: char
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml
index 29ec2379f..010d8f155 100644
--- a/examples/aishell/asr0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml
@@ -14,7 +14,7 @@ collator:
batch_size: 64 # one gpu
mean_std_filepath: data/mean_std.json
unit_type: char
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
diff --git a/examples/aishell/asr0/local/data.sh b/examples/aishell/asr0/local/data.sh
index 23f04f2a6..1032cedc8 100755
--- a/examples/aishell/asr0/local/data.sh
+++ b/examples/aishell/asr0/local/data.sh
@@ -3,9 +3,12 @@
stage=-1
stop_stage=100
-source ${MAIN_ROOT}/utils/parse_options.sh
+dict_dir=data/lang_char
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
+mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@@ -52,7 +55,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
if [ $? -ne 0 ]; then
@@ -68,7 +71,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"
diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml
index 336a6c462..e07cd07c5 100644
--- a/examples/aishell/asr1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@@ -12,7 +12,7 @@ data:
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml
index 0e9d79d8b..154f44a25 100644
--- a/examples/aishell/asr1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@@ -12,7 +12,7 @@ data:
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml
index c021f66b7..d13f9e2f3 100644
--- a/examples/aishell/asr1/conf/transformer.yaml
+++ b/examples/aishell/asr1/conf/transformer.yaml
@@ -12,7 +12,7 @@ data:
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
diff --git a/examples/aishell/asr1/local/data.sh b/examples/aishell/asr1/local/data.sh
index 76e280752..418432318 100755
--- a/examples/aishell/asr1/local/data.sh
+++ b/examples/aishell/asr1/local/data.sh
@@ -2,10 +2,12 @@
stage=-1
stop_stage=100
+dict_dir=data/lang_char
-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
+mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@@ -53,7 +55,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
@@ -69,7 +71,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"
diff --git a/examples/aishell/asr1/model_performance.md b/examples/aishell/asr1/model_performance.md
index 8c53f95f6..da753634a 100644
--- a/examples/aishell/asr1/model_performance.md
+++ b/examples/aishell/asr1/model_performance.md
@@ -28,4 +28,4 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
-| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
\ No newline at end of file
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 056f35ba9..eb2cca2e2 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -5,7 +5,7 @@ AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpu
We use AISHELL-3 to train a multi-speaker fastspeech2 model here.
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download AISHELL-3.
```bash
wget https://www.openslr.org/resources/93/data_aishell3.tgz
@@ -15,7 +15,7 @@ Extract AISHELL-3.
mkdir data_aishell3
tar zxvf data_aishell3.tgz -C data_aishell3
```
-### Get MFA result of AISHELL-3 and Extract it
+### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which currently uses MFA1.x) of our repo.
@@ -32,7 +32,12 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -58,7 +63,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
-### Train the model
+### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
@@ -95,7 +100,7 @@ optional arguments:
5. `--phones-dict` is the path of the phone vocabulary file.
6. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2.
-### Synthesize
+### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh
index 95e4d38fe..b375f2159 100755
--- a/examples/aishell3/tts3/run.sh
+++ b/examples/aishell3/tts3/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md
index 376d4a331..fa5c66941 100644
--- a/examples/aishell3/vc0/README.md
+++ b/examples/aishell3/vc0/README.md
@@ -16,11 +16,15 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path}
```
-#### generate speaker embedding
+#### Generate Speaker Embedding
Use the pretrained GE2E (speaker encoder) to generate a speaker embedding for each sentence in AISHELL-3; the embeddings have the same file structure as the wav files and are stored in `.npy` format.
```bash
@@ -34,7 +38,7 @@ fi
```
The computing time of utterance embedding can be x hours.
-#### process wav
+#### Process Wav
There is silence at the edges of AISHELL-3's wavs, and the audio amplitude is very small, so we need to remove the silence and normalize the audio. You could use a silence-removal method based on volume or energy, but the effect is not very good. Instead, we use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then use the alignment results to remove the silence.
We use Montreal Forced Aligner 1.0. The labels in AISHELL-3 include pinyin, so the lexicon we provide to MFA is pinyin rather than Chinese characters, and the prosody marks (`$` and `%`) need to be removed. You should preprocess the dataset into the format MFA needs: the texts have the same names as the wavs, with the suffix `.lab`.
@@ -53,7 +57,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
```
-#### preprocess transcription
+#### Preprocess Transcription
We convert the transcription into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA: we separate the tones. This is just one processing method; of course, you could instead segment only initials and vowels.
```bash
@@ -64,7 +68,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
```
The default input is `~/datasets/data_aishell3/train`, which contains `label_train-set.txt`. The processed results are `metadata.yaml` and `metadata.pickle`: the former is a text format for easy viewing, and the latter is a binary format for direct reading.
-#### extract mel
+#### Extract Mel
```python
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/extract_mel.py \
@@ -73,7 +77,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
fi
```
-### Train the model
+### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
```
diff --git a/examples/aishell3/vc0/run.sh b/examples/aishell3/vc0/run.sh
index 8d3da7813..870360c1c 100755
--- a/examples/aishell3/vc0/run.sh
+++ b/examples/aishell3/vc0/run.sh
@@ -23,7 +23,7 @@ waveflow_params_path=./waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams
vc_input=ref_audio
vc_output=syn_audio
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md
index ae53443ef..635cde896 100644
--- a/examples/aishell3/vc1/README.md
+++ b/examples/aishell3/vc1/README.md
@@ -5,7 +5,7 @@ This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2
3. Vocoder: We use [Parallel Wave GAN](http://arxiv.org/abs/1910.11480) as the neural Vocoder, refer to [voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1).
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download AISHELL-3.
```bash
wget https://www.openslr.org/resources/93/data_aishell3.tgz
@@ -15,11 +15,11 @@ Extract AISHELL-3.
mkdir data_aishell3
tar zxvf data_aishell3.tgz -C data_aishell3
```
-### Get MFA result of AISHELL-3 and Extract it
+### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which currently uses MFA1.x) of our repo.
-## Pretrained GE2E model
+## Pretrained GE2E Model
We use a pretrained GE2E model to generate a speaker embedding for each sentence.
Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it.
@@ -38,7 +38,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path}
```
@@ -75,14 +79,14 @@ Also there is a `metadata.jsonl` in each subfolder. It is a table-like file whic
The preprocessing step is very similar to that of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but there is one more `ge2e/inference` step here.
-### Train the model
+### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
The training step is very similar to that of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/train.py`.
-### Synthesize
+### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
diff --git a/examples/aishell3/vc1/run.sh b/examples/aishell3/vc1/run.sh
index 4eae1bdd8..64f4ee3bc 100755
--- a/examples/aishell3/vc1/run.sh
+++ b/examples/aishell3/vc1/run.sh
@@ -18,7 +18,7 @@ ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
# include ".pdparams" here
ge2e_params_path=${ge2e_ckpt_path}.pdparams
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
index bc28bba10..6ee6d39b1 100644
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -3,7 +3,7 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a
AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus which could be used to train multi-speaker Text-to-Speech (TTS) systems.
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download AISHELL-3.
```bash
wget https://www.openslr.org/resources/93/data_aishell3.tgz
@@ -13,7 +13,7 @@ Extract AISHELL-3.
mkdir data_aishell3
tar zxvf data_aishell3.tgz -C data_aishell3
```
-### Get MFA result of AISHELL-3 and Extract it
+### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which currently uses MFA1.x) of our repo.
@@ -29,7 +29,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -53,7 +57,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and paths to the spectrogram of each utterance.
-### Train the model
+### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
@@ -100,7 +104,7 @@ benchmark:
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
-### Synthesize
+### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
diff --git a/examples/aishell3/voc1/run.sh b/examples/aishell3/voc1/run.sh
index 7d0fdb21e..4f426ea02 100755
--- a/examples/aishell3/voc1/run.sh
+++ b/examples/aishell3/voc1/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml
index b18b46fe6..d20d2b9a6 100644
--- a/examples/callcenter/asr1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml
@@ -12,7 +12,7 @@ data:
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml
index 47c438a6d..f86cd4a36 100644
--- a/examples/callcenter/asr1/conf/conformer.yaml
+++ b/examples/callcenter/asr1/conf/conformer.yaml
@@ -12,7 +12,7 @@ data:
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
diff --git a/examples/callcenter/asr1/local/data.sh b/examples/callcenter/asr1/local/data.sh
index c40c752ab..fe2d3429c 100755
--- a/examples/callcenter/asr1/local/data.sh
+++ b/examples/callcenter/asr1/local/data.sh
@@ -2,10 +2,12 @@
stage=-1
stop_stage=100
+dict_dir=data/lang_char
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
+mkdir -p ${dict_dir}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
for dataset in train dev test; do
@@ -41,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
@@ -57,7 +59,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index 5ebf3cf4e..80e50fe93 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -2,10 +2,10 @@
This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2008.03802) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). Note that we only implement the student part of the SpeedySpeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner).
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
-### Get MFA result of CSMSC and Extract it
+### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
@@ -23,7 +23,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -47,7 +51,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, tones, durations, path of spectrogram, and id of each utterance.
-### Train the model
+### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
@@ -88,7 +92,7 @@ optional arguments:
5. `--phones-dict` is the path of the phone vocabulary file.
6. `--tones-dict` is the path of the tone vocabulary file.
-### Synthesize
+### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
@@ -200,7 +204,7 @@ optional arguments:
7. `--phones-dict` is the path of the phone vocabulary file.
8. `--tones-dict` is the path of the tone vocabulary file.
-### Inference
+### Inferencing
After synthesizing, we will get static models of speedyspeech and pwgan in `${train_output_path}/inference`.
`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for speedyspeech + pwgan synthesis.
```bash
diff --git a/examples/csmsc/tts2/run.sh b/examples/csmsc/tts2/run.sh
index 200e81929..8b8f53bd0 100755
--- a/examples/csmsc/tts2/run.sh
+++ b/examples/csmsc/tts2/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 104964c85..c99690e1f 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -2,10 +2,10 @@
This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
-### Get MFA result of CSMSC and Extract it
+### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
@@ -23,7 +23,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -50,7 +54,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
-### Train the model
+### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
@@ -86,7 +90,7 @@ optional arguments:
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
5. `--phones-dict` is the path of the phone vocabulary file.
-### Synthesize
+### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
@@ -191,7 +195,7 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
-### Inference
+### Inferencing
After synthesizing, we will get static models of fastspeech2 and pwgan in `${train_output_path}/inference`.
`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for fastspeech2 + pwgan synthesis.
```bash
diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh
index 718d60760..c1ddd3b98 100755
--- a/examples/csmsc/tts3/run.sh
+++ b/examples/csmsc/tts3/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
index 86114a423..9d516be43 100644
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -1,11 +1,11 @@
# Parallel WaveGAN with CSMSC
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
-### Get MFA results for silence trim
-We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio.
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
## Get Started
@@ -20,7 +20,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -43,7 +47,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and paths to the spectrogram of each utterance.
-### Train the model
+### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
@@ -90,7 +94,7 @@ benchmark:
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
-### Synthesize
+### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
diff --git a/examples/csmsc/voc1/run.sh b/examples/csmsc/voc1/run.sh
index 163095439..cab1ac38b 100755
--- a/examples/csmsc/voc1/run.sh
+++ b/examples/csmsc/voc1/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index 4925b649d..0a64d1a18 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -1,11 +1,11 @@
# Multi Band MelGAN with CSMSC
This example contains code used to train a [Multi Band MelGAN](https://arxiv.org/abs/2005.05106) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
-### Get MFA results for silence trim
-We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio.
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.
## Get Started
@@ -20,7 +20,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -43,7 +47,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and paths to the spectrogram of each utterance.
-### Train the model
+### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
@@ -75,7 +79,7 @@ optional arguments:
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
-### Synthesize
+### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
@@ -106,7 +110,7 @@ optional arguments:
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
-## Finetune
+## Fine-tuning
Since there is no `noise` in the input of Multi Band MelGAN, the audio quality is not so good (see [espnet issue](https://github.com/espnet/espnet/issues/3536#issuecomment-916035415)). We therefore follow the method proposed in [HiFiGAN](https://arxiv.org/abs/2010.05646) and finetune Multi Band MelGAN with the predicted mel-spectrogram from `FastSpeech2`.
The length of mel-spectrograms should align with the length of wavs, so we should generate mels using ground truth alignment.
@@ -144,7 +148,7 @@ Run the command below
By default, `finetune.sh` will use `conf/finetune.yaml` as config, the dump-dir is `dump_finetune`, the experiment dir is `exp/finetune`.
TODO:
-The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set).
+The hyperparameters in `finetune.yaml` are not good enough; a smaller `learning_rate` should be used (and more `milestones` should be set).
## Pretrained Models
Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip).
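A minimal sketch of grabbing that pretrained checkpoint (URL taken from the line above) so it can serve as the starting point for fine-tuning:

```bash
# download and unpack the pretrained Multi Band MelGAN checkpoint
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip
unzip mb_melgan_baker_ckpt_0.5.zip
```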
diff --git a/examples/csmsc/voc3/run.sh b/examples/csmsc/voc3/run.sh
index 360f6ec2a..3e7d7e2ab 100755
--- a/examples/csmsc/voc3/run.sh
+++ b/examples/csmsc/voc3/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_50000.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py
index 95ed04086..7431fc083 100644
--- a/examples/dataset/aishell/aishell.py
+++ b/examples/dataset/aishell/aishell.py
@@ -82,7 +82,7 @@ def create_manifest(data_dir, manifest_path_prefix):
# if no transcription for audio then skipped
if audio_id not in transcript_dict:
continue
-
+
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py
index a8cbb8379..9a3ba3b31 100644
--- a/examples/dataset/ted_en_zh/ted_en_zh.py
+++ b/examples/dataset/ted_en_zh/ted_en_zh.py
@@ -73,7 +73,6 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
-
translation_str = " ".join(translation.split())
trancription_str = " ".join(trancription.split())
json_lines.append(
@@ -82,7 +81,7 @@ def create_manifest(data_dir, manifest_path_prefix):
'utt': utt,
'feat': audio_path,
'feat_shape': (duration, ), # second
- 'text': [translation_str, trancription_str],
+ 'text': [translation_str, trancription_str],
},
ensure_ascii=False))
diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py
index 2ec4ddab2..cdfc0a75c 100644
--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
@@ -124,7 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
- 'utt2spk', spk,
+ 'utt2spk': spk,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': word_text, # charactor
diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/examples/dataset/timit/timit_kaldi_standard_split.py
index 26aa76c72..473fc856f 100644
--- a/examples/dataset/timit/timit_kaldi_standard_split.py
+++ b/examples/dataset/timit/timit_kaldi_standard_split.py
@@ -22,9 +22,9 @@ import argparse
import codecs
import json
import os
+from pathlib import Path
import soundfile
-from pathlib import Path
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml
index 8afaabf42..70fa3fcb2 100644
--- a/examples/librispeech/asr0/conf/deepspeech2.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2.yaml
@@ -14,7 +14,7 @@ collator:
batch_size: 20
mean_std_filepath: data/mean_std.json
unit_type: char
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
index d6ab95237..3e07862d6 100644
--- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
@@ -14,7 +14,7 @@ collator:
batch_size: 15
mean_std_filepath: data/mean_std.json
unit_type: char
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
diff --git a/examples/librispeech/asr0/local/data.sh b/examples/librispeech/asr0/local/data.sh
index 0f276ceca..fa2c9b2f7 100755
--- a/examples/librispeech/asr0/local/data.sh
+++ b/examples/librispeech/asr0/local/data.sh
@@ -4,10 +4,12 @@ stage=-1
stop_stage=100
unit_type=char
+dict_dir=data/lang_char
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
+mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@@ -67,7 +69,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type ${unit_type} \
--count_threshold=0 \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
@@ -83,7 +85,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
diff --git a/examples/librispeech/asr1/README.md b/examples/librispeech/asr1/README.md
index 20255db8e..2ea55fc90 100644
--- a/examples/librispeech/asr1/README.md
+++ b/examples/librispeech/asr1/README.md
@@ -21,7 +21,7 @@
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098, | 0.049795 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098, | 0.054892 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098, | 0.054531 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098, | 0.042244 |
\ No newline at end of file
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.733129533131917 | 0.047874 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.733129533131917 | 0.053922 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.733129533131917 | 0.053427 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.733129533131917 | 0.041369 |
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 2bfb0fb6f..4a5741904 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -11,9 +11,9 @@ data:
max_output_input_ratio: 100.0
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
- spm_model_prefix: 'data/bpe_unigram_5000'
+ spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml
batch_size: 16
diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index c844baaaf..684b62976 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -11,9 +11,9 @@ data:
max_output_input_ratio: 100.0
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
- spm_model_prefix: 'data/bpe_unigram_5000'
+ spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml
batch_size: 16
diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index 5a158f3ed..1806f3fd6 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -11,9 +11,9 @@ data:
max_output_input_ratio: 100.0
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
- spm_model_prefix: 'data/bpe_unigram_5000'
+ spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml
batch_size: 32
diff --git a/examples/librispeech/asr1/local/data.sh b/examples/librispeech/asr1/local/data.sh
index 35f4e635f..a0bf9a2d3 100755
--- a/examples/librispeech/asr1/local/data.sh
+++ b/examples/librispeech/asr1/local/data.sh
@@ -2,11 +2,12 @@
stage=-1
stop_stage=100
+dict_dir=data/lang_char
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
-bpeprefix="data/bpe_${bpemode}_${nbpe}"
+bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
stride_ms=10
window_ms=25
@@ -17,6 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
+mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@@ -79,7 +81,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
@@ -96,7 +98,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${sub}.raw" \
--output_path="data/manifest.${sub}"
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
index 305add204..d49d6d6b6 100644
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@@ -1,4 +1,4 @@
-# Tacotron2 with LJSpeech
+# Tacotron2 with LJSpeech
PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
## Dataset
@@ -18,11 +18,15 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
-### Train the model
+### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
@@ -51,7 +55,7 @@ By default, training will be resumed from the latest checkpoint in `--output`, i
And if you want to resume from another existing model, you should set `checkpoint_path` to the checkpoint path you want to load.
**Note: The checkpoint path cannot contain the file extension.**
-### Synthesize
+### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesizes **mels** from text_list here.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name}
diff --git a/examples/ljspeech/tts0/run.sh b/examples/ljspeech/tts0/run.sh
index 1da80c962..47c76c3d2 100755
--- a/examples/ljspeech/tts0/run.sh
+++ b/examples/ljspeech/tts0/run.sh
@@ -11,7 +11,7 @@ preprocess_path=preprocessed_ljspeech
train_output_path=output
ckpt_name=step-35000
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 8a43ecd9c..c2d0c59e8 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -1,11 +1,9 @@
# TransformerTTS with LJSpeech
## Dataset
-### Download the datasaet
+We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
+
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
-```
-### Extract the dataset
-```bash
tar xjvf LJSpeech-1.1.tar.bz2
```
## Get Started
@@ -20,7 +18,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -44,7 +46,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, path of speech features, speaker and id of each utterance.
-### Train the model
+### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
@@ -77,7 +79,7 @@ optional arguments:
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
5. `--phones-dict` is the path of the phone vocabulary file.
-## Synthesize
+## Synthesizing
We use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder.
Download the pretrained WaveFlow model with a residual channel size of 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip) and unzip it.
```bash
diff --git a/examples/ljspeech/tts1/run.sh b/examples/ljspeech/tts1/run.sh
index 6e7a60607..48c4c9151 100755
--- a/examples/ljspeech/tts1/run.sh
+++ b/examples/ljspeech/tts1/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_403.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be used together with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index 5bdaf4b82..bb5c7a69e 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -2,10 +2,10 @@
This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/).
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
-### Get MFA result of LJSpeech-1.1 and Extract it
+### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download it from here: [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
@@ -22,7 +22,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -49,7 +53,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
-### Train the model
+### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
@@ -85,7 +89,7 @@ optional arguments:
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
5. `--phones-dict` is the path of the phone vocabulary file.
-### Synthesize
+### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
```bash
diff --git a/examples/ljspeech/tts3/run.sh b/examples/ljspeech/tts3/run.sh
index 143debd2a..c64fa8883 100755
--- a/examples/ljspeech/tts3/run.sh
+++ b/examples/ljspeech/tts3/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_201.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md
index 0d4e6c51a..725eb617b 100644
--- a/examples/ljspeech/voc0/README.md
+++ b/examples/ljspeech/voc0/README.md
@@ -1,11 +1,9 @@
# WaveFlow with LJSpeech
## Dataset
-### Download the datasaet.
+We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
+
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
-```
-### Extract the dataset.
-```bash
tar xjvf LJSpeech-1.1.tar.bz2
```
## Get Started
@@ -19,11 +17,15 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset.
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${preprocess_path}
```
-### Train the model
+### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
@@ -35,7 +37,7 @@ The training script requires 4 command line arguments.
If you want distributed training, set a larger `--ngpu` (e.g. 4). Note that distributed training with cpu is not supported yet.
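A hedged sketch of such a multi-GPU run, calling `train.py` directly (the `--data`, `--output` and `--ngpu` flag names are assumptions here; verify them with `python3 ${BIN_DIR}/train.py --help` first):
```bash
# hypothetical 4-GPU training run; the flag names are assumptions, check --help before use
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 ${BIN_DIR}/train.py \
    --data=${preprocess_path} \
    --output=${train_output_path} \
    --ngpu=4
```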
-### Synthesize
+### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name}
diff --git a/examples/ljspeech/voc0/run.sh b/examples/ljspeech/voc0/run.sh
index a4f1ac389..ddd82cb44 100755
--- a/examples/ljspeech/voc0/run.sh
+++ b/examples/ljspeech/voc0/run.sh
@@ -13,7 +13,7 @@ train_output_path=output
input_mel_path=../tts0/output/test
ckpt_name=step-10000
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 24f6dbcaf..7cb69b154 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -1,10 +1,10 @@
# Parallel WaveGAN with the LJSpeech-1.1
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/).
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
-### Get MFA results for silence trim
-We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio.
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edges of the audio.
You can download it from [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
## Get Started
@@ -19,8 +19,11 @@ Run the command below to
```bash
./run.sh
```
-
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -44,7 +47,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and the path to the spectrogram of each utterance.
-### Train the model
+### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
@@ -91,7 +94,7 @@ benchmark:
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
-### Synthesize
+### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
@@ -122,7 +125,7 @@ optional arguments:
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
-## Pretrained Models
+## Pretrained Model
Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)
Parallel WaveGAN checkpoint contains files listed below.
diff --git a/examples/ljspeech/voc1/run.sh b/examples/ljspeech/voc1/run.sh
index 163095439..cab1ac38b 100755
--- a/examples/ljspeech/voc1/run.sh
+++ b/examples/ljspeech/voc1/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/other/ge2e/README.md b/examples/other/ge2e/README.md
index d58ca5137..2b3f91b52 100644
--- a/examples/other/ge2e/README.md
+++ b/examples/other/ge2e/README.md
@@ -24,8 +24,11 @@ If you want to use other datasets, you can also download and preprocess it as lo
```bash
./run.sh
```
-
-### Preprocess Datasets
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
`./local/preprocess.sh` calls `${BIN_DIR}/preprocess.py`.
```bash
./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names}
@@ -62,7 +65,7 @@ In `${BIN_DIR}/preprocess.py`:
2. `--output_dir` is the directory to save the preprocessed dataset
3. `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
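As a concrete, hypothetical invocation, preprocessing two of the supported corpora that live under `~/datasets` could look like this (the paths are placeholders):
```bash
# dataset names are joined with a comma; adjust both paths to your local layout
./local/preprocess.sh ~/datasets ./preprocessed librispeech_other,voxceleb1
```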
-### Train the model
+### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
@@ -79,7 +82,7 @@ Other options are described below.
- `--opts` is command line options to further override config files. It should be the last command line option, passed with multiple key-value pairs separated by spaces.
- `--checkpoint_path` specifies the checkpoint to load before training; the extension is not included. A parameter file (`.pdparams`) and an optimizer state file (`.pdopt`) with the same name are used. This option has a higher priority than auto-resuming from the `--output` directory.
-### Inference
+### Inferencing
When training is done, run the command below to generate utterance embedding for each utterance in a dataset.
`./local/inference.sh` calls `${BIN_DIR}/inference.py`.
```bash
diff --git a/examples/other/ge2e/run.sh b/examples/other/ge2e/run.sh
index d7954bd2f..e69b34a7c 100755
--- a/examples/other/ge2e/run.sh
+++ b/examples/other/ge2e/run.sh
@@ -15,7 +15,7 @@ infer_input=infer_input
infer_output=infer_output
ckpt_name=step-10000
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml
index 8a7e10f0a..5a05fa46e 100644
--- a/examples/ted_en_zh/st0/conf/transformer.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer.yaml
@@ -11,9 +11,9 @@ data:
max_output_input_ratio: 20.0
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
- spm_model_prefix: data/bpe_unigram_8000
+ spm_model_prefix: data/lang_char/bpe_unigram_8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
batch_size: 10
diff --git a/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml
index 9c1ac91a9..8256f7160 100644
--- a/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml
@@ -11,9 +11,9 @@ data:
max_output_input_ratio: 20.0
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
- spm_model_prefix: data/bpe_unigram_8000
+ spm_model_prefix: data/lang_char/bpe_unigram_8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
batch_size: 10
diff --git a/examples/ted_en_zh/st0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh
index d3acbd448..fb4efbe35 100755
--- a/examples/ted_en_zh/st0/local/data.sh
+++ b/examples/ted_en_zh/st0/local/data.sh
@@ -4,19 +4,22 @@ set -e
stage=-1
stop_stage=100
+dict_dir=data/lang_char
# bpemode (unigram or bpe)
nbpe=8000
bpemode=unigram
-bpeprefix="data/bpe_${bpemode}_${nbpe}"
+bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
data_dir=./TED-En-Zh
-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data
+mkdir -p ${dict_dir}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
@@ -73,11 +76,10 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--text_keys 'text' 'text1' \
--manifest_paths="data/manifest.train.raw"
-
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
@@ -92,7 +94,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
diff --git a/examples/ted_en_zh/st1/.gitignore b/examples/ted_en_zh/st1/.gitignore
new file mode 100644
index 000000000..123e5174a
--- /dev/null
+++ b/examples/ted_en_zh/st1/.gitignore
@@ -0,0 +1,3 @@
+TED_EnZh
+data
+exp
diff --git a/examples/ted_en_zh/st1/README.md b/examples/ted_en_zh/st1/README.md
new file mode 100644
index 000000000..e8aed53ec
--- /dev/null
+++ b/examples/ted_en_zh/st1/README.md
@@ -0,0 +1,16 @@
+
+# TED En-Zh
+
+## Dataset
+
+| Data Subset | Duration in Frames |
+| --- | --- |
+| data/manifest.train | 94.2 ~ 6000 |
+| data/manifest.dev | 115.1 ~ 3900 |
+| data/manifest.test | 110 ~ 4274.6 |
+
+## Transformer
+| Model | Params | Config | Val loss | Char-BLEU |
+| --- | --- | --- | --- | --- |
+| FAT + Transformer+ASR MTL | 50.26M | conf/transformer_mtl_noam.yaml | 62.86 | 19.45 |
+| FAT + Transformer+ASR MTL with word reward | 50.26M | conf/transformer_mtl_noam.yaml | 62.86 | 20.80 |
diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml
new file mode 100644
index 000000000..d553bde77
--- /dev/null
+++ b/examples/ted_en_zh/st1/conf/transformer.yaml
@@ -0,0 +1,112 @@
+# https://yaml.org/type/float.html
+data:
+ train_manifest: data/manifest.train.tiny
+ dev_manifest: data/manifest.dev
+ test_manifest: data/manifest.test
+ min_input_len: 5.0 # frame
+ max_input_len: 3000.0 # frame
+ min_output_len: 0.0 # tokens
+ max_output_len: 400.0 # tokens
+ min_output_input_ratio: 0.01
+ max_output_input_ratio: 20.0
+
+collator:
+ vocab_filepath: data/lang_char/vocab.txt
+ unit_type: 'spm'
+ spm_model_prefix: data/lang_char/bpe_unigram_8000
+ mean_std_filepath: ""
+ # augmentation_config: conf/augmentation.json
+ batch_size: 10
+ raw_wav: True # use raw_wav or kaldi feature
+ spectrum_type: fbank #linear, mfcc, fbank
+ feat_dim: 83
+ delta_delta: False
+ dither: 1.0
+ target_sample_rate: 16000
+ max_freq: None
+ n_fft: None
+ stride_ms: 10.0
+ window_ms: 25.0
+ use_dB_normalization: True
+ target_dB: -20
+ random_seed: 0
+ keep_transcription_text: False
+ sortagrad: True
+ shuffle_method: batch_shuffle
+ num_workers: 2
+
+
+# network architecture
+model:
+ cmvn_file: None
+ cmvn_file_type: "json"
+ # encoder related
+ encoder: transformer
+ encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: true
+
+ # decoder related
+ decoder: transformer
+ decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+
+ # hybrid CTC/attention
+ model_conf:
+ asr_weight: 0.0
+ ctc_weight: 0.0
+ ctc_dropoutrate: 0.0
+ ctc_grad_norm_type: null
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false
+
+
+training:
+ n_epoch: 20
+ accum_grad: 2
+ global_grad_clip: 5.0
+ optim: adam
+ optim_conf:
+ lr: 0.004
+ weight_decay: 1e-06
+ scheduler: warmuplr # pytorch v1.1.0+ required
+ scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+ log_interval: 5
+ checkpoint:
+ kbest_n: 50
+ latest_n: 5
+
+
+decoding:
+ batch_size: 5
+ error_rate_type: char-bleu
+ decoding_method: fullsentence # 'fullsentence', 'simultaneous'
+ alpha: 2.5
+ beta: 0.3
+ beam_size: 10
+ word_reward: 0.7
+ cutoff_prob: 1.0
+ cutoff_top_n: 0
+ num_proc_bsearch: 8
+ ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+ decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+ # <0: for decoding, use full chunk.
+ # >0: for decoding, use fixed chunk size as set.
+ # 0: used for training, it's prohibited here.
+ num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+ simulate_streaming: False # simulate streaming inference. Defaults to False.
diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
new file mode 100644
index 000000000..b4fb51075
--- /dev/null
+++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
@@ -0,0 +1,112 @@
+# https://yaml.org/type/float.html
+data:
+ train_manifest: data/manifest.train
+ dev_manifest: data/manifest.dev
+ test_manifest: data/manifest.test
+ min_input_len: 5.0 # frame
+ max_input_len: 3000.0 # frame
+ min_output_len: 0.0 # tokens
+ max_output_len: 400.0 # tokens
+ min_output_input_ratio: 0.01
+ max_output_input_ratio: 20.0
+
+collator:
+ vocab_filepath: data/lang_char/vocab.txt
+ unit_type: 'spm'
+ spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc
+ mean_std_filepath: ""
+ # augmentation_config: conf/augmentation.json
+ batch_size: 10
+ raw_wav: True # use raw_wav or kaldi feature
+ spectrum_type: fbank #linear, mfcc, fbank
+ feat_dim: 83
+ delta_delta: False
+ dither: 1.0
+ target_sample_rate: 16000
+ max_freq: None
+ n_fft: None
+ stride_ms: 10.0
+ window_ms: 25.0
+ use_dB_normalization: True
+ target_dB: -20
+ random_seed: 0
+ keep_transcription_text: False
+ sortagrad: True
+ shuffle_method: batch_shuffle
+ num_workers: 2
+
+
+# network architecture
+model:
+ cmvn_file: None
+ cmvn_file_type: "json"
+ # encoder related
+ encoder: transformer
+ encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: true
+
+ # decoder related
+ decoder: transformer
+ decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+
+ # hybrid CTC/attention
+ model_conf:
+ asr_weight: 0.5
+ ctc_weight: 0.3
+ ctc_dropoutrate: 0.0
+ ctc_grad_norm_type: null
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false
+
+
+training:
+ n_epoch: 20
+ accum_grad: 2
+ global_grad_clip: 5.0
+ optim: adam
+ optim_conf:
+ lr: 2.5
+ weight_decay: 1e-06
+ scheduler: noam
+ scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+ log_interval: 5
+ checkpoint:
+ kbest_n: 50
+ latest_n: 5
+
+
+decoding:
+ batch_size: 5
+ error_rate_type: char-bleu
+ decoding_method: fullsentence # 'fullsentence', 'simultaneous'
+ alpha: 2.5
+ beta: 0.3
+ beam_size: 10
+ word_reward: 0.7
+ cutoff_prob: 1.0
+ cutoff_top_n: 0
+ num_proc_bsearch: 8
+ ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+ decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+ # <0: for decoding, use full chunk.
+ # >0: for decoding, use fixed chunk size as set.
+ # 0: used for training, it's prohibited here.
+ num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+ simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
diff --git a/examples/ted_en_zh/st1/local/convert_torch_to_paddle.py b/examples/ted_en_zh/st1/local/convert_torch_to_paddle.py
new file mode 100644
index 000000000..4f4bfde8d
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/convert_torch_to_paddle.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import paddle
+import torch
+
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+def torch2paddle(args):
+ paddle.set_device('cpu')
+ paddle_model_dict = {}
+ torch_model = torch.load(args.torch_ckpt, map_location='cpu')
+ cnt = 0
+ for k, v in torch_model['model'].items():
+ # encoder.embed.* --> encoder.embed.*
+ if k.startswith('encoder.embed'):
+ if v.ndim == 2:
+ v = v.transpose(0, 1)
+ paddle_model_dict[k] = v.numpy()
+ cnt += 1
+ logger.info(
+ f"Convert torch weight: {k} to paddlepaddle weight: {k}, shape is {v.shape}"
+ )
+
+ # encoder.after_norm.* --> encoder.after_norm.*
+ # encoder.after_norm.* --> decoder.after_norm.*
+ # encoder.after_norm.* --> st_decoder.after_norm.*
+ if k.startswith('encoder.after_norm'):
+ paddle_model_dict[k] = v.numpy()
+ cnt += 1
+ paddle_model_dict[k.replace('en', 'de')] = v.numpy()
+ logger.info(
+ f"Convert torch weight: {k} to paddlepaddle weight: {k.replace('en','de')}, shape is {v.shape}"
+ )
+ paddle_model_dict['st_' + k.replace('en', 'de')] = v.numpy()
+ logger.info(
+ f"Convert torch weight: {k} to paddlepaddle weight: {'st_'+ k.replace('en','de')}, shape is {v.shape}"
+ )
+ cnt += 2
+
+ # encoder.encoders.* --> encoder.encoders.*
+ # encoder.encoders.* (last six layers) --> decoder.encoders.* (first six layers)
+ # encoder.encoders.* (last six layers) --> st_decoder.encoders.* (first six layers)
+ if k.startswith('encoder.encoders'):
+ if v.ndim == 2:
+ v = v.transpose(0, 1)
+ paddle_model_dict[k] = v.numpy()
+ logger.info(
+ f"Convert torch weight: {k} to paddlepaddle weight: {k}, shape is {v.shape}"
+ )
+ cnt += 1
+ origin_k = k
+ k_split = k.split('.')
+ if int(k_split[2]) >= 6:
+ k = k.replace(k_split[2], str(int(k_split[2]) - 6))
+ paddle_model_dict[k.replace('en', 'de')] = v.numpy()
+ logger.info(
+ f"Convert torch weight: {origin_k} to paddlepaddle weight: {k.replace('en','de')}, shape is {v.shape}"
+ )
+ paddle_model_dict['st_' + k.replace('en', 'de')] = v.numpy()
+ logger.info(
+ f"Convert torch weight: {origin_k} to paddlepaddle weight: {'st_'+ k.replace('en','de')}, shape is {v.shape}"
+ )
+ cnt += 2
+ logger.info(f"Convert {cnt} weights totally from torch to paddlepaddle")
+ paddle.save(paddle_model_dict, args.paddle_ckpt)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ '--torch_ckpt',
+ type=str,
+ default='/home/snapshot.ep.98',
+ help="Path to torch checkpoint.")
+ parser.add_argument(
+ '--paddle_ckpt',
+ type=str,
+ default='paddle.98.pdparams',
+ help="Path to save paddlepaddle checkpoint.")
+ args = parser.parse_args()
+ torch2paddle(args)
diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh
new file mode 100755
index 000000000..2e9d05d10
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/data.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+set -e
+
+stage=-1
+stop_stage=100
+dict_dir=data/lang_char
+
+# bpemode (unigram or bpe)
+nbpe=8000
+bpemode=unigram
+bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
+data_dir=./TED_EnZh
+
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+TARGET_DIR=${MAIN_ROOT}/examples/dataset
+mkdir -p ${TARGET_DIR}
+mkdir -p data
+mkdir -p ${dict_dir}
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+ if [ ! -e ${data_dir} ]; then
+        echo "Error: Dataset is not available. Please download and unzip the dataset"
+ echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
+ echo "The tree of the directory should be:"
+ echo "."
+ echo "|-- En-Zh"
+ echo "|-- test-segment"
+ echo " |-- tst2010"
+ echo " |-- ..."
+ echo "|-- train-split"
+ echo " |-- train-segment"
+ echo "|-- README.md"
+
+ exit 1
+ fi
+
+ # generate manifests
+ python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
+ --manifest_prefix="data/manifest" \
+ --src_dir="${data_dir}"
+
+    echo "Raw data pre-processing complete."
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # compute mean and stddev for normalizer
+ num_workers=$(nproc)
+ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+ --manifest_path="data/manifest.train.raw" \
+ --num_samples=-1 \
+ --spectrum_type="fbank" \
+ --feat_dim=80 \
+ --delta_delta=false \
+ --sample_rate=16000 \
+ --stride_ms=10.0 \
+ --window_ms=25.0 \
+ --use_dB_normalization=False \
+ --num_workers=${num_workers} \
+ --output_path="data/mean_std.json"
+
+ if [ $? -ne 0 ]; then
+ echo "Compute mean and stddev failed. Terminated."
+ exit 1
+ fi
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # build vocabulary
+ python3 ${MAIN_ROOT}/utils/build_vocab.py \
+ --unit_type "spm" \
+ --spm_vocab_size=${nbpe} \
+ --spm_mode ${bpemode} \
+ --spm_model_prefix ${bpeprefix} \
+ --vocab_path="${dict_dir}/vocab.txt" \
+ --text_keys 'text' 'text1' \
+ --manifest_paths="data/manifest.train.raw"
+
+
+ if [ $? -ne 0 ]; then
+ echo "Build vocabulary failed. Terminated."
+ exit 1
+ fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # format manifest with tokenids, vocab size
+ for set in train dev test; do
+ {
+ python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
+ --feat_type "raw" \
+ --cmvn_path "data/mean_std.json" \
+ --unit_type "spm" \
+ --spm_model_prefix ${bpeprefix} \
+ --vocab_path="${dict_dir}/vocab.txt" \
+ --manifest_path="data/manifest.${set}.raw" \
+ --output_path="data/manifest.${set}"
+
+ if [ $? -ne 0 ]; then
+            echo "Format manifest failed. Terminated."
+ exit 1
+ fi
+ }&
+ done
+ wait
+fi
+
+echo "Ted En-Zh Data preparation done."
+exit 0
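For reference, a hedged example of re-running only the vocabulary-building stage of this script from the example directory (flag names follow `parse_options.sh`; the data path is the default above):
```bash
# run only stage 1 (SentencePiece vocabulary) of the data pipeline
source path.sh
bash ./local/data.sh --stage 1 --stop-stage 1 --data_dir ./TED_EnZh
```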
diff --git a/examples/ted_en_zh/st1/local/download_pretrain.sh b/examples/ted_en_zh/st1/local/download_pretrain.sh
new file mode 100755
index 000000000..1ff05ae33
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/download_pretrain.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# download pytorch weight
+wget https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/snapshot.ep.98 --no-check-certificate
+
+# convert pytorch weight to paddlepaddle
+python local/convert_torch_to_paddle.py \
+--torch_ckpt snapshot.ep.98 \
+--paddle_ckpt paddle.98.pdparams
+
+# Or you can download converted weights
+# wget https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/paddle.98.pdparams --no-check-certificate
+
+if [ $? -ne 0 ]; then
+    echo "Failed in downloading and converting!"
+ exit 1
+fi
+
+exit 0
\ No newline at end of file
diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh
new file mode 100755
index 000000000..7235c6f9a
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/test.sh
@@ -0,0 +1,31 @@
+#! /usr/bin/env bash
+
+if [ $# != 2 ];then
+ echo "usage: ${0} config_path ckpt_path_prefix"
+ exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_prefix=$2
+
+for type in fullsentence; do
+ echo "decoding ${type}"
+ batch_size=32
+ python3 -u ${BIN_DIR}/test.py \
+ --nproc ${ngpu} \
+ --config ${config_path} \
+ --result_file ${ckpt_prefix}.${type}.rsl \
+ --checkpoint_path ${ckpt_prefix} \
+ --opts decoding.decoding_method ${type} \
+ --opts decoding.batch_size ${batch_size}
+
+ if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+ fi
+done
+
+exit 0
diff --git a/examples/ted_en_zh/st1/local/train_finetune.sh b/examples/ted_en_zh/st1/local/train_finetune.sh
new file mode 100755
index 000000000..367011217
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/train_finetune.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+if [ $# != 3 ];then
+ echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
+ exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_name=$2
+ckpt_path=$3
+
+mkdir -p exp
+
+# seed may break model convergence
+seed=0
+if [ ${seed} != 0 ]; then
+ export FLAGS_cudnn_deterministic=True
+fi
+
+python3 -u ${BIN_DIR}/train.py \
+--nproc ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--checkpoint_path ${ckpt_path} \
+--seed ${seed}
+
+if [ ${seed} != 0 ]; then
+ unset FLAGS_cudnn_deterministic
+fi
+
+if [ $? -ne 0 ]; then
+ echo "Failed in training!"
+ exit 1
+fi
+
+exit 0
\ No newline at end of file
diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh
new file mode 100644
index 000000000..fd537917a
--- /dev/null
+++ b/examples/ted_en_zh/st1/path.sh
@@ -0,0 +1,15 @@
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+
+MODEL=u2_st
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh
new file mode 100755
index 000000000..f8adf4f65
--- /dev/null
+++ b/examples/ted_en_zh/st1/run.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -e
+source path.sh
+
+gpus=0,1,2,3
+stage=1
+stop_stage=4
+conf_path=conf/transformer_mtl_noam.yaml
+ckpt_path=paddle.98
+avg_num=5
+data_path=./TED_EnZh # path to unzipped data
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+avg_ckpt=avg_${avg_num}
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+echo "checkpoint name ${ckpt}"
+
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ bash ./local/data.sh --data_dir ${data_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # download pretrained
+ bash ./local/download_pretrain.sh || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # train model, all `ckpt` under `exp` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train_finetune.sh ${conf_path} ${ckpt} ${ckpt_path}
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # avg n best model
+ avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # test ckpt avg_n
+ CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
\ No newline at end of file
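Since `stage` defaults to 1 above, data preparation is skipped by default. A sketch of running the full pipeline from data preparation through testing, using the default paths, might be:
```bash
# run every stage, overriding the data path if your copy lives elsewhere
bash run.sh --stage 0 --stop-stage 4 --data_path ./TED_EnZh
```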
diff --git a/examples/timit/README.md b/examples/timit/README.md
index 778398748..51fcfd57c 100644
--- a/examples/timit/README.md
+++ b/examples/timit/README.md
@@ -4,4 +4,4 @@ asr model with phone unit
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
-* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
\ No newline at end of file
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml
index 1d18468b8..89ae2fd3d 100644
--- a/examples/timit/asr1/conf/transformer.yaml
+++ b/examples/timit/asr1/conf/transformer.yaml
@@ -11,7 +11,7 @@ data:
max_output_input_ratio: 1000.0
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: "word"
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml
diff --git a/examples/timit/asr1/local/data.sh b/examples/timit/asr1/local/data.sh
index e588e48df..fb720932d 100755
--- a/examples/timit/asr1/local/data.sh
+++ b/examples/timit/asr1/local/data.sh
@@ -3,15 +3,19 @@
stage=-1
stop_stage=100
+dict_dir=data/lang_char
+
unit_type=word
TIMIT_path=
-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
+mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
+
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests
python3 ${TARGET_DIR}/timit/timit_kaldi_standard_split.py \
@@ -52,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type ${unit_type} \
--count_threshold=0 \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
@@ -68,7 +72,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml
index 58899a156..ba453aad7 100644
--- a/examples/tiny/asr0/conf/deepspeech2.yaml
+++ b/examples/tiny/asr0/conf/deepspeech2.yaml
@@ -14,7 +14,7 @@ data:
collator:
mean_std_filepath: data/mean_std.json
unit_type: char
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml
index 334b1d31c..36c774e37 100644
--- a/examples/tiny/asr0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml
@@ -14,7 +14,7 @@ data:
collator:
mean_std_filepath: data/mean_std.json
unit_type: char
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
diff --git a/examples/tiny/asr0/local/data.sh b/examples/tiny/asr0/local/data.sh
index f1fb8cb1d..2a544ef89 100755
--- a/examples/tiny/asr0/local/data.sh
+++ b/examples/tiny/asr0/local/data.sh
@@ -4,10 +4,12 @@ stage=-1
stop_stage=100
unit_type=char
+dict_dir=data/lang_char
-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
+mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@@ -51,7 +53,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type ${unit_type} \
--count_threshold=0 \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.tiny.raw"
if [ $? -ne 0 ]; then
@@ -65,7 +67,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.tiny.raw" \
--output_path="data/manifest.tiny"
diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml
index 6bed27f5c..6183a903b 100644
--- a/examples/tiny/asr1/conf/chunk_confermer.yaml
+++ b/examples/tiny/asr1/conf/chunk_confermer.yaml
@@ -12,7 +12,7 @@ data:
collator:
mean_std_filepath: ""
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml
diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml
index 7aed1b193..01d383fb8 100644
--- a/examples/tiny/asr1/conf/chunk_transformer.yaml
+++ b/examples/tiny/asr1/conf/chunk_transformer.yaml
@@ -12,7 +12,7 @@ data:
collator:
mean_std_filepath: ""
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml
diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml
index 2c09b3ae6..a3fee6901 100644
--- a/examples/tiny/asr1/conf/conformer.yaml
+++ b/examples/tiny/asr1/conf/conformer.yaml
@@ -12,7 +12,7 @@ data:
collator:
mean_std_filepath: ""
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml
diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml
index 1378e848d..5a87d6d24 100644
--- a/examples/tiny/asr1/conf/transformer.yaml
+++ b/examples/tiny/asr1/conf/transformer.yaml
@@ -12,7 +12,7 @@ data:
collator:
mean_std_filepath: data/mean_std.json
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml
diff --git a/examples/tiny/asr1/local/data.sh b/examples/tiny/asr1/local/data.sh
index 87539d5ed..1ef9f7768 100755
--- a/examples/tiny/asr1/local/data.sh
+++ b/examples/tiny/asr1/local/data.sh
@@ -3,14 +3,17 @@
stage=-1
stop_stage=100
+dict_dir=data/lang_char
+
# bpemode (unigram or bpe)
nbpe=200
bpemode=unigram
-bpeprefix="data/bpe_${bpemode}_${nbpe}"
+bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
+mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
@@ -57,7 +60,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths="data/manifest.tiny.raw"
if [ $? -ne 0 ]; then
@@ -72,7 +75,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
- --vocab_path="data/vocab.txt" \
+ --vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.tiny.raw" \
--output_path="data/manifest.tiny"
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index 894d6b147..aab005735 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -5,7 +5,7 @@ This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2
### Download and Extract the dataset
Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443).
-### Get MFA result of VCTK and Extract it
+### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download it from [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)):
@@ -25,7 +25,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -52,7 +56,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
-### Train the model
+### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
@@ -87,7 +91,7 @@ optional arguments:
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--phones-dict` is the path of the phone vocabulary file.
-### Synthesize
+### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)and unzip it.
diff --git a/examples/vctk/tts3/run.sh b/examples/vctk/tts3/run.sh
index 0562ef3f4..a2b849bc8 100755
--- a/examples/vctk/tts3/run.sh
+++ b/examples/vctk/tts3/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_331.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md
index 8692f0104..154fd7cde 100644
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
@@ -2,11 +2,11 @@
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [VCTK](https://datashare.ed.ac.uk/handle/10283/3443).
## Dataset
-### Download and Extract the datasaet
+### Download and Extract
Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`.
-### Get MFA results for silence trim
-We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio.
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edges of the audio.
You can download it from [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)):
1. `p315`, because no txt for it.
@@ -24,7 +24,11 @@ Run the command below to
```bash
./run.sh
```
-### Preprocess the dataset
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
@@ -48,7 +52,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and the path to the spectrogram of each utterance.
-### Train the model
+### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
@@ -95,7 +99,7 @@ benchmark:
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
-### Synthesize
+### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
@@ -126,7 +130,7 @@ optional arguments:
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
-## Pretrained Models
+## Pretrained Model
Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip).
Parallel WaveGAN checkpoint contains files listed below.
diff --git a/examples/vctk/voc1/run.sh b/examples/vctk/voc1/run.sh
index 7d0fdb21e..4f426ea02 100755
--- a/examples/vctk/voc1/run.sh
+++ b/examples/vctk/voc1/run.sh
@@ -11,7 +11,7 @@ conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
-# with the following command, you can choice the stage range you want to run
+# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md
index 0cb0f354c..cbd01eb8c 100644
--- a/examples/wenetspeech/README.md
+++ b/examples/wenetspeech/README.md
@@ -55,4 +55,4 @@ As shown in the following table, we provide 3 training subsets, namely `S`, `M`
|-----------------|-------|--------------|-----------------------------------------------------------------------------------------|
| DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training |
| TEST\_NET | 23 | Internet | Match test |
-| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset |
\ No newline at end of file
+| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset |
diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md
index 5aff041f8..5c2b8143c 100644
--- a/examples/wenetspeech/asr1/RESULTS.md
+++ b/examples/wenetspeech/asr1/RESULTS.md
@@ -21,4 +21,4 @@ Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wen
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 |
-| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
\ No newline at end of file
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml
index 0340dc85d..a3a42ec63 100644
--- a/examples/wenetspeech/asr1/conf/conformer.yaml
+++ b/examples/wenetspeech/asr1/conf/conformer.yaml
@@ -51,7 +51,7 @@ data:
max_output_input_ratio: 10.0
collator:
- vocab_filepath: data/vocab.txt
+ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py
index 4de0b7d45..0e1b27278 100644
--- a/examples/wenetspeech/asr1/local/extract_meta.py
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@@ -1,6 +1,18 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
-
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -12,11 +24,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-import sys
-import os
import argparse
import json
+import os
+import sys
def get_args():
@@ -85,13 +96,13 @@ def meta_analysis(input_json, output_dir):
else:
utt2text.write(f'{sid}\t{text}\n')
segments.write(
- f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
- )
+ f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
utt2dur.write(f'{sid}\t{dur}\n')
segment_sub_names = " ".join(segment_subsets)
utt2subsets.write(
f'{sid}\t{segment_sub_names}\n')
+
def main():
args = get_args()
@@ -99,4 +110,4 @@ def main():
if __name__ == '__main__':
- main()
\ No newline at end of file
+ main()
diff --git a/examples/wenetspeech/asr1/local/process_opus.py b/examples/wenetspeech/asr1/local/process_opus.py
index 603e0082c..f1b9287ed 100644
--- a/examples/wenetspeech/asr1/local/process_opus.py
+++ b/examples/wenetspeech/asr1/local/process_opus.py
@@ -1,5 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
-
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -11,14 +23,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
# process_opus.py: segmentation and downsampling of opus audio
-
# usage: python3 process_opus.py wav.scp segments output_wav.scp
+import os
+import sys
from pydub import AudioSegment
-import sys
-import os
def read_file(wav_scp, segments):
@@ -86,4 +96,4 @@ def main():
if __name__ == '__main__':
- main()
\ No newline at end of file
+ main()
diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py
index 177d710b0..e827414d3 100644
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@@ -409,7 +409,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
@paddle.no_grad()
def test(self):
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
from paddlespeech.s2t.utils.log import Autolog
self.autolog = Autolog(
batch_size=self.config.decoding.batch_size,
@@ -438,7 +438,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
msg += "Final error rate [%s] (%d/%d) = %f" % (
error_rate_type, num_ins, num_ins, errors_sum / len_refs)
logger.info(msg)
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
self.autolog.report()
def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
@@ -512,7 +512,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
x_len_list = np.split(x_len_batch, batch_size, axis=0)
for x, x_len in zip(x_list, x_len_list):
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
self.autolog.times.start()
x_len = x_len[0]
assert (chunk_size <= x_len)
@@ -547,7 +547,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
probs_chunk_list = []
probs_chunk_lens_list = []
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
# record the model preprocessing time
self.autolog.times.stamp()
@@ -606,7 +606,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
[output_probs, output_probs_padding], axis=1)
output_probs_list.append(output_probs)
output_lens_list.append(output_lens)
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
# record the model inference time
self.autolog.times.stamp()
# record the post processing time
@@ -641,12 +641,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
audio_len_handle.reshape(x_len.shape)
audio_len_handle.copy_from_cpu(x_len)
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
self.autolog.times.start()
# record the prefix processing time
self.autolog.times.stamp()
self.predictor.run()
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
# record the model inference time
self.autolog.times.stamp()
# record the post processing time
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 9f5448ccf..27bc47d2b 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -24,15 +24,10 @@ import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist
-from paddle.io import DataLoader
from yacs.config import CfgNode
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
-from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataloader import BatchDataLoader
-from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.io.sampler import SortagradBatchSampler
-from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
from paddlespeech.s2t.models.u2 import U2Model
from paddlespeech.s2t.training.optimizer import OptimizerFactory
from paddlespeech.s2t.training.reporter import ObsScope
@@ -215,7 +210,7 @@ class U2Trainer(Trainer):
msg += f"{v:>.8f}" if isinstance(v,
float) else f"{v}"
msg += f" {k.split(',')[1]}" if len(
- k.split(',')) == 2 else f""
+ k.split(',')) == 2 else ""
msg += ","
msg = msg[:-1] # remove the last ","
if (batch_index + 1
diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
index 21f512e9b..812be6e41 100644
--- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
@@ -56,6 +56,8 @@ class TextFeaturizer():
self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file(
vocab_filepath, maskctc)
self.vocab_size = len(self.vocab_list)
+ else:
+            logger.warning("TextFeaturizer: no vocab file is provided.")
if unit_type == 'spm':
spm_model = spm_model_prefix + '.model'
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index da91ef921..f35adef0c 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -341,7 +341,7 @@ class LogMelSpectrogramKaldi():
self.eps = eps
self.remove_dc_offset = True
self.preemph = 0.97
- self.dither = dither
+        self.dither = dither  # dither is only applied in train mode
def __repr__(self):
return (
@@ -361,11 +361,12 @@ class LogMelSpectrogramKaldi():
eps=self.eps,
dither=self.dither, ))
- def __call__(self, x):
+ def __call__(self, x, train):
"""
Args:
x (np.ndarray): shape (Ti,)
+            train (bool): whether in train mode; dither is only applied in train mode.
Raises:
ValueError: not support (Ti, C)
@@ -373,6 +374,7 @@ class LogMelSpectrogramKaldi():
Returns:
np.ndarray: (T, D)
"""
+ dither = self.dither if train else False
if x.ndim != 1:
raise ValueError("Not support x: [Time, Channel]")
@@ -391,7 +393,7 @@ class LogMelSpectrogramKaldi():
nfft=self.n_fft,
lowfreq=self.fmin,
highfreq=self.fmax,
- dither=self.dither,
+ dither=dither,
remove_dc_offset=self.remove_dc_offset,
preemph=self.preemph,
wintype=self.window)