diff --git a/README.md b/README.md index f17cec13..59c61f77 100644 --- a/README.md +++ b/README.md @@ -888,7 +888,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P

## Acknowledgement -- Many thanks to [HighCWu](https://github.com/HighCWu)for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. +- Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. - Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. 
diff --git a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh index b33e8ca0..77b353b5 100755 --- a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh +++ b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh @@ -13,9 +13,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=synthesize \ - --wav_path=source/SSB03540307.wav\ - --old_str='请播放歌曲小苹果。' \ - --new_str='歌曲真好听。' \ + --wav_path=source/SSB03540307.wav \ + --old_str='请播放歌曲小苹果' \ + --new_str='歌曲真好听' \ --source_lang=zh \ --target_lang=zh \ --erniesat_config=${config_path} \ diff --git a/examples/aishell3_vctk/ernie_sat/README.md b/examples/aishell3_vctk/ernie_sat/README.md index 777bea32..a849488d 100644 --- a/examples/aishell3_vctk/ernie_sat/README.md +++ b/examples/aishell3_vctk/ernie_sat/README.md @@ -29,9 +29,11 @@ Or train your MFA model reference to [mfa example](https://github.com/PaddlePadd Assume the paths to the datasets are: - `~/datasets/data_aishell3` - `~/datasets/VCTK-Corpus-0.92` + Assume the path to the MFA results of the datasets are: - `./aishell3_alignment_tone` - `./vctk_alignment` + Run the command below to 1. **source path**. 2. preprocess the dataset. diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh index c30af6e8..446ac879 100755 --- a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh @@ -15,7 +15,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=synthesize \ --wav_path=source/p243_313.wav \ - --old_str='For that reason cover should not be given.' 
\ + --old_str='For that reason cover should not be given' \ --new_str='今天天气很好' \ --source_lang=en \ --target_lang=zh \ @@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=synthesize \ --wav_path=source/SSB03540307.wav \ - --old_str='请播放歌曲小苹果。' \ - --new_str="Thank you!" \ + --old_str='请播放歌曲小苹果' \ + --new_str="Thank you" \ --source_lang=zh \ --target_lang=en \ --erniesat_config=${config_path} \ diff --git a/examples/vctk/ernie_sat/local/synthesize_e2e.sh b/examples/vctk/ernie_sat/local/synthesize_e2e.sh index fee54016..dcc71044 100755 --- a/examples/vctk/ernie_sat/local/synthesize_e2e.sh +++ b/examples/vctk/ernie_sat/local/synthesize_e2e.sh @@ -14,7 +14,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=synthesize \ --wav_path=source/p243_313.wav \ - --old_str='For that reason cover should not be given.' \ + --old_str='For that reason cover should not be given' \ --new_str='I love you very much do you love me' \ --source_lang=en \ --target_lang=en \ @@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=edit \ --wav_path=source/p243_313.wav \ - --old_str='For that reason cover should not be given.' \ - --new_str='For that reason cover is not impossible to be given.' 
\ + --old_str='For that reason cover should not be given' \ + --new_str='For that reason cover is not impossible to be given' \ --source_lang=en \ --target_lang=en \ --erniesat_config=${config_path} \ diff --git a/examples/voxceleb/sv0/README.md b/examples/voxceleb/sv0/README.md index 26c95aca..7fe759eb 100644 --- a/examples/voxceleb/sv0/README.md +++ b/examples/voxceleb/sv0/README.md @@ -148,4 +148,4 @@ source path.sh CUDA_VISIBLE_DEVICES= bash ./local/test.sh ./data sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_1/model/ conf/ecapa_tdnn.yaml ``` -The performance of the released models are shown in [this](./RESULTS.md) +The performance of the released models are shown in [this](./RESULT.md) diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index cc209db7..af84a5f6 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -34,3 +34,15 @@ Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wen | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 | | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | + + +## Conformer Streaming Pretrained Model + +Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | 16 | 0.056273 | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 | +| conformer | 32.52 M | 
conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 | diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 813e1e52..8a984949 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 92990048..2d236743 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -86,7 +86,7 @@ class MultiHeadedAttention(nn.Layer): self, value: paddle.Tensor, scores: paddle.Tensor, - mask: paddle.Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) + mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool) ) -> paddle.Tensor: """Compute attention context vector. 
Args: @@ -127,15 +127,14 @@ class MultiHeadedAttention(nn.Layer): return self.linear_out(x) # (batch, time1, d_model) - def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) - pos_emb: paddle.Tensor, # paddle.empty([0]) - cache: paddle.Tensor # paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + pos_emb: paddle.Tensor=paddle.empty([0]), + cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute scaled dot product attention. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). @@ -244,15 +243,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): return x - def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) - pos_emb: paddle.Tensor, # paddle.empty([0]) - cache: paddle.Tensor # paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + pos_emb: paddle.Tensor=paddle.empty([0]), + cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). 
diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index b35fea5b..be605654 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -108,8 +108,8 @@ class ConvolutionModule(nn.Layer): def forward( self, x: paddle.Tensor, - mask_pad: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) - cache: paddle.Tensor # paddle.zeros([0,0,0,0]) + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute convolution module. Args: diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index c8843b72..37b124e8 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -121,16 +121,11 @@ class DecoderLayer(nn.Layer): if self.concat_after: tgt_concat = paddle.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, - paddle.empty([0]), - paddle.zeros([0, 0, 0, 0]))[0]), - dim=-1) + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) x = residual + self.concat_linear1(tgt_concat) else: x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, - paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[ - 0]) + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) if not self.normalize_before: x = self.norm1(x) @@ -139,15 +134,11 @@ class DecoderLayer(nn.Layer): x = self.norm2(x) if self.concat_after: x_concat = paddle.cat( - (x, self.src_attn(x, memory, memory, memory_mask, - paddle.empty([0]), - paddle.zeros([0, 0, 0, 0]))[0]), - dim=-1) + (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) x = residual + self.concat_linear2(x_concat) else: x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask, - paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[0]) + self.src_attn(x, memory, memory, memory_mask)[0]) if not 
self.normalize_before: x = self.norm2(x) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index cf4e32fa..2f4ad1b2 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -175,9 +175,7 @@ class BaseEncoder(nn.Layer): decoding_chunk_size, self.static_chunk_size, num_decoding_left_chunks) for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad, - paddle.zeros([0, 0, 0, 0]), - paddle.zeros([0, 0, 0, 0])) + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) if self.normalize_before: xs = self.after_norm(xs) # Here we assume the mask is not changed in encoder layers, so just @@ -190,9 +188,9 @@ class BaseEncoder(nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]), - att_mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + att_mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool) ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Forward just one chunk Args: @@ -255,7 +253,6 @@ class BaseEncoder(nn.Layer): xs, att_mask, pos_emb, - mask_pad=paddle.ones([0, 0, 0], dtype=paddle.bool), att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, cnn_cache=cnn_cache[i:i + 1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, ) @@ -328,8 +325,7 @@ class BaseEncoder(nn.Layer): chunk_xs = xs[:, cur:end, :] (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache, - paddle.ones([0, 0, 0], dtype=paddle.bool)) + chunk_xs, offset, required_cache_size, att_cache, cnn_cache) outputs.append(y) offset += y.shape[1] diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 4555b535..dac62bce 100644 --- 
a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -76,10 +76,9 @@ class TransformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle. - Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) - att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: @@ -106,8 +105,7 @@ class TransformerEncoderLayer(nn.Layer): if self.normalize_before: x = self.norm1(x) - x_att, new_att_cache = self.self_attn( - x, x, x, mask, paddle.empty([0]), cache=att_cache) + x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache) if self.concat_after: x_concat = paddle.concat((x, x_att), axis=-1) @@ -195,9 +193,9 @@ class ConformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor, #paddle.ones([0, 0, 0],dtype=paddle.bool) - att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. 
Args: diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index a7eb9892..4a69d78a 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -19,6 +19,10 @@ from pathlib import Path import paddle from paddle import distributed as dist +world_size = dist.get_world_size() +if world_size > 1: + dist.init_parallel_env() + from visualdl import LogWriter from paddlespeech.s2t.training.reporter import ObsScope @@ -122,9 +126,6 @@ class Trainer(): else: raise Exception("invalid device") - if self.parallel: - self.init_parallel() - self.checkpoint = Checkpoint( kbest_n=self.config.checkpoint.kbest_n, latest_n=self.config.checkpoint.latest_n) @@ -173,11 +174,6 @@ class Trainer(): """ return self.args.ngpu > 1 - def init_parallel(self): - """Init environment for multiprocess training. - """ - dist.init_parallel_env() - @mp_tools.rank_zero_only def save(self, tag=None, infos: dict=None): """Save checkpoint (model parameters and optimizer states). 
diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 87d88ee6..5782d703 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -480,8 +480,7 @@ class PaddleASRConnectionHanddler: self.offset, required_cache_size, att_cache=self.att_cache, - cnn_cache=self.cnn_cache, - att_mask=paddle.ones([0, 0, 0], dtype=paddle.bool)) + cnn_cache=self.cnn_cache) outputs.append(y) # update the global offset, in decoding frame unit diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py index 464f51a3..8dbe685f 100755 --- a/paddlespeech/t2s/exps/ernie_sat/align.py +++ b/paddlespeech/t2s/exps/ernie_sat/align.py @@ -58,7 +58,7 @@ def _readtg(tg_path: str, lang: str='en', fs: int=24000, n_shift: int=300): durations[-2] += durations[-1] durations = durations[:-1] - # replace ' and 'sil' with 'sp' + # replace '' and 'sil' with 'sp' phones = ['sp' if (phn == '' or phn == 'sil') else phn for phn in phones] if lang == 'en': @@ -195,7 +195,7 @@ def words2phns(text: str, lang='en'): wrd = wrd.upper() if (wrd not in ds): wrd2phns[str(index) + '_' + wrd] = 'spn' - phns.extend('spn') + phns.extend(['spn']) else: wrd2phns[str(index) + '_' + wrd] = word2phns_dict[wrd].split() phns.extend(word2phns_dict[wrd].split()) diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py index 21c9ae04..e450aa1a 100644 --- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py @@ -137,9 +137,6 @@ def prep_feats_with_dur(wav_path: str, new_wav = np.concatenate( [wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]]) - # 音频是正常遮住了 - sf.write(str("mask_wav.wav"), new_wav, samplerate=fs) - # 4. 
get old and new mel span to be mask old_span_bdy = get_span_bdy( mfa_start=mfa_start, mfa_end=mfa_end, span_to_repl=span_to_repl) @@ -274,7 +271,8 @@ def get_wav(wav_path: str, new_str: str='', duration_adjust: bool=True, fs: int=24000, - n_shift: int=300): + n_shift: int=300, + task_name: str='synthesize'): outs = get_mlm_output( wav_path=wav_path, @@ -298,9 +296,11 @@ def get_wav(wav_path: str, alt_wav = np.squeeze(alt_wav) old_time_bdy = [n_shift * x for x in old_span_bdy] - wav_replaced = np.concatenate( - [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]]) - + if task_name == 'edit': + wav_replaced = np.concatenate( + [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]]) + else: + wav_replaced = alt_wav wav_dict = {"origin": wav_org, "output": wav_replaced} return wav_dict @@ -356,7 +356,11 @@ def parse_args(): "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") # ernie sat related - parser.add_argument("--task_name", type=str, help="task name") + parser.add_argument( + "--task_name", + type=str, + choices=['edit', 'synthesize'], + help="task name.") parser.add_argument("--wav_path", type=str, help="path of old wav") parser.add_argument("--old_str", type=str, help="old string") parser.add_argument("--new_str", type=str, help="new string") @@ -410,10 +414,9 @@ if __name__ == '__main__': if args.task_name == 'edit': new_str = new_str elif args.task_name == 'synthesize': - new_str = old_str + new_str + new_str = old_str + ' ' + new_str else: - new_str = old_str + new_str - print("new_str:", new_str) + new_str = old_str + ' ' + new_str # Extractor mel_extractor = LogMelFBank( @@ -467,7 +470,8 @@ if __name__ == '__main__': new_str=new_str, duration_adjust=args.duration_adjust, fs=erniesat_config.fs, - n_shift=erniesat_config.n_shift) + n_shift=erniesat_config.n_shift, + task_name=args.task_name) sf.write( args.output_name, wav_dict['output'], samplerate=erniesat_config.fs) diff --git a/tests/test_tipc/prepare.sh 
b/tests/test_tipc/prepare.sh old mode 100644 new mode 100755 index 2a227281..cb05a1d0 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -15,6 +15,7 @@ dataline=$(cat ${FILENAME}) # parser params IFS=$'\n' lines=(${dataline}) +python=python # The training params model_name=$(func_parser_value "${lines[1]}") @@ -68,7 +69,7 @@ if [[ ${MODE} = "benchmark_train" ]];then if [[ ${model_name} == "pwgan" ]]; then # 下载 csmsc 数据集并解压缩 - wget -nc https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar + wget -nc https://paddle-wheel.bj.bcebos.com/benchmark/BZNSYP.rar mkdir -p BZNSYP unrar x BZNSYP.rar BZNSYP wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt @@ -80,6 +81,10 @@ if [[ ${MODE} = "benchmark_train" ]];then python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy fi + echo "barrier start" + PYTHON="${python}" bash test_tipc/barrier.sh + echo "barrier end" + if [[ ${model_name} == "mdtc" ]]; then # 下载 Snips 数据集并解压缩 wget https://paddlespeech.bj.bcebos.com/datasets/hey_snips_kws_4.0.tar.gz.1