From 9ccef7fa04d615df374390140fcfeaf700f36494 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 29 Oct 2021 12:17:10 +0000 Subject: [PATCH] add paddle tts vs espnet tts demos --- docs/source/_static/custom.css | 2 +- docs/source/tts/demo.rst | 269 +++++++++++++++++++- docs/source/tts/demo_2.rst | 280 +++++++++++++++++++++ docs/source/tts/test_sentence.txt | 14 ++ parakeet/models/fastspeech2/fastspeech2.py | 25 +- 5 files changed, 577 insertions(+), 13 deletions(-) create mode 100644 docs/source/tts/test_sentence.txt diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css index 2987ae04..bb65c51a 100644 --- a/docs/source/_static/custom.css +++ b/docs/source/_static/custom.css @@ -2,4 +2,4 @@ max-width: 80%; } .table table{ background:#b9b9b9} -.table table td{ background:#FFF} +.table table td{ background:#FFF; } diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst index 20208d2c..09c4d25a 100644 --- a/docs/source/tts/demo.rst +++ b/docs/source/tts/demo.rst @@ -248,7 +248,8 @@ Audio samples generated from ground-truth spectrograms with a vocoder. - +
+
TTS ------------------- @@ -633,10 +634,264 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog - +
+
+ + +Multi-Speaker TTS +------------------- + +PaddleSpeech also supports Multi-Speaker TTS. We provide the audio demos generated by FastSpeech2 + ParallelWaveGAN, using the AISHELL-3 Multi-Speaker TTS dataset. + +.. raw:: html + +
+ + + + + + +
Text Origin Generated
+
+
+
+ + +Duration control in FastSpeech2 +-------------------------------------- +In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``; we provide the audio demos of duration control here. ``duration`` means the duration of phonemes: when we reduce ``duration``, the speed of audios will increase, and when we increase ``duration``, the speed of audios will reduce. + +The ``duration`` of different phonemes in a sentence can have different scale ratios (when you want to slow down one word and keep the other words' speed in a sentence). Here we use a fixed scale ratio for different phonemes to control the ``speed`` of audios. + +The duration control in FastSpeech2 can control the speed of audios while keeping the pitch. (In some speech tools, increasing the speed will increase the pitch, and vice versa.) + +.. raw:: html + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Speed(0.8x) Speed(1x) Speed(1.2x)
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+
+
+
+ Chinese TTS with/without text frontend -------------------------------------- @@ -650,9 +905,9 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
- - - + + + @@ -846,6 +1101,8 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
Text With Text Frontend Without Text Frontend Text With Text Frontend Without Text Frontend
他只是一个纸老虎。
- + +
+
\ No newline at end of file diff --git a/docs/source/tts/demo_2.rst b/docs/source/tts/demo_2.rst index 37922fcb..2f0ca7cd 100644 --- a/docs/source/tts/demo_2.rst +++ b/docs/source/tts/demo_2.rst @@ -5,3 +5,283 @@ This is an audio demo page to contrast PaddleSpeech TTS and Espnet TTS, We use t We use Espnet's released models here. FastSpeech2 + Parallel WaveGAN in CSMSC + +.. raw:: html + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Text Espnet TTS PaddleSpeech TTS
早上好,今天是2020/10/29,最低温度是-3°C。 + + + +
你好,我的编号是37249,很高兴为您服务。 + + + +
我们公司有37249个人。 + + + +
我出生于2005年10月8日。 + + + +
我们习惯在12:30吃中午饭。 + + + +
只要有超过3/4的人投票同意,你就会成为我们的新班长。 + + + +
我要买一只价值999.9元的手表。 + + + +
我的手机号是18544139121,欢迎来电。 + + + +
明天有62%的概率降雨。 + + + +
手表厂有五种好产品。 + + + +
跑马场有五百匹很勇敢的千里马。 + + + +
有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。 + + + +
史小姐拿着小雨伞去找她的老保姆了。 + + + +
不要相信这个老奶奶说的话,她一点儿也不好。 + + + +
+
+ diff --git a/docs/source/tts/test_sentence.txt b/docs/source/tts/test_sentence.txt new file mode 100644 index 00000000..933f4749 --- /dev/null +++ b/docs/source/tts/test_sentence.txt @@ -0,0 +1,14 @@ +001 早上好,今天是2020/10/29,最低温度是-3°C。 +002 你好,我的编号是37249,很高兴为您服务。 +003 我们公司有37249个人。 +004 我出生于2005年10月8日。 +005 我们习惯在12:30吃中午饭。 +006 只要有超过3/4的人投票同意,你就会成为我们的新班长。 +007 我要买一只价值999.9元的手表。 +008 我的手机号是18544139121,欢迎来电。 +009 明天有62%的概率降雨。 +010 手表厂有五种好产品。 +011 跑马场有五百匹很勇敢的千里马。 +012 有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。 +013 史小姐拿着小雨伞去找她的老保姆了。 +014 不要相信这个老奶奶说的话,她一点儿也不好。 \ No newline at end of file diff --git a/parakeet/models/fastspeech2/fastspeech2.py b/parakeet/models/fastspeech2/fastspeech2.py index 7c0e20bc..bde3a82b 100644 --- a/parakeet/models/fastspeech2/fastspeech2.py +++ b/parakeet/models/fastspeech2/fastspeech2.py @@ -420,9 +420,18 @@ class FastSpeech2(nn.Layer): if is_inference: # (B, Tmax) - d_outs = self.duration_predictor.inference(hs, d_masks) + if ds is not None: + d_outs = ds + else: + d_outs = self.duration_predictor.inference(hs, d_masks) + if ps is not None: + p_outs = ps + if es is not None: + e_outs = es + # use prediction in inference # (B, Tmax, 1) + p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( (0, 2, 1)) e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( @@ -513,7 +522,7 @@ class FastSpeech2(nn.Layer): x = paddle.cast(text, 'int64') y = speech spemb = spembs - if durations: + if durations is not None: d = paddle.cast(durations, 'int64') p, e = pitch, energy # setup batch axis @@ -531,9 +540,12 @@ class FastSpeech2(nn.Layer): if use_teacher_forcing: # use groundtruth of duration, pitch, and energy - ds, ps, es = d.unsqueeze(0), p.unsqueeze(0), e.unsqueeze(0) + ds = d.unsqueeze(0) if d is not None else None + ps = p.unsqueeze(0) if p is not None else None + es = e.unsqueeze(0) if e is not None else None + # ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0) # (1, L, odim) - _, outs, *_ = self._forward( + _, outs, d_outs, *_ = 
self._forward( xs, ilens, ys, @@ -542,10 +554,11 @@ class FastSpeech2(nn.Layer): es=es, spembs=spembs, spk_id=spk_id, - tone_id=tone_id) + tone_id=tone_id, + is_inference=True) else: # (1, L, odim) - _, outs, *_ = self._forward( + _, outs, d_outs, *_ = self._forward( xs, ilens, ys,