From 0c3c21830593d6dbc13b406f51186fff9a8830d4 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 29 Oct 2021 09:27:35 +0000 Subject: [PATCH 1/2] fix demos --- docs/source/_static/custom.css | 5 + docs/source/conf.py | 3 + docs/source/tts/demo.rst | 490 +++++++++++++++++++++++++-------- 3 files changed, 387 insertions(+), 111 deletions(-) create mode 100644 docs/source/_static/custom.css diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css new file mode 100644 index 00000000..2987ae04 --- /dev/null +++ b/docs/source/_static/custom.css @@ -0,0 +1,5 @@ +.wy-nav-content { + max-width: 80%; +} +.table table{ background:#b9b9b9} +.table table td{ background:#FFF} diff --git a/docs/source/conf.py b/docs/source/conf.py index c41884ef..f2f75ce3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -79,6 +79,9 @@ smartquotes = False # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] html_logo = '../images/paddle.png' +html_css_files = [ + 'custom.css', +] # -- Extension configuration ------------------------------------------------- # numpydoc_show_class_members = False diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst index 948fc056..20208d2c 100644 --- a/docs/source/tts/demo.rst +++ b/docs/source/tts/demo.rst @@ -27,74 +27,106 @@ Analysis/synthesis Audio samples generated from ground-truth spectrograms with a vocoder. .. raw:: html - + LJSpeech(English)

- + +
+
- - + + + + + + + + + + + + + - + + + + + + + + + +
GT WaveFlow Text GT WaveFlow
Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition + + + +
in being comparatively modern. - + + +
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process +
produced the block books, which were the immediate predecessors of the true printed book + +
the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing. +
- +
+
+ + +Multi-Speaker TTS +------------------- + +PaddleSpeech also supports Multi-Speaker TTS. We provide audio demos generated by FastSpeech2 + ParallelWaveGAN, using the AISHELL-3 Multi-Speaker TTS dataset. + +.. raw:: html + +
+ + + + + + +
Text Origin Generated
+
+
+
+ + +Duration control in FastSpeech2 +-------------------------------------- +In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``; we provide the audio demos of duration control here. ``duration`` means the duration of phonemes: when we reduce ``duration``, the speed of audios will increase, and when we increase ``duration``, the speed of audios will decrease. + +The ``duration`` of different phonemes in a sentence can have different scale ratios (when you want to slow down one word and keep the other words' speed in a sentence). Here we use a fixed scale ratio for different phonemes to control the ``speed`` of audios. + +The duration control in FastSpeech2 can change the speed of audios while keeping the pitch. (In some speech tools, increasing the speed will also increase the pitch, and vice versa.) + +.. raw:: html + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Speed(0.8x) Speed(1x) Speed(1.2x)
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+
+
+
+ Chinese TTS with/without text frontend -------------------------------------- @@ -650,9 +905,9 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
- - - + + + @@ -846,6 +1101,8 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
Text With Text Frontend Without Text Frontend Text With Text Frontend Without Text Frontend
他只是一个纸老虎。
- + +
+
\ No newline at end of file diff --git a/docs/source/tts/demo_2.rst b/docs/source/tts/demo_2.rst index 37922fcb..2f0ca7cd 100644 --- a/docs/source/tts/demo_2.rst +++ b/docs/source/tts/demo_2.rst @@ -5,3 +5,283 @@ This is an audio demo page to contrast PaddleSpeech TTS and Espnet TTS, We use t We use Espnet's released models here. FastSpeech2 + Parallel WaveGAN in CSMSC + +.. raw:: html + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Text Espent TTS PaddleSpeech TTS
早上好,今天是2020/10/29,最低温度是-3°C。 + + + +
你好,我的编号是37249,很高兴为您服务。 + + + +
我们公司有37249个人。 + + + +
我出生于2005年10月8日。 + + + +
我们习惯在12:30吃中午饭。 + + + +
只要有超过3/4的人投票同意,你就会成为我们的新班长。 + + + +
我要买一只价值999.9元的手表。 + + + +
我的手机号是18544139121,欢迎来电。 + + + +
明天有62%的概率降雨。 + + + +
手表厂有五种好产品。 + + + +
跑马场有五百匹很勇敢的千里马。 + + + +
有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。 + + + +
史小姐拿着小雨伞去找她的老保姆了。 + + + +
不要相信这个老奶奶说的话,她一点儿也不好。 + + + +
+
+ diff --git a/docs/source/tts/test_sentence.txt b/docs/source/tts/test_sentence.txt new file mode 100644 index 00000000..933f4749 --- /dev/null +++ b/docs/source/tts/test_sentence.txt @@ -0,0 +1,14 @@ +001 早上好,今天是2020/10/29,最低温度是-3°C。 +002 你好,我的编号是37249,很高兴为您服务。 +003 我们公司有37249个人。 +004 我出生于2005年10月8日。 +005 我们习惯在12:30吃中午饭。 +006 只要有超过3/4的人投票同意,你就会成为我们的新班长。 +007 我要买一只价值999.9元的手表。 +008 我的手机号是18544139121,欢迎来电。 +009 明天有62%的概率降雨。 +010 手表厂有五种好产品。 +011 跑马场有五百匹很勇敢的千里马。 +012 有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。 +013 史小姐拿着小雨伞去找她的老保姆了。 +014 不要相信这个老奶奶说的话,她一点儿也不好。 \ No newline at end of file diff --git a/parakeet/models/fastspeech2/fastspeech2.py b/parakeet/models/fastspeech2/fastspeech2.py index 7c0e20bc..bde3a82b 100644 --- a/parakeet/models/fastspeech2/fastspeech2.py +++ b/parakeet/models/fastspeech2/fastspeech2.py @@ -420,9 +420,18 @@ class FastSpeech2(nn.Layer): if is_inference: # (B, Tmax) - d_outs = self.duration_predictor.inference(hs, d_masks) + if ds is not None: + d_outs = ds + else: + d_outs = self.duration_predictor.inference(hs, d_masks) + if ps is not None: + p_outs = ps + if es is not None: + e_outs = es + # use prediction in inference # (B, Tmax, 1) + p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( (0, 2, 1)) e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( @@ -513,7 +522,7 @@ class FastSpeech2(nn.Layer): x = paddle.cast(text, 'int64') y = speech spemb = spembs - if durations: + if durations is not None: d = paddle.cast(durations, 'int64') p, e = pitch, energy # setup batch axis @@ -531,9 +540,12 @@ class FastSpeech2(nn.Layer): if use_teacher_forcing: # use groundtruth of duration, pitch, and energy - ds, ps, es = d.unsqueeze(0), p.unsqueeze(0), e.unsqueeze(0) + ds = d.unsqueeze(0) if d is not None else None + ps = p.unsqueeze(0) if p is not None else None + es = e.unsqueeze(0) if e is not None else None + # ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0) # (1, L, odim) - _, outs, *_ = self._forward( + _, outs, d_outs, *_ = 
self._forward( xs, ilens, ys, @@ -542,10 +554,11 @@ class FastSpeech2(nn.Layer): es=es, spembs=spembs, spk_id=spk_id, - tone_id=tone_id) + tone_id=tone_id, + is_inference=True) else: # (1, L, odim) - _, outs, *_ = self._forward( + _, outs, d_outs, *_ = self._forward( xs, ilens, ys,