diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css
index 2987ae048..bb65c51a9 100644
--- a/docs/source/_static/custom.css
+++ b/docs/source/_static/custom.css
@@ -2,4 +2,4 @@
max-width: 80%;
}
.table table{ background:#b9b9b9}
-.table table td{ background:#FFF}
+.table table td{ background:#FFF; }
diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst
index 20208d2ca..09c4d25ad 100644
--- a/docs/source/tts/demo.rst
+++ b/docs/source/tts/demo.rst
@@ -248,7 +248,8 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
-
+
+
TTS
-------------------
@@ -633,10 +634,264 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
-
+
+
+
+
+Multi-Speaker TTS
+-------------------
+
+PaddleSpeech also supports Multi-Speaker TTS. We provide audio demos generated by FastSpeech2 + ParallelWaveGAN, using the AISHELL-3 Multi-Speaker TTS dataset.
+
+.. raw:: html
+
+
+
+
+
Text
+
Origin
+
Generated
+
+
+
+
+
+
+
+Duration control in FastSpeech2
+--------------------------------------
+In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``; we provide audio demos of duration control here. ``duration`` means the duration of phonemes: when we reduce ``duration``, the speed of the audio increases, and when we increase ``duration``, the speed of the audio decreases.
+
+The ``duration`` of different phonemes in a sentence can have different scale ratios (when you want to slow down one word and keep the other words' speed in a sentence). Here we use a fixed scale ratio for different phonemes to control the ``speed`` of audios.
+
+Duration control in FastSpeech2 changes the speed of the audio while keeping the pitch unchanged. (In some speech tools, increasing the speed also raises the pitch, and vice versa.)
+
+.. raw:: html
+
+
+
+
+
Speed(0.8x)
+
Speed(1x)
+
Speed(1.2x)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Chinese TTS with/without text frontend
--------------------------------------
@@ -650,9 +905,9 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
-
Text
-
With Text Frontend
-
Without Text Frontend
+
Text
+
With Text Frontend
+
Without Text Frontend
他只是一个纸老虎。
@@ -846,6 +1101,8 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
-
+
+
+
\ No newline at end of file
diff --git a/docs/source/tts/demo_2.rst b/docs/source/tts/demo_2.rst
index 37922fcbf..2f0ca7cdb 100644
--- a/docs/source/tts/demo_2.rst
+++ b/docs/source/tts/demo_2.rst
@@ -5,3 +5,283 @@ This is an audio demo page to contrast PaddleSpeech TTS and Espnet TTS, We use t
We use Espnet's released models here.
FastSpeech2 + Parallel WaveGAN in CSMSC
+
+.. raw:: html
+
+
+
+
+
+
Text
+
Espnet TTS
+
PaddleSpeech TTS
+
+
+
早上好,今天是2020/10/29,最低温度是-3°C。
+
+
+
+
+
+
+
+
+
你好,我的编号是37249,很高兴为您服务。
+
+
+
+
+
+
+
+
+
我们公司有37249个人。
+
+
+
+
+
+
+
+
+
我出生于2005年10月8日。
+
+
+
+
+
+
+
+
+
我们习惯在12:30吃中午饭。
+
+
+
+
+
+
+
+
+
只要有超过3/4的人投票同意,你就会成为我们的新班长。
+
+
+
+
+
+
+
+
+
我要买一只价值999.9元的手表。
+
+
+
+
+
+
+
+
+
我的手机号是18544139121,欢迎来电。
+
+
+
+
+
+
+
+
+
明天有62%的概率降雨。
+
+
+
+
+
+
+
+
+
手表厂有五种好产品。
+
+
+
+
+
+
+
+
+
跑马场有五百匹很勇敢的千里马。
+
+
+
+
+
+
+
+
+
有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。
+
+
+
+
+
+
+
+
+
史小姐拿着小雨伞去找她的老保姆了。
+
+
+
+
+
+
+
+
+
不要相信这个老奶奶说的话,她一点儿也不好。
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/source/tts/test_sentence.txt b/docs/source/tts/test_sentence.txt
new file mode 100644
index 000000000..933f47491
--- /dev/null
+++ b/docs/source/tts/test_sentence.txt
@@ -0,0 +1,14 @@
+001 早上好,今天是2020/10/29,最低温度是-3°C。
+002 你好,我的编号是37249,很高兴为您服务。
+003 我们公司有37249个人。
+004 我出生于2005年10月8日。
+005 我们习惯在12:30吃中午饭。
+006 只要有超过3/4的人投票同意,你就会成为我们的新班长。
+007 我要买一只价值999.9元的手表。
+008 我的手机号是18544139121,欢迎来电。
+009 明天有62%的概率降雨。
+010 手表厂有五种好产品。
+011 跑马场有五百匹很勇敢的千里马。
+012 有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。
+013 史小姐拿着小雨伞去找她的老保姆了。
+014 不要相信这个老奶奶说的话,她一点儿也不好。
\ No newline at end of file
diff --git a/parakeet/models/fastspeech2/fastspeech2.py b/parakeet/models/fastspeech2/fastspeech2.py
index 7c0e20bc2..bde3a82ba 100644
--- a/parakeet/models/fastspeech2/fastspeech2.py
+++ b/parakeet/models/fastspeech2/fastspeech2.py
@@ -420,9 +420,18 @@ class FastSpeech2(nn.Layer):
if is_inference:
# (B, Tmax)
- d_outs = self.duration_predictor.inference(hs, d_masks)
+ if ds is not None:
+ d_outs = ds
+ else:
+ d_outs = self.duration_predictor.inference(hs, d_masks)
+ if ps is not None:
+ p_outs = ps
+ if es is not None:
+ e_outs = es
+
# use prediction in inference
# (B, Tmax, 1)
+
p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
@@ -513,7 +522,7 @@ class FastSpeech2(nn.Layer):
x = paddle.cast(text, 'int64')
y = speech
spemb = spembs
- if durations:
+ if durations is not None:
d = paddle.cast(durations, 'int64')
p, e = pitch, energy
# setup batch axis
@@ -531,9 +540,12 @@ class FastSpeech2(nn.Layer):
if use_teacher_forcing:
# use groundtruth of duration, pitch, and energy
- ds, ps, es = d.unsqueeze(0), p.unsqueeze(0), e.unsqueeze(0)
+ ds = d.unsqueeze(0) if d is not None else None
+ ps = p.unsqueeze(0) if p is not None else None
+ es = e.unsqueeze(0) if e is not None else None
+ # ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
# (1, L, odim)
- _, outs, *_ = self._forward(
+ _, outs, d_outs, *_ = self._forward(
xs,
ilens,
ys,
@@ -542,10 +554,11 @@ class FastSpeech2(nn.Layer):
es=es,
spembs=spembs,
spk_id=spk_id,
- tone_id=tone_id)
+ tone_id=tone_id,
+ is_inference=True)
else:
# (1, L, odim)
- _, outs, *_ = self._forward(
+ _, outs, d_outs, *_ = self._forward(
xs,
ilens,
ys,