Merge pull request #956 from yt605155624/fix_docs

[TTS]add tts demo in read the docs
pull/961/head
Hui Zhang 3 years ago committed by GitHub
commit ed8802bd1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -74,9 +74,9 @@ Just a quick test of our functions: [English ASR](link/hubdetail?name=deepspeech
Developers can have a try of our model with only a few lines of code. Developers can have a try of our model with only a few lines of code.
A tiny *ASR* DeepSpeech2 model training on toy set of LibriSpeech: A tiny **ASR** DeepSpeech2 model training on toy set of LibriSpeech:
```shell ```bash
cd examples/tiny/s0/ cd examples/tiny/s0/
# source the environment # source the environment
source path.sh source path.sh
@ -86,16 +86,34 @@ bash local/data.sh
bash local/test.sh conf/deepspeech2.yaml ckptfile offline bash local/test.sh conf/deepspeech2.yaml ckptfile offline
``` ```
For *TTS*, try FastSpeech2 on LJSpeech: For **TTS**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC:
- Download LJSpeech-1.1 from the [ljspeech official website](https://keithito.com/LJ-Speech-Dataset/), our prepared durations for fastspeech2 [ljspeech_alignment](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz).
- The pretrained models are seperated into two parts: [fastspeech2_nosil_ljspeech_ckpt](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) and [pwg_ljspeech_ckpt](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip). Please download then unzip to `./model/fastspeech2` and `./model/pwg` respectively.
- Assume your path to the dataset is `~/datasets/LJSpeech-1.1` and `./ljspeech_alignment` accordingly, preprocess your data and then use our pretrained model to synthesize:
```shell
bash ./local/preprocess.sh conf/default.yaml
bash ./local/synthesize_e2e.sh conf/default.yaml ./model/fastspeech2/snapshot_iter_100000.pdz ./model/pwg/pwg_snapshot_iter_400000.pdz
```
```bash
cd examples/csmsc/tts3
# download the pretrained models and unaip them
wget https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
unzip pwg_baker_ckpt_0.4.zip
wget https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip fastspeech2_nosil_baker_ckpt_0.4.zip
# source the environment
source path.sh
# run end-to-end synthesize
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize_e2e.py \
--fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--inference-dir=exp/default/inference \
--device="gpu" \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
```
If you want to try more functions like training and tuning, please see [ASR getting started](docs/source/asr/getting_started.md) and [TTS Basic Use](/docs/source/tts/basic_usage.md). If you want to try more functions like training and tuning, please see [ASR getting started](docs/source/asr/getting_started.md) and [TTS Basic Use](/docs/source/tts/basic_usage.md).

@ -0,0 +1,5 @@
.wy-nav-content {
max-width: 80%;
}
.table table{ background:#b9b9b9}
.table table td{ background:#FFF; }

@ -79,6 +79,9 @@ smartquotes = False
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static'] html_static_path = ['_static']
html_logo = '../images/paddle.png' html_logo = '../images/paddle.png'
html_css_files = [
'custom.css',
]
# -- Extension configuration ------------------------------------------------- # -- Extension configuration -------------------------------------------------
# numpydoc_show_class_members = False # numpydoc_show_class_members = False

File diff suppressed because it is too large Load Diff

@ -5,3 +5,283 @@ This is an audio demo page to contrast PaddleSpeech TTS and Espnet TTS, We use t
We use Espnet's released models here. We use Espnet's released models here.
FastSpeech2 + Parallel WaveGAN in CSMSC FastSpeech2 + Parallel WaveGAN in CSMSC
.. raw:: html
<div class="table">
<table border="2" cellspacing="1" cellpadding="1">
<tr>
<th align="center"> Text </th>
<th align="center"> Espent TTS </th>
<th align="center"> PaddleSpeech TTS </th>
</tr>
<tr>
<td>早上好今天是2020/10/29最低温度是-3°C。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>你好我的编号是37249很高兴为您服务。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我们公司有37249个人。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我出生于2005年10月8日。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我们习惯在12:30吃中午饭。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>只要有超过3/4的人投票同意你就会成为我们的新班长。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我要买一只价值999.9元的手表。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我的手机号是18544139121欢迎来电。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>明天有62%的概率降雨。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>手表厂有五种好产品。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/010.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/010.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>跑马场有五百匹很勇敢的千里马。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/011.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/011.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/012.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/012.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>史小姐拿着小雨伞去找她的老保姆了。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/013.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/013.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>不要相信这个老奶奶说的话,她一点儿也不好。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/014.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/014.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
</div>

@ -0,0 +1,14 @@
001 早上好今天是2020/10/29最低温度是-3°C。
002 你好我的编号是37249很高兴为您服务。
003 我们公司有37249个人。
004 我出生于2005年10月8日。
005 我们习惯在12:30吃中午饭。
006 只要有超过3/4的人投票同意你就会成为我们的新班长。
007 我要买一只价值999.9元的手表。
008 我的手机号是18544139121欢迎来电。
009 明天有62%的概率降雨。
010 手表厂有五种好产品。
011 跑马场有五百匹很勇敢的千里马。
012 有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。
013 史小姐拿着小雨伞去找她的老保姆了。
014 不要相信这个老奶奶说的话,她一点儿也不好。

@ -419,9 +419,18 @@ class FastSpeech2(nn.Layer):
if is_inference: if is_inference:
# (B, Tmax) # (B, Tmax)
d_outs = self.duration_predictor.inference(hs, d_masks) if ds is not None:
d_outs = ds
else:
d_outs = self.duration_predictor.inference(hs, d_masks)
if ps is not None:
p_outs = ps
if es is not None:
e_outs = es
# use prediction in inference # use prediction in inference
# (B, Tmax, 1) # (B, Tmax, 1)
p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1)) (0, 2, 1))
e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
@ -516,7 +525,7 @@ class FastSpeech2(nn.Layer):
x = paddle.cast(text, 'int64') x = paddle.cast(text, 'int64')
y = speech y = speech
spemb = spembs spemb = spembs
if durations: if durations is not None:
d = paddle.cast(durations, 'int64') d = paddle.cast(durations, 'int64')
p, e = pitch, energy p, e = pitch, energy
# setup batch axis # setup batch axis
@ -534,9 +543,12 @@ class FastSpeech2(nn.Layer):
if use_teacher_forcing: if use_teacher_forcing:
# use groundtruth of duration, pitch, and energy # use groundtruth of duration, pitch, and energy
ds, ps, es = d.unsqueeze(0), p.unsqueeze(0), e.unsqueeze(0) ds = d.unsqueeze(0) if d is not None else None
ps = p.unsqueeze(0) if p is not None else None
es = e.unsqueeze(0) if e is not None else None
# ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
# (1, L, odim) # (1, L, odim)
_, outs, *_ = self._forward( _, outs, d_outs, *_ = self._forward(
xs, xs,
ilens, ilens,
ys, ys,
@ -545,10 +557,11 @@ class FastSpeech2(nn.Layer):
es=es, es=es,
spembs=spembs, spembs=spembs,
spk_id=spk_id, spk_id=spk_id,
tone_id=tone_id) tone_id=tone_id,
is_inference=True)
else: else:
# (1, L, odim) # (1, L, odim)
_, outs, *_ = self._forward( _, outs, d_outs, *_ = self._forward(
xs, xs,
ilens, ilens,
ys, ys,

Loading…
Cancel
Save