Merge branch 'develop' into datapipe

pull/1012/head
Hui Zhang 3 years ago
commit 2ba3f00bbd

@ -1,15 +0,0 @@
unset GREP_OPTIONS
# https://zhuanlan.zhihu.com/p/33050965
alias nvs='nvidia-smi'
alias his='history'
alias jobs='jobs -l'
alias ports='netstat -tulanp'
alias wget='wget -c'
## Colorize the grep command output for ease of use (good for log files)##
alias grep='grep --color=auto'
alias egrep='egrep --color=auto'
alias fgrep='fgrep --color=auto'

@ -53,7 +53,7 @@ pull_request_rules:
add: ["T2S"]
- name: "auto add label=Audio"
conditions:
- files~=^paddleaudio/
- files~=^audio/
actions:
label:
add: ["Audio"]
@ -69,6 +69,12 @@ pull_request_rules:
actions:
label:
add: ["Example"]
- name: "auto add label=Demo"
conditions:
- files~=^demos/
actions:
label:
add: ["Demo"]
- name: "auto add label=README"
conditions:
- files~=README.md
@ -77,7 +83,7 @@ pull_request_rules:
add: ["README"]
- name: "auto add label=Documentation"
conditions:
- files~=^doc/
- files~=^docs/
actions:
label:
add: ["Documentation"]

@ -13,8 +13,8 @@
files: (?!.*paddle)^.*$
- id: end-of-file-fixer
files: \.md$
- id: trailing-whitespace
files: \.md$
#- id: trailing-whitespace
# files: \.md$
- id: requirements-txt-fixer
exclude: (?=third_party).*$
- id: check-yaml

468
.vimrc

@ -1,468 +0,0 @@
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Maintainer:
" Amir Salihefendic — @amix3k
"
" Awesome_version:
" Get this config, nice color schemes and lots of plugins!
"
" Install the awesome version from:
"
" https://github.com/amix/vimrc
"
" Sections:
" -> General
" -> VIM user interface
" -> Colors and Fonts
" -> Files and backups
" -> Text, tab and indent related
" -> Visual mode related
" -> Moving around, tabs and buffers
" -> Status line
" -> Editing mappings
" -> vimgrep searching and cope displaying
" -> Spell checking
" -> Misc
" -> Helper functions
"
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => General
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Sets how many lines of history VIM has to remember
set history=500
" Enable filetype plugins
filetype plugin on
filetype indent on
" Set to auto read when a file is changed from the outside
set autoread
au FocusGained,BufEnter * checktime
" With a map leader it's possible to do extra key combinations
" like <leader>w saves the current file
let mapleader = ","
" Fast saving
nmap <leader>w :w!<cr>
" :W sudo saves the file
" (useful for handling the permission-denied error)
command! W execute 'w !sudo tee % > /dev/null' <bar> edit!
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => VIM user interface
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Set 7 lines to the cursor - when moving vertically using j/k
set so=7
" Avoid garbled characters in Chinese language windows OS
let $LANG='en'
set langmenu=en
source $VIMRUNTIME/delmenu.vim
source $VIMRUNTIME/menu.vim
" Turn on the Wild menu
set wildmenu
" Ignore compiled files
set wildignore=*.o,*~,*.pyc
if has("win16") || has("win32")
set wildignore+=.git\*,.hg\*,.svn\*
else
set wildignore+=*/.git/*,*/.hg/*,*/.svn/*,*/.DS_Store
endif
"Always show current position
set ruler
" Height of the command bar
set cmdheight=1
" A buffer becomes hidden when it is abandoned
set hid
" Configure backspace so it acts as it should act
set backspace=eol,start,indent
set whichwrap+=<,>,h,l
" Ignore case when searching
set ignorecase
" When searching try to be smart about cases
set smartcase
" Highlight search results
set hlsearch
" Makes search act like search in modern browsers
set incsearch
" Don't redraw while executing macros (good performance config)
set lazyredraw
" For regular expressions turn magic on
set magic
" Show matching brackets when text indicator is over them
set showmatch
" How many tenths of a second to blink when matching brackets
set mat=2
" No annoying sound on errors
set noerrorbells
set novisualbell
set t_vb=
set tm=500
" Properly disable sound on errors on MacVim
if has("gui_macvim")
autocmd GUIEnter * set vb t_vb=
endif
" Add a bit extra margin to the left
set foldcolumn=1
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Colors and Fonts
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Enable syntax highlighting
syntax enable
" Enable 256 colors palette in Gnome Terminal
if $COLORTERM == 'gnome-terminal'
set t_Co=256
endif
try
colorscheme desert
catch
endtry
set background=dark
" Set extra options when running in GUI mode
if has("gui_running")
set guioptions-=T
set guioptions-=e
set t_Co=256
set guitablabel=%M\ %t
endif
" Set utf8 as standard encoding and en_US as the standard language
set encoding=utf8
set fileencodings=ucs-bom,utf-8,cp936
set fileencoding=gb2312
set termencoding=utf-8
" Use Unix as the standard file type
set ffs=unix,dos,mac
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Files, backups and undo
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Turn backup off, since most stuff is in SVN, git etc. anyway...
set nobackup
set nowb
set noswapfile
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Text, tab and indent related
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Use spaces instead of tabs
set expandtab
" Be smart when using tabs ;)
set smarttab
" 1 tab == 4 spaces
set shiftwidth=4
set tabstop=4
" Linebreak on 500 characters
set lbr
set tw=500
set ai "Auto indent
set si "Smart indent
set wrap "Wrap lines
""""""""""""""""""""""""""""""
" => Visual mode related
""""""""""""""""""""""""""""""
" Visual mode pressing * or # searches for the current selection
" Super useful! From an idea by Michael Naumann
vnoremap <silent> * :<C-u>call VisualSelection('', '')<CR>/<C-R>=@/<CR><CR>
vnoremap <silent> # :<C-u>call VisualSelection('', '')<CR>?<C-R>=@/<CR><CR>
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Moving around, tabs, windows and buffers
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Map <Space> to / (search) and Ctrl-<Space> to ? (backwards search)
map <space> /
map <C-space> ?
" Disable highlight when <leader><cr> is pressed
map <silent> <leader><cr> :noh<cr>
" Smart way to move between windows
map <C-j> <C-W>j
map <C-k> <C-W>k
map <C-h> <C-W>h
map <C-l> <C-W>l
" Close the current buffer
map <leader>bd :Bclose<cr>:tabclose<cr>gT
" Close all the buffers
map <leader>ba :bufdo bd<cr>
map <leader>l :bnext<cr>
map <leader>h :bprevious<cr>
" Useful mappings for managing tabs
map <leader>tn :tabnew<cr>
map <leader>to :tabonly<cr>
map <leader>tc :tabclose<cr>
map <leader>tm :tabmove
map <leader>t<leader> :tabnext
" Let 'tl' toggle between this and the last accessed tab
let g:lasttab = 1
nmap <Leader>tl :exe "tabn ".g:lasttab<CR>
au TabLeave * let g:lasttab = tabpagenr()
" Opens a new tab with the current buffer's path
" Super useful when editing files in the same directory
map <leader>te :tabedit <C-r>=expand("%:p:h")<cr>/
" Switch CWD to the directory of the open buffer
map <leader>cd :cd %:p:h<cr>:pwd<cr>
" Specify the behavior when switching between buffers
try
set switchbuf=useopen,usetab,newtab
set stal=2
catch
endtry
" Return to last edit position when opening files (You want this!)
au BufReadPost * if line("'\"") > 1 && line("'\"") <= line("$") | exe "normal! g'\"" | endif
""""""""""""""""""""""""""""""
" => Status line
""""""""""""""""""""""""""""""
" Always show the status line
set laststatus=2
" Format the status line
set statusline=\ %{HasPaste()}%F%m%r%h\ %w\ \ CWD:\ %r%{getcwd()}%h\ \ \ Line:\ %l\ \ Column:\ %c
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Editing mappings
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Remap VIM 0 to first non-blank character
map 0 ^
" Move a line of text using ALT+[jk] or Command+[jk] on mac
nmap <M-j> mz:m+<cr>`z
nmap <M-k> mz:m-2<cr>`z
vmap <M-j> :m'>+<cr>`<my`>mzgv`yo`z
vmap <M-k> :m'<-2<cr>`>my`<mzgv`yo`z
if has("mac") || has("macunix")
nmap <D-j> <M-j>
nmap <D-k> <M-k>
vmap <D-j> <M-j>
vmap <D-k> <M-k>
endif
" Delete trailing white space on save, useful for some filetypes ;)
fun! CleanExtraSpaces()
let save_cursor = getpos(".")
let old_query = getreg('/')
silent! %s/\s\+$//e
call setpos('.', save_cursor)
call setreg('/', old_query)
endfun
if has("autocmd")
autocmd BufWritePre *.txt,*.js,*.py,*.wiki,*.sh,*.coffee :call CleanExtraSpaces()
endif
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Spell checking
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Pressing ,ss will toggle and untoggle spell checking
map <leader>ss :setlocal spell!<cr>
" Shortcuts using <leader>
map <leader>sn ]s
map <leader>sp [s
map <leader>sa zg
map <leader>s? z=
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Misc
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Remove the Windows ^M - when the encodings gets messed up
noremap <Leader>m mmHmt:%s/<C-V><cr>//ge<cr>'tzt'm
" Quickly open a buffer for scribble
map <leader>q :e ~/buffer<cr>
" Quickly open a markdown buffer for scribble
map <leader>x :e ~/buffer.md<cr>
" Toggle paste mode on and off
map <leader>pp :setlocal paste!<cr>
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Helper functions
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Returns true if paste mode is enabled
function! HasPaste()
if &paste
return 'PASTE MODE '
endif
return ''
endfunction
" Don't close window, when deleting a buffer
command! Bclose call <SID>BufcloseCloseIt()
function! <SID>BufcloseCloseIt()
let l:currentBufNum = bufnr("%")
let l:alternateBufNum = bufnr("#")
if buflisted(l:alternateBufNum)
buffer #
else
bnext
endif
if bufnr("%") == l:currentBufNum
new
endif
if buflisted(l:currentBufNum)
execute("bdelete! ".l:currentBufNum)
endif
endfunction
function! CmdLine(str)
call feedkeys(":" . a:str)
endfunction
function! VisualSelection(direction, extra_filter) range
let l:saved_reg = @"
execute "normal! vgvy"
let l:pattern = escape(@", "\\/.*'$^~[]")
let l:pattern = substitute(l:pattern, "\n$", "", "")
if a:direction == 'gv'
call CmdLine("Ack '" . l:pattern . "' " )
elseif a:direction == 'replace'
call CmdLine("%s" . '/'. l:pattern . '/')
endif
let @/ = l:pattern
let @" = l:saved_reg
endfunction
""""""""""""""""""""""""""""""
" => Python section
""""""""""""""""""""""""""""""
let python_highlight_all = 1
au FileType python syn keyword pythonDecorator True None False self
au BufNewFile,BufRead *.jinja set syntax=htmljinja
au BufNewFile,BufRead *.mako set ft=mako
au FileType python map <buffer> F :set foldmethod=indent<cr>
au FileType python inoremap <buffer> $r return
au FileType python inoremap <buffer> $i import
au FileType python inoremap <buffer> $p print
au FileType python inoremap <buffer> $f # --- <esc>a
au FileType python map <buffer> <leader>1 /class
au FileType python map <buffer> <leader>2 /def
au FileType python map <buffer> <leader>C ?class
au FileType python map <buffer> <leader>D ?def
""""""""""""""""""""""""""""""
" => JavaScript section
"""""""""""""""""""""""""""""""
au FileType javascript call JavaScriptFold()
au FileType javascript setl fen
au FileType javascript setl nocindent
au FileType javascript imap <C-t> $log();<esc>hi
au FileType javascript imap <C-a> alert();<esc>hi
au FileType javascript inoremap <buffer> $r return
au FileType javascript inoremap <buffer> $f // --- PH<esc>FP2xi
function! JavaScriptFold()
setl foldmethod=syntax
setl foldlevelstart=1
syn region foldBraces start=/{/ end=/}/ transparent fold keepend extend
function! FoldText()
return substitute(getline(v:foldstart), '{.*', '{...}', '')
endfunction
setl foldtext=FoldText()
endfunction
""""""""""""""""""""""""""""""
" => CoffeeScript section
"""""""""""""""""""""""""""""""
function! CoffeeScriptFold()
setl foldmethod=indent
setl foldlevelstart=1
endfunction
au FileType coffee call CoffeeScriptFold()
au FileType gitcommit call setpos('.', [0, 1, 1, 0])
""""""""""""""""""""""""""""""
" => Shell section
""""""""""""""""""""""""""""""
if exists('$TMUX')
if has('nvim')
set termguicolors
else
set term=screen-256color
endif
endif
""""""""""""""""""""""""""""""
" => Twig section
""""""""""""""""""""""""""""""
autocmd BufRead *.twig set syntax=html filetype=html
""""""""""""""""""""""""""""""
" => Markdown
""""""""""""""""""""""""""""""
let vim_markdown_folding_disabled = 1

@ -1,11 +1,5 @@
English | [简体中文](README_ch.md)
# PaddleSpeech
<p align="center">
<img src="./docs/images/PaddleSpeech_log.png" />
<img src="./docs/images/PaddleSpeech_logo.png" />
</p>
<div align="center">
@ -16,79 +10,122 @@ English | [简体中文](README_ch.md)
</div>
------------------------------------------------------------------------------------
![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
![support os](https://img.shields.io/badge/os-linux-yellow.svg)
<!---
why they should use your module,
how they can install it,
how they can use it
from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readmes-readable.md
1.What is this repo or project? (You can reuse the repo description you used earlier because this section doesnt have to be long.)
2.How does it work?
3.Who will use this repo or project?
4.What is the goal of this project?
-->
**PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech, with state-of-art and influential models.
**PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech, with the state-of-art and influential models.
Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features at:
- **Fast and Light-weight**: we provide high-speed and ultra-lightweight models that are convenient for industrial deployment.
- **Rule-based Chinese frontend**: our frontend contains Text Normalization (TN) and Grapheme-to-Phoneme (G2P, including Polyphone and Tone Sandhi). Moreover, we use self-defined linguistic rules to adapt Chinese context.
- **Varieties of Functions that Vitalize both Industrial and Academia**:
- *Implementation of critical audio tasks*: this toolkit contains audio functions like Speech Translation (ST), Automatic Speech Recognition (ASR), Text-To-Speech Synthesis (TTS), Voice Cloning(VC), Punctuation Restoration, etc.
- *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model lists](#models-list) for more details.
- *Cross-domain application*: as an extension of the application of traditional audio tasks, we combine the aforementioned tasks with other fields like NLP.
##### Speech-to-Text
Let's install PaddleSpeech with only a few lines of code!
<div align = "center">
<table style="width:100%">
<thead>
<tr>
<th> Input Audio </th>
<th width="550"> Recognition Result </th>
</tr>
</thead>
<tbody>
<tr>
<td align = "center">
<a href="https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav" rel="nofollow">
<img align="center" src="./docs/images/audio_icon.png" width="200 style="max-width: 100%;"></a><br>
</td>
<td >I knocked at the door on the ancient side of the building.</td>
</tr>
<tr>
<td align = "center">
<a href="https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav" rel="nofollow">
<img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
</td>
<td>我认为跑步最重要的就是给我带来了身体健康。</td>
</tr>
</tbody>
</table>
>Note: The official name is still deepspeech. 2021/10/26
</div>
If you are using Ubuntu, PaddleSpeech can be set up with pip installation (with root privilege).
```shell
git clone https://github.com/PaddlePaddle/DeepSpeech.git
cd DeepSpeech
pip install -e .
```
##### Text-to-Speech
<div align = "center">
<table style="width:100%">
<thead>
<tr>
<th><img width="200" height="1"> Input Text <img width="200" height="1"> </th>
<th>Synthetic Audio</th>
</tr>
</thead>
<tbody>
<tr>
<td >Life was like a box of chocolates, you never know what you're gonna get.</td>
<td align = "center">
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/001.wav" rel="nofollow">
<img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
</td>
</tr>
<tr>
<td >早上好今天是2020/10/29最低温度是-3°C。</td>
<td align = "center">
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/001.wav" rel="nofollow">
<img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
</td>
</tr>
</tbody>
</table>
</div>
## Table of Contents
For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html).
The contents of this README is as follow:
- [Alternative Installation](#alternative-installation)
- [Quick Start](#quick-start)
- [Models List](#models-list)
- [Tutorials](#tutorials)
- [FAQ and Contributing](#faq-and-contributing)
- [License](#license)
- [Acknowledgement](#acknowledgement)
Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features at:
- **Fast and Light-weight**: we provide high-speed and ultra-lightweight models that are convenient for industrial deployment.
- **Rule-based Chinese frontend**: our frontend contains Text Normalization and Grapheme-to-Phoneme (G2P, including Polyphone and Tone Sandhi). Moreover, we use self-defined linguistic rules to adapt Chinese context.
- **Varieties of Functions that Vitalize both Industrial and Academia**:
- *Implementation of critical audio tasks*: this toolkit contains audio functions like Speech Translation, Automatic Speech Recognition, Text-to-Speech Synthesis, Voice Cloning, etc.
- *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details.
- *Cascaded models application*: as an extension of the application of traditional audio tasks, we combine the workflows of aforementioned tasks with other fields like Natural language processing (NLP), like Punctuation Restoration.
## Alternative Installation
## Installation
The base environment in this page is
- Ubuntu 16.04
- python>=3.7
- paddlepaddle==2.1.2
- paddlepaddle>=2.2.0
If you want to set up PaddleSpeech in other environment, please see the [ASR installation](docs/source/asr/install.md) and [TTS installation](docs/source/tts/install.md) documents for all the alternatives.
If you want to set up PaddleSpeech in other environment, please see the [installation](./docs/source/install.md) documents for all the alternatives.
## Quick Start
> Note: the current links to `English ASR` and `English TTS` are not valid.
Just a quick test of our functions: [English ASR](link/hubdetail?name=deepspeech2_aishell&en_category=AutomaticSpeechRecognition) and [English TTS](link/hubdetail?name=fastspeech2_baker&en_category=TextToSpeech) by typing message or upload your own audio file.
Developers can have a try of our model with only a few lines of code.
A tiny **ASR** DeepSpeech2 model training on toy set of LibriSpeech:
A tiny DeepSpeech2 **Speech-to-Text** model training on toy set of LibriSpeech:
```bash
```shell
cd examples/tiny/s0/
# source the environment
source path.sh
# prepare librispeech dataset
bash local/data.sh
# evaluate your ckptfile model file
bash local/test.sh conf/deepspeech2.yaml ckptfile offline
source ../../../utils/parse_options.sh
# prepare data
bash ./local/data.sh
# train model, all `ckpt` under `exp` dir, if you use paddlepaddle-gpu, you can set CUDA_VISIBLE_DEVICES before the train script
./local/train.sh conf/deepspeech2.yaml deepspeech2 offline
# avg n best model to get the test model, in this case, n = 1
avg.sh best exp/deepspeech2/checkpoints 1
# evaluate the test model
./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 offline
```
For **TTS**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC:
```bash
For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC:
```shell
cd examples/csmsc/tts3
# download the pretrained models and unaip them
wget https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
@ -110,30 +147,25 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--inference-dir=exp/default/inference \
--device="gpu" \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
```
If you want to try more functions like training and tuning, please see [ASR getting started](docs/source/asr/getting_started.md) and [TTS Basic Use](/docs/source/tts/basic_usage.md).
If you want to try more functions like training and tuning, please see [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-To-Speech Quick Start](./docs/source/tts/quick_start.md).
## Models List
## Model List
PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_model.md) with available pretrained models.
ASR module contains *Acoustic Model* and *Language Model*, with the following details:
Speech-to-Text module contains *Acoustic Model* and *Language Model*, with the following details:
<!---
The current hyperlinks redirect to [Previous Parakeet](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples).
-->
> Note: The `Link` should be code path rather than download links.
<table>
<table style="width:100%">
<thead>
<tr>
<th>ASR Module Type</th>
<th>Speech-to-Text Module Type</th>
<th>Dataset</th>
<th>Model Type</th>
<th>Link</th>
@ -141,76 +173,61 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
</thead>
<tbody>
<tr>
<td rowspan="6">Acoustic Model</td>
<td rowspan="4" >Aishell</td>
<td >2 Conv + 5 LSTM layers with only forward direction</td>
<td>
<a href = "https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz">Ds2 Online Aishell Model</a>
</td>
</tr>
<tr>
<td>2 Conv + 3 bidirectional GRU layers</td>
<td>
<a href = "https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz">Ds2 Offline Aishell Model</a>
</td>
</tr>
<tr>
<td>Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC</td>
<td rowspan="3">Acoustic Model</td>
<td rowspan="2" >Aishell</td>
<td >DeepSpeech2 RNN + Conv based Models</td>
<td>
<a href = "https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz">Conformer Offline Aishell Model</a>
<a href = "./examples/aishell/s0">deepspeech2-aishell</a>
</td>
</tr>
<tr>
<td >Encoder:Conformer, Decoder:Transformer, Decoding method: Attention</td>
<td>Transformer based Attention Models </td>
<td>
<a href = "https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz">Conformer Librispeech Model</a>
<a href = "./examples/aishell/s1">u2.transformer.conformer-aishell</a>
</td>
</tr>
<tr>
<td rowspan="2"> Librispeech</td>
<td>Encoder:Conformer, Decoder:Transformer, Decoding method: Attention</td>
<td> <a href = "https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz">Conformer Librispeech Model</a> </td>
</tr>
<tr>
<td>Encoder:Transformer, Decoder:Transformer, Decoding method: Attention</td>
<td> Librispeech</td>
<td>Transformer based Attention Models </td>
<td>
<a href = "https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz">Transformer Librispeech Model</a>
<a href = "./examples/librispeech/s0">deepspeech2-librispeech</a> / <a href = "./examples/librispeech/s1">transformer.conformer.u2-librispeech</a> / <a href = "./examples/librispeech/s2">transformer.conformer.u2-kaldi-librispeech</a>
</td>
</tr>
<tr>
<td rowspan="3">Language Model</td>
<td >CommonCrawl(en.00)</td>
<td >English Language Model</td>
<td>
<a href = "https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm">English Language Model</a>
</td>
</tr>
<tr>
<td rowspan="2">Baidu Internal Corpus</td>
<td>Mandarin Language Model Small</td>
<tr>
<td>Alignment</td>
<td>THCHS30</td>
<td>MFA</td>
<td>
<a href = ".examples/thchs30/a0">mfa-thchs30</a>
</td>
</tr>
<tr>
<td rowspan="2">Language Model</td>
<td colspan = "2">Ngram Language Model</td>
<td>
<a href = "https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm">Mandarin Language Model Small</a>
<a href = "./examples/other/ngram_lm">kenlm</a>
</td>
</tr>
<tr>
<td >Mandarin Language Model Large</td>
<td>TIMIT</td>
<td>Unified Streaming & Non-streaming Two-pass</td>
<td>
<a href = "https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm">Mandarin Language Model Large</a>
<a href = "./examples/timit/s1"> u2-timit</a>
</td>
</tr>
</tbody>
</table>
PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model* and *Vocoder*. Acoustic Model and Vocoder models are listed as follow:
PaddleSpeech Text-to-Speech mainly contains three modules: *Text Frontend*, *Acoustic Model* and *Vocoder*. Acoustic Model and Vocoder models are listed as follow:
<table>
<thead>
<tr>
<th>TTS Module Type</th>
<th>Model Type</th>
<th>Dataset</th>
<th>Link</th>
<th> Text-to-Speech Module Type <img width="110" height="1"> </th>
<th> Model Type </th>
<th> <img width="50" height="1"> Dataset <img width="50" height="1"> </th>
<th> <img width="101" height="1"> Link <img width="105" height="1"> </th>
</tr>
</thead>
<tbody>
@ -218,15 +235,15 @@ PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model
<td> Text Frontend</td>
<td colspan="2"> &emsp; </td>
<td>
<a href = "./examples/other/text_frontend">chinese-fronted</a>
<a href = "./examples/other/tn">tn</a> / <a href = "./examples/other/g2p">g2p</a>
</td>
</tr>
<tr>
<td rowspan="7">Acoustic Model</td>
<td rowspan="4">Acoustic Model</td>
<td >Tacotron2</td>
<td rowspan="2" >LJSpeech</td>
<td>
<a href = "./examples/ljspeech/tts0">tacotron2-vctk</a>
<a href = "./examples/ljspeech/tts0">tacotron2-ljspeech</a>
</td>
</tr>
<tr>
@ -243,28 +260,14 @@ PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model
</td>
</tr>
<tr>
<td rowspan="4">FastSpeech2</td>
<td>AISHELL-3</td>
<td>FastSpeech2</td>
<td>AISHELL-3 / VCTK / LJSpeech / CSMSC</td>
<td>
<a href = "./examples/aishell3/tts3">fastspeech2-aishell3</a>
</td>
</tr>
<tr>
<td>VCTK</td>
<td> <a href = "./examples/vctk/tts3">fastspeech2-vctk</a> </td>
</tr>
<tr>
<td>LJSpeech</td>
<td> <a href = "./examples/ljspeech/tts3">fastspeech2-ljspeech</a> </td>
</tr>
<tr>
<td>CSMSC</td>
<td>
<a href = "./examples/csmsc/tts3">fastspeech2-csmsc</a>
<a href = "./examples/aishell3/tts3">fastspeech2-aishell3</a> / <a href = "./examples/vctk/tts3">fastspeech2-vctk</a> / <a href = "./examples/ljspeech/tts3">fastspeech2-ljspeech</a> / <a href = "./examples/csmsc/tts3">fastspeech2-csmsc</a>
</td>
</tr>
<tr>
<td rowspan="4">Vocoder</td>
<td rowspan="2">Vocoder</td>
<td >WaveFlow</td>
<td >LJSpeech</td>
<td>
@ -272,22 +275,10 @@ PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model
</td>
</tr>
<tr>
<td rowspan="3">Parallel WaveGAN</td>
<td >LJSpeech</td>
<td>
<a href = "./examples/ljspeech/voc1">PWGAN-ljspeech</a>
</td>
</tr>
<tr>
<td >VCTK</td>
<td >Parallel WaveGAN</td>
<td >LJSpeech / VCTK / CSMSC</td>
<td>
<a href = "./examples/vctk/voc1">PWGAN-vctk</a>
</td>
</tr>
<tr>
<td >CSMSC</td>
<td>
<a href = "./examples/csmsc/voc1">PWGAN-csmsc</a>
<a href = "./examples/ljspeech/voc1">PWGAN-ljspeech</a> / <a href = "./examples/vctk/voc1">PWGAN-vctk</a> / <a href = "./examples/csmsc/voc1">PWGAN-csmsc</a>
</td>
</tr>
<tr>
@ -309,30 +300,48 @@ PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model
</tbody>
</table>
## Tutorials
Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an overview of the hot academic topics in speech. If you want to focus on the two tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas.
The original ASR module is based on [Baidu's DeepSpeech](https://arxiv.org/abs/1412.5567) which is an independent product named [DeepSpeech](https://deepspeech.readthedocs.io). However, the toolkit aligns almost all the SoTA modules in the pipeline. Specifically, these modules are
Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an overview of the hot academic topics in speech. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas.
- [Overview](./docs/source/introduction.md)
- Quick Start
- [Dependencies](./docs/source/dependencies.md) and [Installation](./docs/source/install.md)
- [Quick Start of Speech-to-Text](./docs/source/asr/quick_start.md)
- [Quick Start of Text-to-Speech](./docs/source/tts/quick_start.md)
- Speech-to-Text
- [Models Introduction](./docs/source/asr/models_introduction.md)
- [Data Preparation](./docs/source/asr/data_preparation.md)
- [Data Augmentation Pipeline](./docs/source/asr/augmentation.md)
- [Features](./docs/source/asr/feature_list.md)
- [Ngram LM](./docs/source/asr/ngram_lm.md)
- Text-to-Speech
- [Introduction](./docs/source/tts/models_introduction.md)
- [Advanced Usage](./docs/source/tts/advanced_usage.md)
- [Chinese Rule Based Text Frontend](./docs/source/tts/zh_text_frontend.md)
- [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) and [PaddleSpeech VS. Espnet](https://paddlespeech.readthedocs.io/en/latest/tts/demo_2.html)
- [Released Models](./docs/source/released_model.md)
The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://paddleparakeet.readthedocs.io/en/latest/released_models.html) is a good guideline for the pipeline components.
* [Data Prepration](docs/source/asr/data_preparation.md)
* [Data Augmentation](docs/source/asr/augmentation.md)
* [Ngram LM](docs/source/asr/ngram_lm.md)
* [Benchmark](docs/source/asr/benchmark.md)
* [Relased Model](docs/source/asr/released_model.md)
The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://paddleparakeet.readthedocs.io/en/latest/released_models.html) is a good guideline for the pipeline components.
## FAQ and Contributing
You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/PaddleSpeech/issues)! Also, we highly appreciate if you would like to contribute to this project!
## FAQ and Contributing
## Citation
You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/DeepSpeech/issues)! Also, we highly appreciate if you would like to contribute to this project!
To cite PaddleSpeech for research, please use the following format.
```tex
@misc{ppspeech2021,
title={PaddleSpeech, a toolkit for audio processing based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}},
year={2021}
}
```
## License
## License and Acknowledge
PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
PaddleSpeech depends on a lot of open source repos. See [references](docs/source/reference.md) for more information.
PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.

@ -0,0 +1,5 @@
# ASR
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh
```

@ -0,0 +1,46 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
parser.add_argument("--wav_en", type=str)
parser.add_argument("--wav_zh", type=str)
args = parser.parse_args()
# yapf: enable
if __name__ == '__main__':
paddle.set_device(args.device)
s2t_en_model = hub.Module(name='u2_conformer_librispeech')
s2t_zh_model = hub.Module(name='u2_conformer_aishell')
args.wav_en = os.path.abspath(os.path.expanduser(args.wav_en))
args.wav_zh = os.path.abspath(os.path.expanduser(args.wav_zh))
assert os.path.isfile(args.wav_en) and os.path.isfile(
args.wav_zh), 'Wav files not exist.'
print('[S2T][en]Wav: {}'.format(args.wav_en))
text_en = s2t_en_model.speech_recognize(args.wav_en)
print('[S2T][en]Text: {}'.format(text_en))
print('[S2T][zh]Wav: {}'.format(args.wav_zh))
text_zh = s2t_zh_model.speech_recognize(args.wav_zh)
print('[S2T][zh]Text: {}'.format(text_zh))

@ -0,0 +1,30 @@
#!/bin/bash
if python -c "import paddlehub" &> /dev/null; then
echo 'PaddleHub has already been installed.'
else
echo 'Installing PaddleHub...'
pip install paddlehub -U
fi
mkdir -p data
wav_en=data/en.wav
wav_zh=data/zh.wav
test -e ${wav_en} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data
test -e ${wav_zh} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
if [ ${ngpu} == 0 ];then
device=cpu
else
device=gpu
fi
echo "using ${device}..."
python3 -u hub_infer.py \
--device ${device} \
--wav_en ${wav_en} \
--wav_zh ${wav_zh}
exit 0

@ -0,0 +1 @@
data

@ -0,0 +1,13 @@
# echo system
ASR + TTS
中文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
```
英文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
```

@ -0,0 +1,55 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import librosa
import paddle
import paddlehub as hub
import soundfile as sf
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
parser.add_argument("--text", type=str, nargs='+')
parser.add_argument("--output_dir", type=str)
args = parser.parse_args()
# yapf: enable
if __name__ == '__main__':
paddle.set_device(args.device)
output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
if args.lang == 'zh':
t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
s2t_model = hub.Module(name='u2_conformer_aishell')
else:
t2s_model = hub.Module(
name='fastspeech2_ljspeech', output_dir=output_dir)
s2t_model = hub.Module(name='u2_conformer_librispeech')
if isinstance(args.text, list):
args.text = ' '.join(args.text)
wavs = t2s_model.generate([args.text], device=args.device)
print('[T2S]Wav file has been generated: {}'.format(wavs[0]))
# convert sr to 16k
x, sr = librosa.load(wavs[0])
y = librosa.resample(x, sr, 16000)
wav_16k = wavs[0].replace('.wav', '_16k.wav')
sf.write(wav_16k, y, 16000)
print('[S2T]Resample to 16k: {}'.format(wav_16k))
text = s2t_model.speech_recognize(wav_16k)
print('[S2T]Text recognized from wav file: {}'.format(text))

@ -0,0 +1,42 @@
#!/bin/bash
if python -c "import paddlehub" &> /dev/null; then
echo 'PaddleHub has already been installed.'
else
echo 'Installing PaddleHub...'
pip install paddlehub -U
fi
if [ $# != 2 -a $# != 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
if [ ${ngpu} == 0 ];then
device=cpu
else
device=gpu
fi
echo "using ${device}..."
text=$1
output_dir=$2
if [ $# == 3 ];then
lang=$3
else
lang=zh
fi
if [ ! -d $output_dir ];then
mkdir -p $output_dir
fi
python3 -u hub_infer.py \
--lang ${lang} \
--device ${device} \
--text \"${text}\" \
--output_dir ${output_dir}
exit 0

Binary file not shown.

After

Width:  |  Height:  |  Size: 441 KiB

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,61 @@
#!/bin/bash
source path.sh
gpus=0
stage=0
stop_stage=100
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
mkdir -p download
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# install PaddleGAN
git clone https://github.com/PaddlePaddle/PaddleGAN.git
pip install -e PaddleGAN/
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download pretrained PaddleGAN model
wget -P download https://paddlegan.bj.bcebos.com/models/wav2lip_hq.pdparams
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# download pretrained tts models and unzip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# run tts
CUDA_VISIBLE_DEVICES=${gpus} \
python3 ${BIN_DIR}/synthesize_e2e.py \
--fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=sentences.txt \
--output-dir=output/wavs \
--inference-dir=output/inference \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
# output/inference is not needed here, which save the static models
rm -rf output/inference
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# We only test one audio here, cause it's slow
CUDA_VISIBLE_DEVICES=${gpus} \
python3 PaddleGAN/applications/tools/wav2lip.py \
--checkpoint_path download/wav2lip_hq.pdparams \
--face Lamarr.png \
--audio output/wavs/000.wav \
--outfile output/tts_lips.mp4 \
--face_enhancement
fi

@ -0,0 +1 @@
000 谁知青蛙一落地,竟变成了一位英俊的王子。于是遵照国王的意思,他做了公主的亲密伴侣。

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB

@ -0,0 +1,74 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import re
from pathlib import Path
import paddle
from paddleocr import draw_ocr
from paddleocr import PaddleOCR
from PIL import Image
def evaluate(args, ocr):
img_dir = Path(args.img_dir)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
img_out_dir = output_dir / "imgs"
img_out_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / "sentences.txt", "w") as wf:
for name in os.listdir(img_dir):
id = name.split(".")[0]
img_path = img_dir / name
result = ocr.ocr(str(img_path), cls=True)
# draw result
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(
image, boxes, txts, scores, font_path=args.font_path)
im_show = Image.fromarray(im_show)
paragraph = "".join(txts)
# 过滤出中文结果
pattern = re.compile(r'[^(\u4e00-\u9fa5)+,。?、]')
sentence = re.sub(pattern, '', paragraph)
im_show.save(img_out_dir / name)
wf.write(id + " " + sentence + "\n")
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(
description="Synthesize with fastspeech2 & parallel wavegan.")
parser.add_argument("--img-dir", default="imgs", type=str, help="img_dir.")
parser.add_argument(
"--output-dir",
type=str,
default="output",
help="output sentences path.")
parser.add_argument(
"--font-path", type=str, default="simfang.ttf", help="font path")
args = parser.parse_args()
paddle.set_device("gpu")
# need to run only once to download and load model into memory
ocr = PaddleOCR(use_angle_cls=True, lang='ch')
evaluate(args, ocr)
if __name__ == "__main__":
main()

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,50 @@
#!/bin/bash
source path.sh
gpus=0
stage=0
stop_stage=100
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
mkdir -p download
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# install PaddleOCR
pip install "paddleocr>=2.0.1"
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download pretrained tts models and unzip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# run ocr
CUDA_VISIBLE_DEVICES=${gpus} \
python3 ocr.py --img-dir=imgs --output-dir=output --font-path=simfang.ttf
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# run tts
CUDA_VISIBLE_DEVICES=${gpus} \
python3 ${BIN_DIR}/synthesize_e2e.py \
--fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=output/sentences.txt \
--output-dir=output/wavs \
--inference-dir=output/inference \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
# output/inference is not needed here, which save the static models
rm -rf output/inference
fi

Binary file not shown.

@ -0,0 +1,12 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,38 @@
#!/bin/bash
source path.sh
gpus=0
stage=0
stop_stage=100
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
mkdir -p download
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# download pretrained tts models and unzip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# run tts
CUDA_VISIBLE_DEVICES=${gpus} \
python3 style_syn.py \
--fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--fastspeech2-pitch-stat=download/fastspeech2_nosil_baker_ckpt_0.4/pitch_stats.npy \
--fastspeech2-energy-stat=download/fastspeech2_nosil_baker_ckpt_0.4/energy_stats.npy \
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=output \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi

@ -0,0 +1 @@
000 谁知青蛙一落地,竟变成了一位英俊的王子。于是遵照国王的意思,他做了公主的亲密伴侣。

@ -0,0 +1,210 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
def evaluate(args, fastspeech2_config, pwg_config):
# construct dataset for evaluation
sentences = []
with open(args.text, 'rt') as f:
for line in f:
utt_id, sentence = line.strip().split()
sentences.append((utt_id, sentence))
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"])
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
model.eval()
vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()
print("model done!")
frontend = Frontend(phone_vocab_path=args.phones_dict)
print("frontend done!")
stat = np.load(args.fastspeech2_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
fastspeech2_normalizer = ZScore(mu, std)
stat = np.load(args.pwg_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
fastspeech2_inference = StyleFastSpeech2Inference(
fastspeech2_normalizer, model, args.fastspeech2_pitch_stat,
args.fastspeech2_energy_stat)
fastspeech2_inference.eval()
pwg_inference = PWGInference(pwg_normalizer, vocoder)
pwg_inference.eval()
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
styles = ["normal", "robot", "1.2xspeed", "0.8xspeed", "child_voice"]
for style in styles:
robot = False
durations = None
durations_scale = None
durations_bias = None
pitch = None
pitch_scale = None
pitch_bias = None
energy = None
energy_scale = None
energy_bias = None
if style == "robot":
# all tones in phones be `1`
# all pitch should be the same, we use mean here
robot = True
if style == "1.2xspeed":
durations_scale = 1 / 1.2
if style == "0.8xspeed":
durations_scale = 1 / 0.8
if style == "child_voice":
pitch_scale = 1.3
sub_output_dir = output_dir / style
sub_output_dir.mkdir(parents=True, exist_ok=True)
for utt_id, sentence in sentences:
input_ids = frontend.get_input_ids(
sentence, merge_sentences=True, robot=robot)
phone_ids = input_ids["phone_ids"][0]
with paddle.no_grad():
mel = fastspeech2_inference(
phone_ids,
durations=durations,
durations_scale=durations_scale,
durations_bias=durations_bias,
pitch=pitch,
pitch_scale=pitch_scale,
pitch_bias=pitch_bias,
energy=energy,
energy_scale=energy_scale,
energy_bias=energy_bias,
robot=robot)
wav = pwg_inference(mel)
sf.write(
str(sub_output_dir / (utt_id + ".wav")),
wav.numpy(),
samplerate=fastspeech2_config.fs)
print(f"{style}_{utt_id} done!")
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(
description="Synthesize with fastspeech2 & parallel wavegan.")
parser.add_argument(
"--fastspeech2-config", type=str, help="fastspeech2 config file.")
parser.add_argument(
"--fastspeech2-checkpoint",
type=str,
help="fastspeech2 checkpoint to load.")
parser.add_argument(
"--fastspeech2-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
)
parser.add_argument(
"--fastspeech2-pitch-stat",
type=str,
help="mean and standard deviation used to normalize pitch when training fastspeech2"
)
parser.add_argument(
"--fastspeech2-energy-stat",
type=str,
help="mean and standard deviation used to normalize energy when training fastspeech2."
)
parser.add_argument(
"--pwg-config", type=str, help="parallel wavegan config file.")
parser.add_argument(
"--pwg-checkpoint",
type=str,
help="parallel wavegan generator parameters to load.")
parser.add_argument(
"--pwg-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
)
parser.add_argument(
"--phones-dict",
type=str,
default="phone_id_map.txt",
help="phone vocabulary file.")
parser.add_argument(
"--text",
type=str,
help="text to synthesize, a 'utt_id sentence' pair per line.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
args = parser.parse_args()
if args.ngpu == 0:
paddle.set_device("cpu")
elif args.ngpu > 0:
paddle.set_device("gpu")
else:
print("ngpu should >= 0 !")
with open(args.fastspeech2_config) as f:
fastspeech2_config = CfgNode(yaml.safe_load(f))
with open(args.pwg_config) as f:
pwg_config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(fastspeech2_config)
print(pwg_config)
evaluate(args, fastspeech2_config, pwg_config)
if __name__ == "__main__":
main()

@ -0,0 +1,11 @@
# TTS
中文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
```
英文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
```

@ -0,0 +1,43 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
parser.add_argument("--text", type=str, nargs='+')
parser.add_argument("--output_dir", type=str)
args = parser.parse_args()
# yapf: enable
if __name__ == '__main__':
paddle.set_device(args.device)
output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
if args.lang == 'zh':
t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
else:
t2s_model = hub.Module(
name='fastspeech2_ljspeech', output_dir=output_dir)
if isinstance(args.text, list):
args.text = ' '.join(args.text)
wavs = t2s_model.generate([args.text], device=args.device)
print('[T2S]Wav file has been generated: {}'.format(wavs[0]))

@ -0,0 +1,42 @@
#!/bin/bash
if python -c "import paddlehub" &> /dev/null; then
echo 'PaddleHub has already been installed.'
else
echo 'Installing PaddleHub...'
pip install paddlehub -U
fi
if [ $# != 2 -a $# != 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
if [ ${ngpu} == 0 ];then
device=cpu
else
device=gpu
fi
echo "using ${device}..."
text=$1
output_dir=$2
if [ $# == 3 ];then
lang=$3
else
lang=zh
fi
if [ ! -d $output_dir ];then
mkdir -p $output_dir
fi
python3 -u hub_infer.py \
--lang ${lang} \
--device ${device} \
--text \"${text}\" \
--output_dir ${output_dir}
exit 0

Before

Width:  |  Height:  |  Size: 72 KiB

After

Width:  |  Height:  |  Size: 72 KiB

@ -13,7 +13,7 @@ In addition, the training process and the testing process are also introduced.
The arcitecture of the model is shown in Fig.1.
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2onlineModel.png" width=800>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/ds2onlineModel.png" width=800>
<br/>Fig.1 The Arcitecture of deepspeech2 online model
</p>
@ -160,7 +160,7 @@ The deepspeech2 offline model is similarity to the deepspeech2 online model. The
The arcitecture of the model is shown in Fig.2.
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2offlineModel.png" width=800>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/ds2offlineModel.png" width=800>
<br/>Fig.2 The Arcitecture of deepspeech2 offline model
</p>

@ -0,0 +1,36 @@
# The Dependencies
## By apt-get
### The base dependencies:
```
bc flac jq vim tig tree pkg-config libsndfile1 libflac-dev libvorbis-dev libboost-dev swig python3-dev
```
### The dependencies of kenlm:
```
build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev gcc-5 g++-5
```
### The dependencies of sox:
```
libvorbis-dev libmp3lame-dev libmad-ocaml-dev
```
## By make or setup
```
kenlm
sox
mfa
openblas
kaldi
sctk
AutoLog
swig-decoder
python_kaldi_features
```

@ -23,7 +23,7 @@ Contents
.. toctree::
:maxdepth: 1
:caption: Speech-To-Text
:caption: Speech-to-Text
asr/models_introduction
asr/data_preparation
@ -33,7 +33,7 @@ Contents
.. toctree::
:maxdepth: 1
:caption: Text-To-Speech
:caption: Text-to-Speech
tts/basic_usage
tts/advanced_usage

@ -6,52 +6,38 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin
- Python >= 3.7
- PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))
## Setup (Important)
- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox`, and `swig`, e.g. installing them via `apt-get`:
```bash
sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
```
The version of `swig` should >= 3.0
or, installing them via `yum`:
```bash
sudo yum install pkgconfig libogg-devel libvorbis-devel boost-devel python3-devel
wget https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.1.tar.xz
xz -d flac-1.3.1.tar.xz
tar -xvf flac-1.3.1.tar
cd flac-1.3.1
./configure
make
make install
```
## Simple Setup
- Run the setup script for the remaining dependencies
For user who working on `Ubuntu` with `root` privilege.
```bash
```python
git clone https://github.com/PaddlePaddle/DeepSpeech.git
cd DeepSpeech
pushd tools; make virtualenv.done:; popd
source tools/venv/bin/activate
pip install -e .
```
- Source venv before do experiment.
For user who only needs the basic function of paddlespeech, using conda to do installing is recommended.
You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) to select a version and install it by yourself, or you can use the scripts below to install the last miniconda version.
```bash
source tools/venv/bin/activate
```python
pushd tools
bash extras/install_miniconda.sh
popd
bash
```
## Simple Setup
After installing the conda, run the setup.sh to complete the installing process.
```python
git clone https://github.com/PaddlePaddle/DeepSpeech.git
cd DeepSpeech
pip install -e .
bash setup.sh
```
## Setup (Other Platform)
- Make sure these libraries or tools in [dependencies](./dependencies.md) installed. More information please see: `setup.py `and ` tools/Makefile`.
- The version of `swig` should >= 3.0
- we will do more to simplify the install process.
## Running in Docker Container (optional)
Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed. This Docker image requires the support of NVIDIA GPU, so please make sure its availiability and the [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.

@ -1,14 +1,37 @@
# PaddleSpeech
## What is PaddleSpeech?
PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models.
PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS), with modules involving state-of-art and influential models.
## What can PaddleSpeech do?
### Speech-To-Text
(An introduce of ASR in PaddleSpeech is needed here!)
### Speech-to-Text
PaddleSpeech ASR mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
- Ready-to-run experiments.
PaddleSpeech ASR provides you with a complete ASR pipeline, including:
- Data Preparation
- Build vocabulary
- Compute Cepstral mean and variance normalization (CMVN)
- Featrue extraction
- linear
- fbank (also support kaldi feature)
- mfcc
- Acoustic Models
- Deepspeech2 (Streaming and Non-Streaming)
- Transformer (Streaming and Non-Streaming)
- Conformer (Streaming and Non-Streaming)
- Decoder
- ctc greedy search (used in DeepSpeech2, Transformer and Conformer)
- ctc beam search (used in DeepSpeech2, Transformer and Conformer)
- attention decoding (used in Transformer and Conformer)
- attention rescoring (used in Transformer and Conformer)
Speech-to-Text helps you training the ASR model very simply.
### Text-To-Speech
### Text-to-Speech
TTS mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
@ -30,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including:
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- GE2E
Text-To-Speech helps you to train TTS models with simple commands.
Text-to-Speech helps you to train TTS models with simple commands.

@ -1,6 +1,6 @@
# Reference
We borrowed a lot of code from these repos to build `model` and `engine`, thank for these great work:
We borrowed a lot of code from these repos to build `model` and `engine`, thank for these great work and opensource community!
* [espnet](https://github.com/espnet/espnet/blob/master/LICENSE)
- Apache-2.0 License

@ -1,15 +1,17 @@
# Released Models
## Speech-To-Text Models
## Speech-to-Text Models
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :-----------
[Ds2 Online Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/s0)
[Ds2 Offline Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/s0)
[Conformer Online Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1)
[Conformer Offline Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1)
[Conformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1)
[Transformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1)
[Transformer Librispeech S2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2)
### Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
@ -26,30 +28,31 @@ Language Model | Training Data | Token-based | Size | Descriptions
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
## Text-To-Speech Models
## Text-to-Speech Models
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)
Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)|||
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)|||
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
### Vocoders
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)
Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip.](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)
Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)|||
Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB|
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)|||
Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)|||
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)|||
|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB|
### Voice Cloning
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
:-------------:| :------------:| :-----: | :-----:
GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)

@ -642,8 +642,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
Multi-Speaker TTS
-------------------
PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generated by FastSpeech2 + ParallelWaveGAN, we use AISHELL-3 Multi-Speaker TTS dataset.
PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generated by FastSpeech2 + ParallelWaveGAN, we use AISHELL-3 Multi-Speaker TTS dataset. Each line is a different person.
.. raw:: html
@ -651,19 +650,381 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
<div class="table">
<table border="2" cellspacing="1" cellpadding="1">
<tr>
<th align="center"> Text </th>
<th align="center"> Origin </th>
<th align="center"> Target Timbre </th>
<th align="center"> Generated </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/0.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/0_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/1.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/1_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/2.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/2_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/3.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/3_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/4.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/4_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/5.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/5_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/6.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/6_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/7.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/7_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/8.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/8_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/9.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/9_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/10.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/10_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/11.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/11_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/12.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/12_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/13.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/13_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/14.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/14_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/15.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/15_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/16.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/16_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/17.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/17_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/18.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/18_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/19.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/19_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<table>
<div>
<br>
<br>
Duration control in FastSpeech2
Style control in FastSpeech2
--------------------------------------
In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``, we provide the audio demos of duration control here. ``duration`` means the duration of phonemes, when we reduce duration, the speed of audios will increase, and when we incerase ``duration``, the speed of audios will reduce.
In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``.
We provide the audio demos of duration control here. ``duration`` means the duration of phonemes, when we reduce duration, the speed of audios will increase, and when we incerase ``duration``, the speed of audios will reduce.
The ``duration`` of different phonemes in a sentence can have different scale ratios (when you want to slow down one word and keep the other words' speed in a sentence). Here we use a fixed scale ratio for different phonemes to control the ``speed`` of audios.
@ -892,6 +1253,174 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
<br>
<br>
We provide the audio demos of pitch control here.
When we set pitch of one sentence to a mean value and set ``tones`` of phones to ``1``, we will get a ``robot-style`` timbre.
When we raise the pitch of an adult female (with a fixed scale ratio), we will get a ``child-style`` timbre.
The ``pitch`` of different phonemes in a sentence can also have different scale ratios.
The nomal audios are in the second column of the previous table.
.. raw:: html
<div class="table">
<table border="2" cellspacing="1" cellpadding="1">
<tr>
<th align="center"> Robot </th>
<th align="center"> Child </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<table>
<div>
<br>
<br>
Chinese TTS with/without text frontend
--------------------------------------

@ -6,4 +6,4 @@ Model | Generator Loss |Discriminator Loss
Parallel Wave GAN| adversial loss <br> Feature Matching | Multi-Scale Discriminator |
Mel GAN |adversial loss <br> Multi-resolution STFT loss | adversial loss|
Multi-Band Mel GAN | adversial loss <br> full band Multi-resolution STFT loss <br> sub band Multi-resolution STFT loss |Multi-Scale Discriminator|
HiFi GAN |adversial loss <br> Feature Matching <br> Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminato |
HiFi GAN |adversial loss <br> Feature Matching <br> Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminator|

@ -27,14 +27,14 @@ At present, there are two mainstream acoustic model structures.
- Acoustic decoder (N Frames - > N Frames).
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/frame_level_am.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/frame_level_am.png" width=500 /> <br>
</div>
- Sequence to sequence acoustic model:
- M Tokens - > N Frames.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/seq2seq_am.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/seq2seq_am.png" width=500 /> <br>
</div>
### Tacotron2
@ -54,7 +54,7 @@ At present, there are two mainstream acoustic model structures.
- CBHG postprocess.
- Vocoder: Griffin-Lim.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron.png" width=700 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/tacotron.png" width=700 /> <br>
</div>
**Advantage of Tacotron:**
@ -89,10 +89,10 @@ At present, there are two mainstream acoustic model structures.
- The alignment matrix of previous time is considered at the step `t` of decoder.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron2.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/tacotron2.png" width=500 /> <br>
</div>
You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0).
You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0).
### TransformerTTS
**Disadvantages of the Tacotrons:**
@ -118,7 +118,7 @@ Transformer TTS is a combination of Tacotron2 and Transformer.
- Positional Encoding.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/transformer.png" width=500 /> <br>
</div>
#### Transformer TTS
@ -138,7 +138,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
- Uniform scale position encoding may have a negative impact on input or output sequences.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer_tts.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/transformer_tts.png" width=500 /> <br>
</div>
**Disadvantages of Transformer TTS:**
@ -146,7 +146,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
- The ability to perceive local information is weak, and local information is more related to pronunciation.
- Stability is worse than Tacotron2.
You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1).
You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1).
### FastSpeech2
@ -184,14 +184,14 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
• Can be generated in parallel (decoding time is less affected by sequence length)
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech.png" width=800 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/fastspeech.png" width=800 /> <br>
</div>
#### FastPitch
[FastPitch](https://arxiv.org/abs/2006.06873) follows FastSpeech. A single pitch value is predicted for every temporal location, which improves the overall quality of synthesized speech.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastpitch.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/fastpitch.png" width=500 /> <br>
</div>
#### FastSpeech2
@ -209,10 +209,10 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
FastSpeech2 is similar to FastPitch but introduces more variation information of speech.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech2.png" width=800 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/fastspeech2.png" width=800 /> <br>
</div>
You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3), We use token-averaged pitch and energy values introduced in FastPitch rather than frame level ones in FastSpeech2.
You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3), We use token-averaged pitch and energy values introduced in FastPitch rather than frame level ones in FastSpeech2.
### SpeedySpeech
[SpeedySpeech](https://arxiv.org/abs/2008.03802) simplify the teacher-student architecture of FastSpeech and provide a fast and stable training procedure.
@ -223,10 +223,10 @@ You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example
- Describe a simple data augmentation technique that can be used early in the training to make the teacher network robust to sequential error propagation.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/speedyspeech.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/speedyspeech.png" width=500 /> <br>
</div>
You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2).
You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2).
## Vocoders
In speech synthesis, the main task of the vocoder is to convert the spectral parameters predicted by the acoustic model into the final speech waveform.
@ -276,7 +276,7 @@ Here, we introduce a Flow-based vocoder WaveFlow and a GAN-based vocoder Paralle
- It is a small-footprint flow-based model for raw audio. It has only 5.9M parameters, which is 15x smalller than WaveGlow (87.9M).
- It is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in [Parallel WaveNet](https://arxiv.org/abs/1711.10433) and [ClariNet](https://openreview.net/pdf?id=HklY120cYm), which simplifies the training pipeline and reduces the cost of development.
You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0).
You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0).
### Parallel WaveGAN
[Parallel WaveGAN](https://arxiv.org/abs/1910.11480) trains a non-autoregressive WaveNet variant as a generator in a GAN based training method.
@ -289,7 +289,7 @@ You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examp
- Multi-resolution STFT loss.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/pwg.png" width=600 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/pwg.png" width=600 /> <br>
</div>
You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1).
You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1).

@ -18,7 +18,7 @@ The models in PaddleSpeech TTS have the following mapping relationship:
## Quick Start
Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. (./examples/csmsc/)(https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc)
Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. (./examples/csmsc/)(https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc)
### Train Parallel WaveGAN with CSMSC
- Go to directory

@ -1,5 +1,5 @@
# Chinese Rule Based Text Frontend
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS, see exapmle in [examples/other/text_frontend/](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/text_frontend).
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS, see exapmles in [examples/other/tn](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tn) and [examples/other/g2p](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/g2p).
A text frontend module mainly includes:
- Text Segmentation

@ -0,0 +1,407 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "piano-accent",
"metadata": {},
"source": [
"# Derivative of CTC Loss\n",
"\n",
"关于CTC的介绍已经有很多不错的教程了但是完整的描述CTCLoss的前向和反向过程的很少而且有些公式推导省略和错误。本文主要关注CTC Loss的梯度是如何计算的关于CTC的介绍这里不做过多赘述具体参看文末参考。\n",
"\n",
"CTC主要应用于语音和OCR中已语音[Deepspeech2](https://arxiv.org/abs/1512.02595)模型为例CTC的网络一般如下图所示包含softmax和CTCLoss两部分。反向传播需要求得loss L相对于logits $u^i$的梯度。下面先介绍CTCLoss的前向计算。\n",
"\n",
"> 图片来源于文末参考\n",
"\n",
"![img](./img/ctc_loss_backward_1.png)\n",
"\n",
"## CTC Loss 的计算\n",
"\n",
"CTC中path的定义与概率的计算如下\n",
"\n",
"<img src=\"./img/ctc_loss_prob_pi_x.png\" alt=\"image-20211104200811966\" style=\"zoom:50%;\" />\n",
"\n",
"path 是 $ L'^T$​​的元素,用 $ \\pi $​​表示。 $ \\textbf{x} $ 是输入特征,$\\textbf{y}$ 是输出label 都是序列。 $ L $ 是输出的 vocab, L 是 $ L \\cup {blank}$​​。 $y_{\\pi_{t}}^t$ 表示在t时刻$\\pi_{t}$ label时的观察概率。其中$\\pi_{t}$ 表示 $\\pi$ path在t时刻的label。$\\pi$ 是 $\\textbf{y}$ 与 $ \\textbf{x}$ 的一个alignment长度是$T$​​,取值空间为$L'$。path也称为alignment。\n",
"\n",
"公式2解释了给定输入 $\\textbf{x}$ ,输出 $ \\pi $ path 的概率即从时间t=1到T每个时间点的概率 $y_{\\pi_{t}}^t$ 相乘。\n",
"\n",
"求出单条path后就可以计算$p(l \\mid x)$ 的概率,计算如下:\n",
"\n",
"<img src=\"./img/ctc_loss_prob_l_x.png\" alt=\"image-20211104202358513\" style=\"zoom:50%;\" />\n",
"\n",
"这里边 $\\mathcal{B}$ 就是映射, 即所有多对一的映射many-to-one mapping )的集合。 这样就算出来对应一个真正的 label $\\textbf{l}$ 的概率了,这里是求和。 求和的原因就是 aab 和 abb 都是对应成ab, 所以 aab 的概率 + abb 的概率才是生成ab的概率。 \n",
"\n",
"公式3解释了给定输入 $\\mathbf{x}$ ,求输出$\\mathbf{l}$ 的概率, 即所有集合 $\\mathcal{B}^{-1} (\\mathbf{l})$ 中 path的概率和。\n",
"\n",
"### CTC forward-backward 算法\n",
"\n",
"CTC的优化采用算最大似然估计[MLE (maximum likelihood estimation)](https://en.wikipedia.org/wiki/Maximum_likelihood_estimation), 这个和神经网络本身的训练过程是一致的。\n",
"\n",
"这个CTC 计算过程类似HMM的 [forward-backward algorithm](https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm),下面就是这个算法的推导过程:\n",
"\n",
"<img src=\"./img/ctc_loss_alpha_definition.png\" alt=\"image-20211104203040307\" style=\"zoom:50%;\" />\n",
"\n",
"上图中的定义很清楚, 但是$ \\alpha_{t-1}(s) $ and $ \\alpha_{t-1}(s-1)$ 和 $\\alpha_t(s)$ 的关系也不那么好看出来,下图给出了具体的关于 $\\alpha_t(s)$ 的推导过程:\n",
"\n",
"<img src=\"./img/ctc_loss_alpha_recurse.png\" alt=\"image-20211108155714843\" style=\"zoom:50%;\" />\n",
"\n",
"<img src=\"./img/ctc_loss_alpha_recurse_2.png\" alt=\"image-20211109153011816\" style=\"zoom:50%;\" />\n",
"\n",
"这里的公式比较适合用下面的图来理解,$\\alpha_1(1)$ 其实对应的就是下图中左上角白色的圆圈。 就是上来第一个是blank 的概率, 而 $\\alpha_1(2)$是label l 的第一个字母。 这里边我们假设每个字母之间都插入了空白即label l扩展成l'例如l=[a, b, b, c] l'=[-, a, -, b, -, b, -, c, -]。 然后对于其他圆点在时间是1 的情况下概率都是 0. Figure 3中横轴是时间 t从左到右是1到T纵轴是ssequence从上到下是 1 到 $\\mathbf{\\mid l' \\mid}$.\n",
"\n",
"<img src=\"./img/ctc_loss_cat_lattice.png\" alt=\"image-20211108155918442\" style=\"zoom:50%;\" />\n",
"\n",
"接下来我们分析递归公式 (resursion),更多介绍可以参看 [2]. 公式6分情况考虑:\n",
"\n",
"* 第一种情况就是当前的label是blank 或者 $\\mathbf{l'}_{s}= \\mathbf{l'}_{s-2}$(相邻是重复字符)\n",
"\n",
" ![img](https://distill.pub/2017/ctc/assets/cost_no_skip.svg)\n",
"\n",
" 这个时候他的概率来自于过去t-1的两个label 概率, 也就是 $a_{t-1} (s)$ 和 $a_{t-1} (s-1)$ 。\n",
"\n",
" $ a_{t-1} (s)$ 就是说当前的 sequence 已经是s 了figure 3中表现为横跳 blank -->blank例如t=3, s=3\n",
"\n",
" 而 $a_{t-1} (s-1) $是说明当前的字符还不够, 需要再加一个, 所以在figure 3中就是斜跳从黑色圆圈到白色圆圈例如t=3, s=5。\n",
"\n",
" 仔细观察figure 3 除了第一排的白色圆圈, 其他白色圆圈都有两个输入, 就是上述的两种情况。 当然判断blank 的方法也可以是判断$I'_{s-2} = I'_{s}$. 这种情况也是说明$I'_{s}$ 是blank, 因为每一个字符必须用 blank 隔开, 即使是相同字符。\n",
"\n",
"* 第二章情况 也可以用类似逻辑得出, 只不过当前的状态s 是黑色圆圈, 有三种情况输入。\n",
"\n",
" ![img](https://distill.pub/2017/ctc/assets/cost_regular.svg)\n",
"\n",
"最终的概率就如公式8 所示, 这个计算过程就是 CTC forward algroirthm 基于 Fig. 3 的左边的初始条件。\n",
"\n",
"<img src=\"./img/ctc_loss_forward_loss.png\" alt=\"image-20211108162544982\" style=\"zoom:50%;\" />\n",
"\n",
"基于Fig. 3 右边的初始条件,我们还是可以计算出一个概率, 那个就是 **CTC backward**. 这里我就不详细介绍了, 直接截图。\n",
"\n",
"<img src=\"./img/ctc_loss_backward_recurse.png\" alt=\"image-20211108162859876\" style=\"zoom:50%;\" />\n",
"\n",
"这样一直做乘法, 数字值越来越小很快就会underflow。 这个时候就需要做 scaling.\n",
"\n",
"<img src=\"./img/ctc_loss_rescale_loss.png\" alt=\"image-20211108163526616\" style=\"zoom:50%;\" />\n",
"\n",
"算出了forward probability 和 backward probability 有什么用呢, 解释如下图。\n",
"\n",
"<img src=\"./img/ctc_loss_forward_backward.png\" alt=\"image-20211108164110404\" style=\"zoom:50%;\" />\n",
"\n",
"上图是说 forward probability and backward probability 的乘积, 代表了这个 sequence $\\mathbf{l}$ t时刻是s label 的 所有paths 的概率。 这样的话 我们就计算了 Fig. 3 中的每个圆圈的概率。为什么$\\alpha_t(s)\\beta_t(s)$ 中多出一个 $y^t_{\\mathbf{l'_s}}$ ,这是因为它在 $\\alpha$ 和 $\\beta$ 中都包含该项,合并公式后就多出一项。\n",
"\n",
"<img src=\"./img/ctc_loss_forward_backward_to_loss.png\" alt=\"image-20211109143104052\" style=\"zoom:50%;\" />\n",
"\n",
"$p(\\mathbf{l}|\\mathbf{x})$ 可以通过任意时刻 t 的所有 s 的 foward-backward 概率计算得来。取负对数后就是单个样本的NLLNegative Log Likelihood。\n",
"\n",
"### 总结\n",
"\n",
"总结一下根据前向概率计算CTCLoss函数可以得出如下结论\n",
"\n",
"1. 对于时序长度为T的输入序列x和输出序列z前向概率\n",
" $$\n",
" \\begin{split}\n",
" \\alpha_t(s) &= \\sum_{ \\underset{\\pi_t=l'_s}{\\pi \\in \\mathcal{B}^{-1}(z)} } p(\\pi_{1:t}|x) \\newline\n",
" \\alpha_1(1) &= y_{-}^1 ; \\quad \\alpha_1(2)=y^1_{l'_2}, \\quad \\alpha_1(s)=0, \\forall s > 2 \\newline\n",
" \\alpha_t(s) &= 0, \\quad \\forall s < |l'| - 2(T-t) - 1 ,\\quad \\text{or} \\quad \\forall s < 1 \\newline\n",
" \\alpha_t(s) &=\n",
" \\begin{cases}\n",
" (\\alpha_{t-1}(s) + \\alpha_{t-1}(s-1) ) y^t_{l'_s} & \\text{if $l'_s=b$ or $l'_{s-2} = l'_s$} \\newline\n",
" (\\alpha_{t-1}(s) + \\alpha_{t-1}(s-1) + \\alpha_{t-1}(s-2))y^t_{l'_s} & \\text{otherwise}\\newline\n",
" \\end{cases} \n",
" \\end{split}\n",
" $$\n",
"\n",
"2. 利用 $\\alpha_t(s)$ 计算CTCLoss\n",
" $$\n",
" -ln(p(l \\mid x)) = -ln(\\alpha_{T}(|l'|)+\\alpha_{T}(|l'|-1))\n",
" $$\n",
"\n",
"根据后向概率计算CTCLoss函数可以得出如下结论\n",
"\n",
"1. 对于时序长度为T的输入序列x和输出序列z后向概率 \n",
" $$\n",
" \\begin{split}\n",
" \\beta_t(s) &= \\sum_{ \\underset{\\pi_t=l'_s}{\\pi \\in \\mathcal{B}^{-1}(z)} } p(\\pi_{t:T}|x) \\newline\n",
" \\beta_T(|l'|) &= y_{-}^T ; \\quad \\beta_T(|l'|-1)=y^T_{l'_{|l'|-1}}, \\quad \\beta_T(s)=0, \\forall s < |l'| - 1 \\newline\n",
" \\beta_t(s) &= 0, \\text{$\\forall s > 2t$ or $\\forall s < |l'|$} \\newline\n",
" \\beta_t(s) &=\n",
" \\begin{cases}\n",
" (\\beta_{t+1}(s) + \\beta_{t+1}(s+1) ) y^t_{l'_s} & \\text{if $l'_s=b$ or $l'_{s+2} = l'_s$} \\newline\n",
" (\\beta_{t+1}(s) + \\beta_{t+1}(s+1) + \\beta_{t+1}(s+2))y^t_{l'_s} & \\text{otherwise}\\newline\n",
" \\end{cases}\n",
" \\end{split}\n",
" $$\n",
"\n",
" 2. 利用 $\\beta_t(s)$计算CTCLoss\n",
"\n",
"$$\n",
"-ln(p(l \\mid x)) = -ln(\\beta_{1}(1)+\\beta_{1}(2)) \\newline\n",
"$$\n",
"\n",
"根据任意时刻的前向概率和后向概率计算CTC Loss函数得到如下结论\n",
"\n",
"1. 对于任意时刻t利用前向概率和后向概率计算CTCLoss\n",
"\n",
"$$\n",
"p(l \\mid x) = \\sum_{s=1}^{|l'|} \\frac{\\alpha_t(s)\\beta_t(s)}{y_{l'_s}^t} \\newline\n",
"-ln(p(l \\mid x)) = -ln( \\sum_{s=1}^{|l'|} \\frac{\\alpha_t(s) \\beta_t(s)}{y_{l'_s}^t} )\n",
"$$\n",
"我们已经得到CTCLoss的计算方法接下来对其进行求导。\n"
]
},
{
"cell_type": "markdown",
"id": "viral-fitting",
"metadata": {},
"source": [
"## CTC梯度计算\n",
"\n",
"### 微分公式\n",
"\n",
"在计算梯度前,我们先回顾下基本的微分公式: \n",
"$$\n",
"C' = 0 \\\\\n",
"x' = 1 \\newline\n",
"x^n = n \\cdot x^{n-1} \\newline\n",
"(e^x)' = e^x \\newline\n",
"log(x)' = \\frac{1}{x} \\newline\n",
"(u + v)' = u' + v' \\newline\n",
"(\\frac{u}{v})' = \\frac{u'v-uv'}{v^2} \\newline\n",
"\\frac{\\mathrm{d}f(g(x))}{\\mathrm{d}x} = \\frac{\\mathrm{d}f(g(x))}{\\mathrm{d}g(x)} \\cdot \\frac{\\mathrm{d}g(x)}{\\mathrm{d}x}\n",
"$$\n"
]
},
{
"cell_type": "markdown",
"id": "starting-sender",
"metadata": {},
"source": [
"### CTC梯度\n",
"\n",
"最大似然估计训练就是最大化训练集中每一个分类的对数概率即最小化Eq. 12。\n",
"\n",
"<img src=\"./img/ctc_loss_gradient_of_y_hat.png\" alt=\"image-20211108164206136\" style=\"zoom:50%;\" />\n",
"\n",
"最后就是算微分了, 整个推导过程就是加法和乘法, 都可以微分。 $\\mathit{O}^{ML}$关于神经网络的输出 $y^t_k$的梯度见Eq. 13。因为训练样本是相互独立的所以可以单独考虑每个样本公式如Eq.13。\n",
"\n",
"下面是CTCLoss的梯度计算\n",
"\n",
"<img src=\"./img/ctc_loss_gradient_with_y.png\" alt=\"image-20211109143622448\" style=\"zoom:50%;\" />\n"
]
},
{
"cell_type": "markdown",
"id": "stretch-order",
"metadata": {},
"source": [
"### CTC梯度推导\n",
"\n",
"回顾下之前的公式,便于理解后续推导过程。 \n",
"\n",
"$$\n",
"p(l \\mid x) = \\sum_{s=1}^{|l'|} \\frac{\\alpha_t(s)\\beta_t(s)}{y_{l'_s}^t} \\\\\n",
"\\begin{equation}\n",
"\\alpha_t(s) \\beta_t(s) = \\sum_{ \\underset{\\pi_t=l'_s}{\\pi \\in \\mathcal{B}^{-1}(l):} } y^t_{l'_s} \\prod_{t=1}^T y^t_{\\pi_t}\n",
"\\end{equation}\n",
"$$\n",
"\n",
"其中Eq. 15的计算过程如下 \n",
"\n",
"$$\n",
"\\begin{align*}\n",
"\\frac{\\partial p(\n",
"l \\mid x)}{\\partial y_k^t}\n",
" & = \\sum_{s \\in lab(z,k)} \\frac{ \\partial \\frac{ \\alpha_t(s) \\beta_t(s)}{y_{k}^t}}{\\partial y_k^t} \n",
" \\newline\n",
" & = \\sum_{s \\in lab(z,k)} \\frac{(\\alpha_t(s)\\beta_t(s))y_k^t - \\alpha_t(s)\\beta_t(s){y_k^t}'}{{y_k^t}^2}\n",
" \\newline\n",
" &= \\sum_{s \\in lab(z,k)} \\frac{( \\prod_{t'=1}^{t-1} y^{t'}_{\\pi_{t'}} \\cdot y_k^t \\cdot y_k^t \\cdot \\prod_{t'=t+1}^{T} y^{t'}_{\\pi_{t'}} ) y_k^t - \\alpha_t(s)\\beta_t(s){y_k^t}'}{{y_k^t}^2}\n",
" \\newline\n",
" &= \\sum_{s \\in lab(z,k)} \\frac{2\\alpha_t(s)\\beta_t(s) - \\alpha_t(s)\\beta_t(s)}{{y_k^t}^2}\n",
" \\newline\n",
" &= \\sum_{s \\in lab(z,k)} \\frac{\\alpha_t(s)\\beta_t(s)}{{y_k^t}^2}\n",
" \\newline\n",
" &= \\frac{1}{{y_k^t}^2} \\sum_{s \\in lab(z,k)} \\alpha_t(s)\\beta_t(s) \\tag{1} \\newline\n",
"\\end{align*}\n",
"$$\n",
"\n",
"\n",
"NLL的公式推导如下\n",
"$$\n",
"\\begin{split}\n",
"\\frac{\\partial {ln(p(l \\mid x))} }{ \\partial y^t_k }\n",
" &= \\frac{1}{p(l \\mid x)} \\frac{ \\partial{p(l \\mid x)} }{ \\partial y_k^t } \\newline\n",
" &= \\frac{1}{p(l \\mid x) {y^t_k}^2 } \\sum_{s \\in lab(z,k)} \\alpha_t(s)\\beta_t(s) \n",
"\\end{split}\n",
"\\tag{2}\n",
"$$\n",
"\n",
"\n",
"已经算出了CTCLoss对于 $y_k^t$ 的梯度,接下来我们需要计算 CTCLoss对于$u^t_k$logits的梯度。套用链式法则并替换$y^t_k$ 为 $y^t_{k'}$​,结果如下图。图中 $k'$ 表示vocab中的某一个token$K$ 是vocab的大小。\n",
"\n",
"![](./img/ctc_loss_backward_2.png)\n",
"\n",
"图中公式4根据链式法则得到\n",
"$$\n",
"- \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial u^t_k }\n",
" = - \\sum_{k'=1}^{K} \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial y^t_{k'} } \\frac{ \\partial y^t_{k'} }{ \\partial u^t_k } \\tag{4}\n",
"$$\n",
"图中公式3是softmax的梯度参考 [4],计算过程如下:\n",
"$$\n",
"softmax(j) = S_j = \\frac{ e^{a_j} }{ \\sum_{k=1}^K e^{a_k} }, \\enspace \\forall j \\in 1 \\dots K\n",
"$$\n",
"\n",
"$$\n",
"\\begin{split}\n",
"\\frac{ \\partial S_i }{ \\partial a_j}\n",
" &= \\frac{ \\partial (\\frac{ e^{ a_i } }{ \\sum_k e^{ a_k } }) } { \\partial a_j }\n",
" \\newline\n",
" &= \n",
" \\begin{cases}\n",
" \t\\frac{ e^a_i \\sum - e^a_j e^a_i }{ \\sum^2 } \n",
" \t&= \\frac{ e^a_i }{ \\sum } \\frac{ \\sum - e^a_j }{ \\sum } \\newline\n",
" &= S_i(1-S_j) & \\text{i = j, $\\sum$ stands for $\\sum_{k=1}^K e^a_k$} \n",
" \t\\newline\n",
" \t\\frac{ 0 - e^a_j e^a_i }{ \\sum^2 } \n",
" \t&= - \\frac{ e^a_j }{ \\sum } \\frac{ e^a_i }{ \\sum } \\newline\n",
" &= -S_j S_i & \\text{i $\\neq$ j, $\\sum$ stands for $\\sum_{k=1}^K e^a_k$}\n",
" \\end{cases}\n",
" \\newline\n",
" &= \n",
" \\begin{cases}\n",
" S_i(1 - S_j) & \\text{$i = j$} \n",
" \\newline\n",
" -S_j S_i = S_i (0 - S_j) & \\text{$i \\neq j$}\n",
" \\end{cases}\n",
" \\newline\n",
" &= S_i (\\delta_{ij} - S_j )\n",
"\\end{split}\n",
"\\tag{3}\n",
"$$\n",
"$$\n",
"\\delta_{ij} =\n",
" \\begin{cases}\n",
" 1 & \\text{if i = j} \\newline\n",
" 0 & \\text{otherwise}\n",
" \\end{cases}\n",
"$$\n",
"\n",
"\n",
"\n",
"下图中黄色框中的部分表示公式1即遍历所有的vocab中的token其结果是$p(l \\mid x)$。这是因为label $l$ 中的token一定在vocab中且 $s \\in lab(l, k')$ 可以是空集。当 $k'$ 在 l 中s 则为label中token是$k'$​的概率;当$k'$不在l中s为空概率为0。\n",
"\n",
"![img](./img/ctc_loss_backward_3.png)\n",
"\n",
"公式23带入4并结合公式1的结果如上图右边\n",
"$$\n",
"\\begin{split}\n",
"- \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial u^t_k } &= \n",
"\t- \\sum_{k'=1}^K \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial y^t_{k'} } \\frac{ \\partial y^t_{k'}}{ \\partial u^t_k } \\newline\n",
"\t&= - \\sum_{k'=1}^K \\frac{ y^t_{k'}( \\delta_{kk'} - y^t_k ) }{ p(l \\mid x) {y^t_{k'}}^2 } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n",
"\t&= - \\sum_{k'=1}^K \\frac{ \\delta_{kk'} - y^t_k }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n",
"\t&= \\sum_{k'=1}^K \\frac{ y^t_k - \\delta_{kk'} }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n",
"\t&= \\sum_{k'=1}^K \\frac{ y^t }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) - \\sum_{k'=1}^K \\frac{ \\delta_{kk'} }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n",
"\t&= \\frac{ y^t_k }{ p(l \\mid x) } ( \\sum_{k'=1}^K \\frac{1}{y^t_{k'}} \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) ) - \\sum_{k'=1}^K \\frac{ \\delta_{kk'} }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n",
"\t&= \\frac{ y^t_k }{ p(l \\mid x) } p(l \\mid x) - \\sum_{k'=1}^K \\frac{ \\delta_{kk'} }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n",
"\t&= y^t_k - \\frac{ 1 }{ p(l \\mid x) y^t_k } \\sum_{s \\in lab(l, k)} \\alpha_t(s) \\beta_t(s) \\newline\n",
"\\end{split}\n",
"$$\n",
"最终为了通过softmax层传播CTCLoss的梯度需要计算目标函数与 logits $u^t_k$ 的偏微分即Eq. 16: \n",
" $$\n",
" \\begin{align*}\n",
" \\hat{\\alpha}_t(s) & \\overset{def}{=} \\frac{ \\alpha_t(s) }{ C_t } ,\\enspace C_t \\overset{def}{=} \\sum_s \\alpha_t(s) \n",
" \\newline\n",
" \\hat{\\beta}_t(s) & \\overset{def}{=} \\frac{ \\beta_t(s) }{ D_t } ,\\enspace D_t \\overset{def}{=} \\sum_s \\beta_t(s) \n",
" \\newline\n",
" - \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial u^t_k } &= y^t_k - \\frac{1}{y^t_k \\sum_{s=1}^{\\mid l' \\mid} \\frac{ \\hat{\\alpha}_t(s) \\hat{\\beta}_t(s) }{ y^t_{l'_s} } } \\sum_{s \\in lab(l, k)} \\hat{\\alpha}_t(s) \\hat{\\beta}_t(s) \\tag{16} \n",
" \\newline\n",
" \\end{align*}\n",
" $$"
]
},
{
"cell_type": "markdown",
"id": "informative-maria",
"metadata": {},
"source": [
"### 总结\n",
"\n",
"* 通过动态规划算法计算$\\alpha_t(s)$ 和 $\\beta_t(s)$\n",
"\n",
"* 通过$\\alpha_t(s)$ 计算 $p(l \\mid x)=\\alpha_T(\\mid l' \\mid) + \\alpha_T(\\mid l' \\mid -1)$\n",
"\n",
"* 通过$\\alpha_t(s)$ 和 $\\beta_t(s)$\n",
"\n",
"* 计算CTcLoss函数的导数: \n",
" $$\n",
" \\begin{split}\n",
" - \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial u^t_k } \n",
" &= y^t_k - \\frac{ 1 }{ p(l \\mid x) y^t_k } \\sum_{s \\in lab(l, k)} \\alpha_t(s) \\beta_t(s) \n",
" \\newline\n",
" &= y^t_k - \\frac{1}{y^t_k \\sum_{s=1}^{\\mid l' \\mid} \\frac{ \\hat{\\alpha}_t(s) \\hat{\\beta}_t(s) }{ y^t_{l'_s} } } \\sum_{s \\in lab(l, k)} \\hat{\\alpha}_t(s) \\hat{\\beta}_t(s) \n",
" \\newline\n",
" \\end{split}\n",
" \\tag{16}\n",
" $$"
]
},
{
"cell_type": "markdown",
"id": "coordinated-music",
"metadata": {},
"source": [
"## Reference\n",
"\n",
"[[1] A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber. Connectionist Temporal lassification: Labeling Unsegmented Sequence Data with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)\n",
"\n",
"[[2] Sequence ModelingWith CTC](https://distill.pub/2017/ctc/)\n",
"\n",
"[[3] NLP 之 CTC Loss 的工作原理](https://www.jianshu.com/p/e073c9d91b20)\n",
"\n",
"[[4] The Softmax function and its derivative](https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/)\n",
"\n",
"[[5] CTC Algorithm Explained Part 1Training the NetworkCTC算法详解之训练篇](https://xiaodu.io/ctc-explained/)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "closing-candy",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 162 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 66 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 51 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 115 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 113 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 223 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 140 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 108 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 126 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 117 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 108 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 224 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 581 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 368 KiB

@ -0,0 +1,958 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Story Talker"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 用 PaddleOCR 识别图片中的文字"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from PIL import Image\n",
"img_path = 'source/frog_prince.jpg'\n",
"im = Image.open(img_path)\n",
"im.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用 TTS 合成的音频"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import IPython.display as dp\n",
"dp.Audio(\"source/ocr.wav\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font size=4>具体实现代码请参考: https://github.com/DeepSpeech/demos/story_talker/run.sh<font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 元宇宙来袭,构造你的虚拟人!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用 PaddleGAN 合成的唇形视频"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from IPython.display import HTML\n",
"html_str = '''\n",
"<video controls width=\"650\" height=\"365\" src=\"{}\">animation</video>\n",
"'''.format(\"output/tts_lips.mp4\")\n",
"dp.display(HTML(html_str))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font size=4>具体实现代码请参考: https://github.com/DeepSpeech/demos/metaverse/run.sh<font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 前言\n",
"<br></br>\n",
"近年来,随着深度学习算法上的进步以及不断丰厚的硬件资源条件,**文本转语音Text-To-Speech, TTS** 技术在智能语音助手、虚拟娱乐等领域得到了广泛的应用。本教程将结合背景知识让用户能够使用PaddlePaddle完成文本转语音任务并结合光学字符识别Optical Character RecognitionOCR、自然语言处理Natural Language ProcessingNLP等技术“听”书、让名人开口说话。\n",
"\n",
"<br></br>\n",
"## 背景知识\n",
"<br></br>\n",
"为了更好地了解文本转语音任务的要素,我们先简要地回顾一下文本转语音的发展历史。如果你对此已经有所了解,或希望能尽快使用代码实现,请跳至第二章。\n",
"\n",
"<br></br>\n",
"### 定义\n",
"<br></br>\n",
"<!----\n",
"Note: \n",
"1.此句抄自 [李沐Dive into Dive Learning](https://zh-v2.d2l.ai/chapter_introduction/index.html)\n",
"2.修改参考A survey on Neural Speech Sysnthesis.\n",
"---> \n",
"<font size=4> 文本转语音又称语音合成Speech Sysnthesis指的是将一段文本按照一定需求转化成对应的音频这种特性决定了的输出数据比输入输入长得多。文本转语音是一项包含了语义学、声学、数字信号处理以及机器学习的等多项学科的交叉任务。虽然辨识低质量音频文件的内容对人类来说很容易但这对计算机来说并非易事。\n",
"</font>\n",
"\n",
"> Note: 这里可以提供一下资料出处嘛? 2021/11/09\n",
"<br></br>\n",
"\n",
"按照不同的应用需求,更广义的语音合成研究包括:\n",
"- <font size=4>语音转换Voice Transformation/Conversion</font>\n",
" - 说话人转换\n",
" - 语音到歌唱转换Speech to Singing\n",
" - 语音情感转换\n",
" - 口音转换\n",
"- <font size=4>歌唱合成 Singing Synthesis</font>\n",
" - <font size=4>歌词到歌唱转换Text/Lyric to Singing</font>\n",
"- <font size=4>可视语音合成Visual Speech Synthesis</font>\n",
"\n",
"<br></br>\n",
"### 发展历史\n",
"<br></br>\n",
"<!--\n",
"以下摘自维基百科 https://en.wikipedia.org/wiki/Speech_synthesis\n",
"--->\n",
"#### 机械式语音合成19世纪及以前\n",
"在第二次工业革命之前语音的合成主要以机械式的音素合成为主。1779年德裔丹麦科学家 Christian Gottlieb Kratzenstein 建造了人类的声道模型使其可以产生五个长元音。1791年 Wolfgang von Kempelen 添加了唇和舌的模型,使其能够发出辅音和元音。\n",
"#### 电子语音合成20世纪30年代\n",
"贝尔实验室于20世纪30年代发明了声码器Vocoder将语音自动分解为音调和共振此项技术由 Homer Dudley 改进为键盘式合成器并于 1939年纽约世界博览会展出。\n",
"#### 电子语音合成\n",
"第一台基于计算机的语音合成系统起源于 20 世纪 50 年代。1961 年IBM 的 John Larry Kelly以及 Louis Gerstman 使用 IBM 704 计算机合成语音,成为贝尔实验室最著名的成就之一。 1975年第一代语音合成系统之一 —— MUSAMUltichannel Speaking Automation问世其由一个独立的硬件和配套的软件组成。1978年发行的第二个版本也可以进行无伴奏演唱。90 年代的主流是采用 MIT 和贝尔实验室的系统,并结合自然语言处理模型。\n",
"> Note: 这里插一张timeline图\n",
"#### 当前的主流方法\n",
"\n",
"- <font size=4>基于统计参数的语音合成</font>\n",
" - <font size=4>隐马尔可夫模型Hidden Markov Model,HMM</font>\n",
" - <font size=4>深度学习网络Deep Neural NetworkDNN</font>\n",
"- <font size=4>波形拼接语音合成</font>\n",
" \n",
"- <font size=4>混合方法</font>\n",
" - <font size=4>参数轨迹指导的波形拼接</font>\n",
"- <font size=4>端到端神经网络语音合成</font>\n",
" - <font size=4>声学模型 + 声码器</font>\n",
" - <font size=4>“完全”端到端方法</font>\n",
"\n",
"<br></br>\n",
"## 基于深度学习的语音合成技术\n",
"<br></br>\n",
"### 语音合成基本知识\n",
"<br></br>\n",
"![信号处理流水线](source/signal_pipeline.png)\n",
"<br></br>\n",
"<font size=4>语音合成流水线包含 <font color=\"#ff0000\">**文本前端Text Frontend**</font> 、<font color=\"#ff0000\">**声学模型Acoustic Model**</font> 和 <font color=\"#ff0000\">**声码器Vocoder**</font> 三个主要模块:</font>\n",
"- <font size=4>通过文本前端模块将原始文本转换为字符/音素。</font>\n",
"- <font size=4>通过声学模型将字符/音素转换为声学特征如线性频谱图、mel 频谱图、LPC 特征等。</font>\n",
"- <font size=4>通过声码器将声学特征转换为波形。</font>\n",
"<br></br>\n",
"<img style=\"float: center;\" src=\"source/tts_pipeline.png\" width=\"85%\"/>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 实践\n",
"<br></br>\n",
"<font size=4>环境安装请参考: https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md</font>\n",
"\n",
"<br></br>\n",
"\n",
"<font size=4>使用 **PaddleSpeech** 提供的预训练模型合成一句中文。</font>\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step 0 准备"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 获取预训练模型"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!mkdir download\n",
"!wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip\n",
"!unzip -d download download/pwg_baker_ckpt_0.4.zip\n",
"!wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip\n",
"!unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 查看预训练模型的结构"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!tree download/pwg_baker_ckpt_0.4\n",
"!tree download/fastspeech2_nosil_baker_ckpt_0.4"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 导入 Python 包"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import logging\n",
"import sys\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"# PaddleSpeech 项目根目录放到 python 路径中\n",
"sys.path.insert(0,\"../../../\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import argparse\n",
"import os\n",
"from pathlib import Path\n",
"import IPython.display as dp\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import paddle\n",
"import soundfile as sf\n",
"import yaml\n",
"from paddlespeech.t2s.frontend.zh_frontend import Frontend\n",
"from paddlespeech.t2s.models.fastspeech2 import FastSpeech2\n",
"from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference\n",
"from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator\n",
"from paddlespeech.t2s.models.parallel_wavegan import PWGInference\n",
"from paddlespeech.t2s.modules.normalizer import ZScore\n",
"from yacs.config import CfgNode"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 设置预训练模型的路径"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"fastspeech2_config = \"download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml\"\n",
"fastspeech2_checkpoint = \"download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz\"\n",
"fastspeech2_stat = \"download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy\"\n",
"pwg_config = \"download/pwg_baker_ckpt_0.4/pwg_default.yaml\"\n",
"pwg_checkpoint = \"download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz\"\n",
"pwg_stat = \"download/pwg_baker_ckpt_0.4/pwg_stats.npy\"\n",
"phones_dict = \"download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt\"\n",
"# 读取 conf 文件并结构化\n",
"with open(fastspeech2_config) as f:\n",
" fastspeech2_config = CfgNode(yaml.safe_load(f))\n",
"with open(pwg_config) as f:\n",
" pwg_config = CfgNode(yaml.safe_load(f))\n",
"print(\"========Config========\")\n",
"print(fastspeech2_config)\n",
"print(\"---------------------\")\n",
"print(pwg_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step1 文本前端\n",
"<br></br>\n",
"\n",
"<font size=4>一个文本前端模块主要包含</font>:\n",
"- <font size=4>分段Text Segmentation</font>\n",
"\n",
"- <font size=4>文本正则化Text Normalization, TN</font>\n",
"\n",
"- <font size=4>分词Word Segmentation, 主要是在中文中)</font>\n",
"\n",
"- <font size=4>词性标注Part-of-Speech, PoS</font>\n",
"- <font size=4>韵律预测Prosody</font>\n",
"- <font size=4>字音转换Grapheme-to-PhonemeG2P</font>\n",
"<br></br>\n",
"<font size=2>Grapheme: **语言**书写系统的最小有意义单位; Phoneme: 区分单词的最小**语音**单位)</font>\n",
" - <font size=4>多音字Polyphone</font>\n",
" - <font size=4>变调Tone Sandhi</font>\n",
" - <font size=4>“一”、“不”变调</font>\n",
" - <font size=4>三声变调</font>\n",
" - <font size=4>轻声变调</font>\n",
" - <font size=4>儿化音</font>\n",
" - <font size=4>方言</font>\n",
"- ...\n",
"<br></br>\n",
"\n",
"<font size=4>(输入给声学模型之前,还需要把音素序列转换为 id</font>\n",
"\n",
"<br></br>\n",
"<font size=4>其中最重要的模块是<font color=\"#ff0000\"> 文本正则化 </font>模块和<font color=\"#ff0000\"> 字音转换TTS 中更常用 G2P代指 </font>模块。</font>\n",
"\n",
"<br></br>\n",
"\n",
"<font size=4>各模块输出示例:</font>\n",
"```text\n",
"• Text: 全国一共有112所211高校\n",
"• Text Normalization: 全国一共有一百一十二所二一一高校\n",
"• Word Segmentation: 全国/一共/有/一百一十二/所/二一一/高校/\n",
"• G2P注意此句中“一”的读音:\n",
" quan2 guo2 yi2 gong4 you3 yi4 bai3 yi1 shi2 er4 suo3 er4 yao1 yao1 gao1 xiao4\n",
" (可以进一步把声母和韵母分开)\n",
" q uan2 g uo2 y i2 g ong4 y ou3 y i4 b ai3 y i1 sh i2 er4 s uo3 er4 y ao1 y ao1 g ao1 x iao4\n",
" (把音调和声韵母分开)\n",
" q uan g uo y i g ong y ou y i b ai y i sh i er s uo er y ao y ao g ao x iao\n",
" 0 2 0 2 0 2 0 4 0 3 ...\n",
"• Prosody (prosodic words #1, prosodic phrases #2, intonation phrases #3, sentence #4):\n",
" 全国#2一共有#2一百#1一十二所#2二一一#1高校#4\n",
" (分词的结果一般是固定的,但是不同人习惯不同,可能有不同的韵律)\n",
"```\n",
"\n",
"<br></br>\n",
"<font size=4>文本前端模块的设计需要融入很多专业的或经验性的知识,人类在读文本的时候可以自然而然地读出正确的发音,但是这些计算机都是不知道的!</font>\n",
"\n",
"<br></br>\n",
"<font size=4>分词:</font>\n",
"```text\n",
"我也想过过过儿过过的生活\n",
"我也想/过过/过儿/过过的/生活\n",
"\n",
"货拉拉拉不拉拉布拉多\n",
"货拉拉/拉不拉/拉布拉多\n",
"\n",
"南京市长江大桥\n",
"南京市长/江大桥\n",
"南京市/长江大桥\n",
"```\n",
"<font size=4>变调和儿化音:</font>\n",
"```\n",
"你要不要和我们一起出去玩?\n",
"你要不2声要和我们一4声起出去玩\n",
"\n",
"不好,我要一个人出去。\n",
"不4声我要一2声个人出去。\n",
"\n",
"(以下每个词的所有字都是三声的,请你读一读,体会一下在读的时候,是否每个字都被读成了三声?)\n",
"纸老虎、虎骨酒、展览馆、岂有此理、手表厂有五种好产品\n",
"```\n",
"<font size=4>多音字(通常需要先正确分词):</font>\n",
"```text\n",
"人要行,干一行行一行,一行行行行行;\n",
"人要是不行,干一行不行一行,一行不行行行不行。\n",
"\n",
"佟大为妻子产下一女\n",
"\n",
"海水朝朝朝朝朝朝朝落\n",
"浮云长长长长长长长消\n",
"```\n",
"<br></br>\n",
"\n",
"<font size=4>PaddleSpeech TTS 文本前端解决方案:</font>\n",
"- <font size=4>文本正则: 规则</font>\n",
"- <font size=4>G2P:</font>\n",
" - <font size=4>多音字模块: pypinyin/g2pM</font>\n",
" - <font size=4>变调模块: 用分词 + 规则</font>\n",
"\n",
"<br></br>\n",
"<font size=4>相关 examples:\n",
" \n",
"https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tn\n",
"https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/g2p</font>\n",
"\n",
"<br></br>\n",
"<font size=4>(未来计划推出基于深度学习的文本前端模块)</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 构造文本前端对象"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# 传入 phones_dict 会把相应的 phones 转换成 phone_ids\n",
"frontend = Frontend(phone_vocab_path=phones_dict)\n",
"print(\"Frontend done!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 调用文本前端"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"input = \"你好,欢迎使用百度飞桨框架进行深度学习研究!\"\n",
"# input = \"我每天中午12:00起床\"\n",
"# input = \"我出生于2005/11/08那天的最低气温达到-10°C\"\n",
"# text norm 时会进行分句merge_sentences 表示把分句的结果合成一条\n",
"# 可以把 merge_sentences 设置为 False, 多个子句并行调用声学模型和声码器提升合成速度\n",
"input_ids = frontend.get_input_ids(input, merge_sentences=True, print_info=True)\n",
"# 由于 merge_sentences=True, input_ids[\"phone_ids\"][0] 即表示整句的 phone_ids\n",
"phone_ids = input_ids[\"phone_ids\"][0]\n",
"print(\"phone_ids:\")\n",
"print(phone_ids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step1+ 文本前端深度学习化\n",
"<br></br>\n",
"<img style=\"float: center;\" src=\"source/text_frontend_struct.png\" width=\"100%\"/>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step2 声学模型\n",
"<br></br>\n",
"<font size=4>声学模型将字符/音素转换为声学特征如线性频谱图、mel 频谱图、LPC 特征等,声学特征以 “帧” 为单位,一般一帧是 10ms 左右,一个音素一般对应 5~20 帧左右, 声学模型需要解决的是 <font color=\"#ff0000\">“不等长序列间的映射问题”</font>,“不等长”是指,同一个人发不同音素的持续时间不同,同一个人在不同时刻说同一句话的语速可能不同,对应各个音素的持续时间不同,不同人说话的特色不同,对应各个音素的持续时间不同。这是一个困难的“一对多”问题。</font>\n",
"```\n",
"# 卡尔普陪外孙玩滑梯\n",
"000001|baker_corpus|sil 20 k 12 a2 4 er2 10 p 12 u3 12 p 9 ei2 9 uai4 15 s 11 uen1 12 uan2 14 h 10 ua2 11 t 15 i1 16 sil 20\n",
"```\n",
"\n",
"<font size=4>声学模型主要分为自回归模型和非自回归模型,其中自回归模型在 `t` 时刻的预测需要依赖 `t-1` 时刻的输出作为输入,预测时间长,但是音质相对较好,非自回归模型不存在预测上的依赖关系,预测时间快,音质相对较差。</font>\n",
"\n",
"<br></br>\n",
"<font size=4>主流声学模型发展的脉络:</font>\n",
"- <font size=4>自回归模型:</font>\n",
" - <font size=4>Tacotron</font>\n",
" - <font size=4>Tacotron2</font>\n",
" - <font size=4>Transformer TTS</font>\n",
"- <font size=4>非自回归模型:</font>\n",
" - <font size=4>FastSpeech</font>\n",
" - <font size=4>SpeedySpeech</font>\n",
" - <font size=4>FastPitch</font>\n",
" - <font size=4>FastSpeech2</font>\n",
" - ...\n",
" \n",
"<br></br>\n",
"<font size=4>在本教程中,我们使用 `FastSpeech2` 作为声学模型。<font>\n",
"![FastSpeech2](source/fastspeech2.png)\n",
"<font size=4>PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似)。<font>\n",
"![FastPitch](source/fastpitch.png)\n",
"<font size=4>更多关于声学模型的发展及改进的介绍: https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html<font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 初始化声学模型 FastSpeech2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with open(phones_dict, \"r\") as f:\n",
" phn_id = [line.strip().split() for line in f.readlines()]\n",
"vocab_size = len(phn_id)\n",
"print(\"vocab_size:\", vocab_size)\n",
"odim = fastspeech2_config.n_mels\n",
"model = FastSpeech2(\n",
" idim=vocab_size, odim=odim, **fastspeech2_config[\"model\"])\n",
"# 预训练好的参数赋值给模型\n",
"model.set_state_dict(paddle.load(fastspeech2_checkpoint)[\"main_params\"])\n",
"# 推理阶段不启用 batch norm 和 dropout\n",
"model.eval()\n",
"# 读取数据预处理阶段数据集的均值和标准差\n",
"stat = np.load(fastspeech2_stat)\n",
"mu, std = stat\n",
"mu = paddle.to_tensor(mu)\n",
"std = paddle.to_tensor(std)\n",
"fastspeech2_normalizer = ZScore(mu, std)\n",
"# 构造包含 normalize 的新模型\n",
"fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)\n",
"fastspeech2_inference.eval()\n",
"print(\"FastSpeech2 done!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 调用声学模型"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with paddle.no_grad():\n",
" mel = fastspeech2_inference(phone_ids)\n",
"print(\"shepe of mel (n_frames x n_mels):\")\n",
"print(mel.shape)\n",
"# 绘制声学模型输出的 mel 频谱\n",
"fig, ax = plt.subplots(figsize=(9, 6))\n",
"im = ax.imshow(mel.T, aspect='auto',origin='lower')\n",
"fig.colorbar(im, ax=ax)\n",
"plt.title('Mel Spectrogram')\n",
"plt.xlabel('Time')\n",
"plt.ylabel('Frequency')\n",
"plt.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step3 声码器\n",
"<br></br>\n",
"<font size=4>声码器将声学特征转换为波形。声码器需要解决的是 <font color=\"#ff0000\">“信息缺失的补全问题”</font>。信息缺失是指,在音频波形转换为频谱图的时候,存在**相位信息**的缺失,在频谱图转换为 mel 频谱图的时候,存在**频域压缩**导致的信息缺失假设音频的采样率是16kHZ, 一帧的音频有 10ms也就是说1s 的音频有 16000 个采样点,而 1s 中包含 100 帧,每一帧有 160 个采样点,声码器的作用就是将一个频谱帧变成音频波形的 160 个采样点,所以声码器中一般会包含**上采样**模块。<font>\n",
" \n",
"<br></br>\n",
"<font size=4>与声学模型类似,声码器也分为自回归模型和非自回归模型, 更细致的分类如下:<font>\n",
"\n",
"- <font size=4>Autoregression<font>\n",
" - <font size=4>WaveNet<font>\n",
" - <font size=4>WaveRNN<font>\n",
" - <font size=4>LPCNet<font>\n",
"- <font size=4>Flow<font>\n",
" - <font size=4>WaveFlow<font>\n",
" - <font size=4>WaveGlow<font>\n",
" - <font size=4>FloWaveNet<font>\n",
" - <font size=4>Parallel WaveNet<font>\n",
"- <font size=4>GAN<font>\n",
" - <font size=4>WaveGAN<font>\n",
" - <font size=4>arallel WaveGAN<font>\n",
" - <font size=4>MelGAN<font>\n",
" - <font size=4>HiFi-GAN<font>\n",
"- <font size=4>VAE\n",
" - <font size=4>Wave-VAE<font>\n",
"- <font size=4>Diffusion<font>\n",
" - <font size=4>WaveGrad<font>\n",
" - <font size=4>DiffWave<font>\n",
"\n",
"<br></br>\n",
"<font size=4>PaddleSpeech TTS 主要实现了百度的 `WaveFlow` 和一些主流的 GAN Vocoder, 在本教程中,我们使用 `Parallel WaveGAN` 作为声码器。<font>\n",
"\n",
"<br></br> \n",
"<img style=\"float: center;\" src=\"source/pwgan.png\" width=\"75%\"/> \n",
"\n",
"<br></br>\n",
"<font size=4>各 GAN Vocoder 的生成器和判别器的 Loss 的区别如下表格所示:<font>\n",
" \n",
"Model | Generator Loss |Discriminator Loss\n",
":-------------:| :------------:| :-----\n",
"Parallel Wave GAN| adversial loss <br> Feature Matching | Multi-Scale Discriminator |\n",
"Mel GAN |adversial loss <br> Multi-resolution STFT loss | adversial loss|\n",
"Multi-Band Mel GAN | adversial loss <br> full band Multi-resolution STFT loss <br> sub band Multi-resolution STFT loss |Multi-Scale Discriminator|\n",
"HiFi GAN |adversial loss <br> Feature Matching <br> Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminator|\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 初始化声码器 Parallel WaveGAN"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"vocoder = PWGGenerator(**pwg_config[\"generator_params\"])\n",
"# 预训练好的参数赋值给模型\n",
"vocoder.set_state_dict(paddle.load(pwg_checkpoint)[\"generator_params\"])\n",
"vocoder.remove_weight_norm()\n",
"# 推理阶段不启用 batch norm 和 dropout\n",
"vocoder.eval()\n",
"# 读取数据预处理阶段数据集的均值和标准差\n",
"stat = np.load(pwg_stat)\n",
"mu, std = stat\n",
"mu = paddle.to_tensor(mu)\n",
"std = paddle.to_tensor(std)\n",
"pwg_normalizer = ZScore(mu, std)\n",
"# 构造包含 normalize 的新模型\n",
"pwg_inference = PWGInference(pwg_normalizer, vocoder)\n",
"pwg_inference.eval()\n",
"print(\"Parallel WaveGAN done!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 调用声码器"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"with paddle.no_grad():\n",
" wav = pwg_inference(mel)\n",
"print(\"shepe of wav (time x n_channels):\")\n",
"print(wav.shape)\n",
"# 绘制声码器输出的波形图\n",
"wave_data = wav.numpy().T\n",
"time = np.arange(0, wave_data.shape[1]) * (1.0 / fastspeech2_config.fs)\n",
"fig, ax = plt.subplots(figsize=(9, 6))\n",
"plt.plot(time, wave_data[0])\n",
"plt.title('Waveform')\n",
"plt.xlabel('Time (seconds)')\n",
"plt.ylabel('Amplitude (normed)')\n",
"plt.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 播放音频"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"dp.Audio(wav.numpy().T, rate=fastspeech2_config.fs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 保存音频"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!mkdir output\n",
"sf.write(\n",
" \"output/output.wav\",\n",
" wav.numpy(),\n",
" samplerate=fastspeech2_config.fs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step4 FastSpeech2 进阶 —— 个性化调节\n",
"<br></br>\n",
"<font size=3>FastSpeech2 模型可以个性化地调节音素时长、音调和能量,通过一些简单的调节就可以获得一些有意思的效果<font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 不要听信别人的谗言,我不是什么克隆人。\n",
"print(\"原始音频\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_001.wav\"))\n",
"print(\"speed x 1.2\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_001.wav\"))\n",
"print(\"speed x 0.8\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_001.wav\"))\n",
"print(\"pitch x 1.3(童声)\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/001.wav\"))\n",
"print(\"robot\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/001.wav\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font size=4>具体实现代码请参考: https://github.com/DeepSpeech/demos/style_fs2/run.sh<font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<br></br>\n",
"# 用 PaddleSpeech 训练 TTS 模型\n",
"<br></br>\n",
"<font size=3>PaddleSpeech 的 examples 是按照 数据集/模型 的结构安排的:<font>\n",
"```text\n",
"examples \n",
"|-- aishell3\n",
"| |-- README.md\n",
"| |-- tts3\n",
"| `-- vc0\n",
"|-- csmsc\n",
"| |-- README.md\n",
"| |-- tts2\n",
"| |-- tts3\n",
"| |-- voc1\n",
"| `-- voc3\n",
"```\n",
"<font size=3>我们在每个数据集的 README.md 介绍了子目录和模型的对应关系, 在 TTS 中有如下对应关系:<font>\n",
"```text\n",
"tts0 - Tactron2\n",
"tts1 - TransformerTTS\n",
"tts2 - SpeedySpeech\n",
"tts3 - FastSpeech2\n",
"voc0 - WaveFlow\n",
"voc1 - Parallel WaveGAN\n",
"voc2 - MelGAN\n",
"voc3 - MultiBand MelGAN\n",
"```\n",
"<br></br>\n",
"## 基于 CSMCS 数据集训练 FastSpeech2 模型\n",
"```bash\n",
"git clone https://github.com/PaddlePaddle/PaddleSpeech.git\n",
"cd examples/csmsc/tts\n",
"```\n",
"<font size=3>根据 README.md, 下载 CSMCS 数据集和其对应的强制对齐文件, 并放置在对应的位置<font>\n",
"```bash\n",
"./run.sh\n",
"```\n",
"<font size=3>`run.sh` 中包含预处理、训练、合成、静态图推理等步骤:</font>\n",
"\n",
"```bash\n",
"#!/bin/bash\n",
"set -e\n",
"source path.sh\n",
"gpus=0,1\n",
"stage=0\n",
"stop_stage=100\n",
"conf_path=conf/default.yaml\n",
"train_output_path=exp/default\n",
"ckpt_name=snapshot_iter_153.pdz\n",
"\n",
"# with the following command, you can choice the stage range you want to run\n",
"# such as `./run.sh --stage 0 --stop-stage 0`\n",
"# this can not be mixed use with `$1`, `$2` ...\n",
"source ${MAIN_ROOT}/utils/parse_options.sh || exit 1\n",
"\n",
"if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then\n",
" # prepare data\n",
" bash ./local/preprocess.sh ${conf_path} || exit -1\n",
"fi\n",
"if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then\n",
" # train model, all `ckpt` under `train_output_path/checkpoints/` dir\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1\n",
"fi\n",
"if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then\n",
" # synthesize, vocoder is pwgan\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1\n",
"fi\n",
"if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then\n",
" # synthesize_e2e, vocoder is pwgan\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1\n",
"fi\n",
"if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then\n",
" # inference with static model\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1\n",
"fi\n",
"```\n",
"<br></br>\n",
"## 基于 CSMCS 数据集训练 Parallel WaveGAN 模型\n",
"```bash\n",
"git clone https://github.com/PaddlePaddle/PaddleSpeech.git\n",
"cd examples/csmsc/voc1\n",
"```\n",
"<font size=3>根据 README.md, 下载 CSMCS 数据集和其对应的强制对齐文件, 并放置在对应的位置<font>\n",
"```bash\n",
"./run.sh\n",
"```\n",
"<font size=3>`run.sh` 中包含预处理、训练、合成等步骤:</font>\n",
"```bash\n",
"#!/bin/bash\n",
"set -e\n",
"source path.sh\n",
"gpus=0,1\n",
"stage=0\n",
"stop_stage=100\n",
"conf_path=conf/default.yaml\n",
"train_output_path=exp/default\n",
"ckpt_name=snapshot_iter_5000.pdz\n",
"\n",
"# with the following command, you can choice the stage range you want to run\n",
"# such as `./run.sh --stage 0 --stop-stage 0`\n",
"# this can not be mixed use with `$1`, `$2` ...\n",
"source ${MAIN_ROOT}/utils/parse_options.sh || exit 1\n",
"\n",
"if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then\n",
" # prepare data\n",
" ./local/preprocess.sh ${conf_path} || exit -1\n",
"fi\n",
"if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then\n",
" # train model, all `ckpt` under `train_output_path/checkpoints/` dir\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1\n",
"fi\n",
"if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then\n",
" # synthesize\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1\n",
"fi\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# FAQ\n",
"\n",
"- <font size=3>需要注意的问题<font>\n",
"- <font size=3>经验与分享<font>\n",
"- <font size=3>用户的其他问题<font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 作业\n",
"<font size=4>在 CSMSC 数据集上利用 FastSpeech2 和 Parallel WaveGAN 实现一个中文 TTS 系统<font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 关注 PaddleSpeech\n",
"<font size=3>https://github.com/PaddlePaddle/PaddleSpeech/<font>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.0 64-bit ('yt_py37_develop': venv)",
"language": "python",
"name": "python37064bitytpy37developvenv88cd689abeac41d886f9210a708a170b"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "263.594px"
},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -46,10 +46,10 @@ model:
ctc_grad_norm_type: null
training:
n_epoch: 50
n_epoch: 65
accum_grad: 1
lr: 2e-3
lr_decay: 0.9 # 0.83
lr: 5e-4
lr_decay: 0.93
weight_decay: 1e-06
global_grad_clip: 3.0
log_interval: 100
@ -63,7 +63,7 @@ decoding:
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.2 #1.9
beta: 5.0
beta: 4.3
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40

@ -9,12 +9,13 @@ URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
MD5="29e02312deb2e59b3c8686c7966d4fe3"
TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
echo "Download language model ..."
download $URL $MD5 $TARGET
echo "Start downloading the language model. The language model is large, please wait for a moment ..."
download $URL $MD5 $TARGET > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "Fail to download the language model!"
exit 1
else
echo "Download the language model sucessfully"
fi

@ -13,7 +13,7 @@ ckpt_prefix=$2
model_type=$3
# download language model
bash local/download_lm_ch.sh > /dev/null 2>&1
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi

@ -5,9 +5,9 @@ source path.sh
gpus=0,1,2,3
stage=0
stop_stage=100
conf_path=conf/deepspeech2.yaml
conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml
avg_num=1
model_type=offline
model_type=offline # offline or online
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

@ -2,6 +2,7 @@
source path.sh
set -e
gpus=0,1,2,3
stage=0
stop_stage=100
conf_path=conf/conformer.yaml
@ -22,7 +23,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@ -40,18 +41,19 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
echo "warning: deps on kaldi and srilm, please make sure installed."
# train lm and build TLG
./local/tlg.sh --corpus aishell --lmtype srilm
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi

@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA result of AISHELL-3 and Extract it
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
@ -67,8 +67,8 @@ Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
[--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
[--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT]
Train a FastSpeech2 model.
@ -81,8 +81,7 @@ optional arguments:
dev data.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--nprocs NPROCS number of processes.
--ngpu NGPU if ngpu=0, use cpu.
--verbose VERBOSE verbose.
--phones-dict PHONES_DICT
phone vocabulary file.
@ -92,23 +91,22 @@ optional arguments:
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory.
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
5. `--phones-dict` is the path of the phone vocabulary file.
6. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
unzip pwg_aishell3_ckpt_0.5.zip
```
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
pwg_aishell3_ckpt_0.5
├── default.yaml # default config used to train parallel wavegan
├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
```
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
@ -122,7 +120,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
[--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--verbose VERBOSE]
[--ngpu NGPU] [--verbose VERBOSE]
Synthesize with fastspeech2 & parallel wavegan.
@ -149,8 +147,8 @@ optional arguments:
test metadata.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--verbose VERBOSE verbose.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose
```
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
@ -166,7 +164,7 @@ usage: multi_spk_synthesize_e2e.py [-h]
[--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--text TEXT]
[--output-dir OUTPUT_DIR] [--device DEVICE]
[--output-dir OUTPUT_DIR] [--ngpu NGPU]
[--verbose VERBOSE]
Synthesize with fastspeech2 & parallel wavegan.
@ -193,7 +191,7 @@ optional arguments:
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
```
1. `--fastspeech2-config`, `--fastspeech2-checkpoint`, `--fastspeech2-stat`, `--phones-dict` and `--speaker-dict` are arguments for fastspeech2, which correspond to the 5 files in the fastspeech2 pretrained model.
@ -201,7 +199,7 @@ optional arguments:
3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
4. `--text` is the text file, which contains sentences to synthesize.
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
@ -226,15 +224,12 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
--fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
--pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
--speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt
```
## Future work
A multi-speaker vocoder is needed.

@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction.
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
num_workers: 2
###########################################################

@ -10,11 +10,10 @@ python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
--pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt

@ -10,11 +10,10 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
--pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt

@ -8,6 +8,6 @@ python3 ${BIN_DIR}/train.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=2 \
--ngpu=2 \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt

@ -7,7 +7,6 @@ gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz

@ -1,8 +1,8 @@
# Tacotron2 + AISHELL-3 Voice Cloning
This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) . The general steps are as follows:
1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e).
1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each sentence in AISHELL-3. This embedding is a extra input of Tacotron2 which will be concated with encoder outputs.
3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0).
3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0).
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
@ -28,7 +28,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ge2e/inference.py \
--input=${input} \
--output=${preprocess_path}/embed \
--device="gpu" \
--ngpu=1 \
--checkpoint_path=${ge2e_ckpt_path}
fi
```
@ -39,9 +39,9 @@ There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is ve
We use Montreal Force Aligner 1.0. The label in aishell3 include pinyinso the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$` and `%`) need to be removed. You shoud preprocess the dataset into the format which MFA needs, the texts have the same name with wavs and have the suffix `.lab`.
We use [lexicon.txt](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then

@ -9,10 +9,9 @@ alignment=$3
ge2e_ckpt_path=$4
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../../ge2e/inference.py \
--input=${input} \
python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
--input=${input}/wav \
--output=${preprocess_path}/embed \
--device="gpu" \
--checkpoint_path=${ge2e_ckpt_path}
fi

@ -6,4 +6,4 @@ train_output_path=$2
python3 ${BIN_DIR}/train.py \
--data=${preprocess_path} \
--output=${train_output_path} \
--device="gpu"
--ngpu=1

@ -0,0 +1,89 @@
# FastSpeech2 + AISHELL-3 Voice Cloning
This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) . The general steps are as follows:
1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each sentence in AISHELL-3. This embedding is a extra input of Tacotron2 which will be concated with encoder outputs.
3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0).
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./alignment`.
Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000`
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. start a voice cloning inference.
```bash
./run.sh
```
### Preprocess the dataset
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path}
```
#### generate utterance embedding
Use pretrained GE2E (speaker encoder) to generate utterance embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`.
```bash
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ge2e/inference.py \
--input=${input} \
--output=${preprocess_path}/embed \
--ngpu=1 \
--checkpoint_path=${ge2e_ckpt_path}
fi
```
The computing time of utterance embedding can be x hours.
#### process wav
There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is very small, so, we need to remove the silence and normalize the audio. You can the silence remove method based on volume or energy, but the effect is not very good, We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence.
We use Montreal Force Aligner 1.0. The label in aishell3 include pinyinso the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$` and `%`) need to be removed. You shoud preprocess the dataset into the format which MFA needs, the texts have the same name with wavs and have the suffix `.lab`.
We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Process wav ..."
python3 ${BIN_DIR}/process_wav.py \
--input=${input}/wav \
--output=${preprocess_path}/normalized_wav \
--alignment=${alignment}
fi
```
#### preprocess transcription
We revert the transcription into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA, we separated the tones. This is a processing method, of course, you can only segment initials and vowels.
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/preprocess_transcription.py \
--input=${input} \
--output=${preprocess_path}
fi
```
The default input is `~/datasets/data_aishell3/train`which contains `label_train-set.txt`, the processed results are `metadata.yaml` and `metadata.pickle`. the former is a text format for easy viewing, and the latter is a binary format for direct reading.
#### extract mel
```python
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/extract_mel.py \
--input=${preprocess_path}/normalized_wav \
--output=${preprocess_path}/mel
fi
```
### Train the model
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
```
Our model remve stop token prediction in Tacotron2, because of the problem of extremely unbalanced proportion of positive and negative samples of stop token prediction, and it's very sensitive to the clip of audio silence. We use the last symbol from the highest point of attention to the encoder side as the termination condition.
In addition, in order to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster.
### Infernece
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
```
## Pretrained Model
[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).

@ -0,0 +1,105 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # sr
n_fft: 2048 # FFT size.
n_shift: 300 # Hop size.
win_length: 1200 # Window length.
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 2
###########################################################
# MODEL SETTING #
###########################################################
model:
adim: 384 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnset
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
init_type: xavier_uniform # initialization type
init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type
###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 0.001 # learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 200
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 10086

@ -0,0 +1,86 @@
#!/bin/bash
stage=0
stop_stage=100
config_path=$1
ge2e_ckpt_path=$2
# gen speaker embedding
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
--input=~/datasets/data_aishell3/train/wav/ \
--output=dump/embed \
--checkpoint_path=${ge2e_ckpt_path}
fi
# copy from tts3/preprocess
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# get durations from MFA's result
echo "Generate durations.txt from MFA results ..."
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./aishell3_alignment_tone \
--output durations.txt \
--config=${config_path}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/preprocess.py \
--dataset=aishell3 \
--rootdir=~/datasets/data_aishell3/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True \
--spk_emb_dir=dump/embed
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# get features' stats(mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="speech"
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="pitch"
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="energy"
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# normalize and covert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
--pitch-stats=dump/train/pitch_stats.npy \
--energy-stats=dump/train/energy_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
--pitch-stats=dump/train/pitch_stats.npy \
--energy-stats=dump/train/energy_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
--pitch-stats=dump/train/pitch_stats.npy \
--energy-stats=dump/train/energy_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
fi

@ -0,0 +1,19 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
--pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=${train_output_path}/test \
--phones-dict=dump/phone_id_map.txt \
--voice-cloning=True

@ -0,0 +1,13 @@
#!/bin/bash
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=2 \
--phones-dict=dump/phone_id_map.txt \
--voice-cloning=True

@ -0,0 +1,22 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
ge2e_params_path=$4
ref_audio_dir=$5
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/voice_cloning.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
--pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--ge2e_params_path=${ge2e_params_path} \
--text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
--input-dir=${ref_audio_dir} \
--output-dir=${train_output_path}/vc_syn \
--phones-dict=dump/phone_id_map.txt

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,44 @@
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz
ref_audio_dir=ref_audio
# not include ".pdparams" here
ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
# include ".pdparams" here
ge2e_params_path=${ge2e_ckpt_path}.pdparams
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} || exit -1
fi

@ -0,0 +1,146 @@
# Parallel WaveGAN with AISHELL-3
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [AISHELL-3](http://www.aishelltech.com/aishell_3).
AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus which could be used to train multi-speaker Text-to-Speech (TTS) systems.
## Dataset
### Download and Extract the datasaet
Download AISHELL-3.
```bash
wget https://www.openslr.org/resources/93/data_aishell3.tgz
```
Extract AISHELL-3.
```bash
mkdir data_aishell3
tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA result of AISHELL-3 and Extract it
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
dump
├── dev
│ ├── norm
│ └── raw
├── test
│ ├── norm
│ └── raw
└── train
├── norm
├── raw
└── feats_stats.npy
```
The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains log magnitude of mel spectrogram of each utterances, while the norm folder contains normalized spectrogram. The statistics used to normalize the spectrogram is computed from the training set, which is located in `dump/train/feats_stats.npy`.
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance.
### Train the model
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
[--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
[--profiler_options PROFILER_OPTIONS]
Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
--config CONFIG config file to overwrite default config.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
dev data.
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
benchmark:
arguments related to benchmark.
--batch-size BATCH_SIZE
batch size.
--max-iter MAX_ITER train max steps.
--run-benchmark RUN_BENCHMARK
runing benchmark or not, if True, use the --batch-size
and --max-iter.
--profiler_options PROFILER_OPTIONS
The option of profiler, which should be in format
"key1=value1;key2=value2;key3=value3".
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
[--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE]
Synthesize with parallel wavegan.
optional arguments:
-h, --help show this help message and exit
--config CONFIG parallel wavegan config file.
--checkpoint CHECKPOINT
snapshot to load.
--test-metadata TEST_METADATA
dev data.
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
```
1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `snapshot_iter_1000000.pdz `.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip).
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_aishell3_ckpt_0.5
├── default.yaml # default config used to train parallel wavegan
├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.

@ -0,0 +1,115 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the VCTK corpus. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # Sampling rate.
n_fft: 2048 # FFT size. (in samples)
n_shift: 300 # Hop size. (in samples)
win_length: 1200 # Window length. (in samples)
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
n_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_size: 3 # Kernel size of dilated convolution.
layers: 30 # Number of residual block layers.
stacks: 3 # Number of stacks i.e., dilation cycles.
residual_channels: 64 # Number of channels in residual conv.
gate_channels: 128 # Number of channels in gated conv.
skip_channels: 64 # Number of channels in skip conv.
aux_channels: 80 # Number of channels for auxiliary feature conv.
# Must be the same as num_mels.
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
use_weight_norm: true # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size.
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_size: 3 # Number of output channels.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of chnn layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
negative_slope: 0.2 # Alpha in LeakyReLU.
###########################################################
# STFT LOSS SETTING #
###########################################################
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
lambda_adv: 4.0 # Loss balancing coefficient.
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 8 # Batch size.
batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
num_workers: 4 # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params:
epsilon: 1.0e-6 # Generator's epsilon.
weight_decay: 0.0 # Generator's weight decay coefficient.
generator_scheduler_params:
learning_rate: 0.0001 # Generator's learning rate.
step_size: 200000 # Generator's scheduler step size.
gamma: 0.5 # Generator's scheduler gamma.
# At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10 # Generator's gradient norm.
discriminator_optimizer_params:
epsilon: 1.0e-6 # Discriminator's epsilon.
weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
learning_rate: 0.00005 # Discriminator's learning rate.
step_size: 200000 # Discriminator's scheduler step size.
gamma: 0.5 # Discriminator's scheduler gamma.
# At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.
###########################################################
# INTERVAL SETTING #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 5000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random

@ -0,0 +1,55 @@
#!/bin/bash
stage=0
stop_stage=100
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
echo "Generate durations.txt from MFA results ..."
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./aishell3_alignment_tone \
--output=durations.txt \
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/../preprocess.py \
--rootdir=~/datasets/data_aishell3/ \
--dataset=aishell3 \
--dumpdir=dump \
--dur-file=durations.txt \
--config=${config_path} \
--cut-sil=True \
--num-cpu=20
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# get features' stats(mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="feats"
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize, dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy
fi

@ -0,0 +1,13 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=${train_output_path}/test

@ -0,0 +1,13 @@
#!/bin/bash
config_path=$1
train_output_path=$2
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}

@ -0,0 +1,32 @@
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

@ -2,6 +2,7 @@
set -e
source path.sh
gpus=0,1,2,3
stage=0
stop_stage=100
conf_path=conf/conformer.yaml
@ -20,7 +21,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@ -30,7 +31,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=4 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then

@ -1,5 +1,5 @@
# Speedyspeech with CSMSC
This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner).
# SpeedySpeech with CSMSC
This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner).
## Dataset
### Download and Extract the datasaet
@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
### Get MFA result of CSMSC and Extract it
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
@ -55,10 +55,10 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
[--use-relative-path USE_RELATIVE_PATH]
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE]
[--use-relative-path USE_RELATIVE_PATH]
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
Train a Speedyspeech model with sigle speaker dataset.
@ -71,8 +71,7 @@ optional arguments:
dev data.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--nprocs NPROCS number of processes.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
--use-relative-path USE_RELATIVE_PATH
whether use relative path in metadata
@ -85,13 +84,12 @@ optional arguments:
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory.
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--tones-dict` is the path of the tone vocabulary file.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
5. `--phones-dict` is the path of the phone vocabulary file.
6. `--tones-dict` is the path of the tone vocabulary file.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
@ -115,7 +113,7 @@ usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
[--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
[--inference-dir INFERENCE_DIR] [--device DEVICE]
[--inference-dir INFERENCE_DIR] [--ngpu NGPU]
[--verbose VERBOSE]
Synthesize with speedyspeech & parallel wavegan.
@ -145,7 +143,7 @@ optional arguments:
output dir
--inference-dir INFERENCE_DIR
dir to save inference models
--device DEVICE device type to use
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose
```
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
@ -161,8 +159,8 @@ usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
[--pwg-stat PWG_STAT] [--text TEXT]
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
[--output-dir OUTPUT_DIR]
[--inference-dir INFERENCE_DIR] [--device DEVICE]
[--verbose VERBOSE]
[--inference-dir INFERENCE_DIR] [--verbose VERBOSE]
[--ngpu NGPU]
Synthesize with speedyspeech & parallel wavegan.
@ -190,15 +188,15 @@ optional arguments:
output dir
--inference-dir INFERENCE_DIR
dir to save inference models
--device DEVICE device type to use
--verbose VERBOSE verbose
--ngpu NGPU if ngpu == 0, use cpu.
```
1. `--speedyspeech-config`, `--speedyspeech-checkpoint`, `--speedyspeech-stat` are arguments for speedyspeech, which correspond to the 3 files in the speedyspeech pretrained model.
2. `--pwg-config`, `--pwg-checkpoint`, `--pwg-stat` are arguments for parallel wavegan, which correspond to the 3 files in the parallel wavegan pretrained model.
3. `--text` is the text file, which contains sentences to synthesize.
4. `--output-dir` is the directory to save synthesized audio files.
5. `--inference-dir` is the directory to save exported model, which can be used with paddle infernece.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
7. `--phones-dict` is the path of the phone vocabulary file.
8. `--tones-dict` is the path of the tone vocabulary file.
@ -211,6 +209,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
## Pretrained Model
Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip).
SpeedySpeech checkpoint contains files listed below.
```text
@ -237,7 +236,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--inference-dir=exp/default/inference \
--device="gpu" \
--phones-dict=speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt \
--tones-dict=speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
```

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save