Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSpeech into ted_en_zh_t0

pull/1007/head
gongel 3 years ago
commit 5b5c73f9bb

@ -1,15 +0,0 @@
unset GREP_OPTIONS
# https://zhuanlan.zhihu.com/p/33050965
alias nvs='nvidia-smi'
alias his='history'
alias jobs='jobs -l'
alias ports='netstat -tulanp'
alias wget='wget -c'
## Colorize the grep command output for ease of use (good for log files)##
alias grep='grep --color=auto'
alias egrep='egrep --color=auto'
alias fgrep='fgrep --color=auto'

.vimrc

@ -1,468 +0,0 @@
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Maintainer:
" Amir Salihefendic — @amix3k
"
" Awesome_version:
" Get this config, nice color schemes and lots of plugins!
"
" Install the awesome version from:
"
" https://github.com/amix/vimrc
"
" Sections:
" -> General
" -> VIM user interface
" -> Colors and Fonts
" -> Files and backups
" -> Text, tab and indent related
" -> Visual mode related
" -> Moving around, tabs and buffers
" -> Status line
" -> Editing mappings
" -> vimgrep searching and cope displaying
" -> Spell checking
" -> Misc
" -> Helper functions
"
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => General
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Sets how many lines of history VIM has to remember
set history=500
" Enable filetype plugins
filetype plugin on
filetype indent on
" Set to auto read when a file is changed from the outside
set autoread
au FocusGained,BufEnter * checktime
" With a map leader it's possible to do extra key combinations
" like <leader>w saves the current file
let mapleader = ","
" Fast saving
nmap <leader>w :w!<cr>
" :W sudo saves the file
" (useful for handling the permission-denied error)
command! W execute 'w !sudo tee % > /dev/null' <bar> edit!
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => VIM user interface
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Set 7 lines to the cursor - when moving vertically using j/k
set so=7
" Avoid garbled characters in Chinese language windows OS
let $LANG='en'
set langmenu=en
source $VIMRUNTIME/delmenu.vim
source $VIMRUNTIME/menu.vim
" Turn on the Wild menu
set wildmenu
" Ignore compiled files
set wildignore=*.o,*~,*.pyc
if has("win16") || has("win32")
set wildignore+=.git\*,.hg\*,.svn\*
else
set wildignore+=*/.git/*,*/.hg/*,*/.svn/*,*/.DS_Store
endif
"Always show current position
set ruler
" Height of the command bar
set cmdheight=1
" A buffer becomes hidden when it is abandoned
set hid
" Configure backspace so it acts as it should act
set backspace=eol,start,indent
set whichwrap+=<,>,h,l
" Ignore case when searching
set ignorecase
" When searching try to be smart about cases
set smartcase
" Highlight search results
set hlsearch
" Makes search act like search in modern browsers
set incsearch
" Don't redraw while executing macros (good performance config)
set lazyredraw
" For regular expressions turn magic on
set magic
" Show matching brackets when text indicator is over them
set showmatch
" How many tenths of a second to blink when matching brackets
set mat=2
" No annoying sound on errors
set noerrorbells
set novisualbell
set t_vb=
set tm=500
" Properly disable sound on errors on MacVim
if has("gui_macvim")
autocmd GUIEnter * set vb t_vb=
endif
" Add a bit extra margin to the left
set foldcolumn=1
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Colors and Fonts
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Enable syntax highlighting
syntax enable
" Enable 256 colors palette in Gnome Terminal
if $COLORTERM == 'gnome-terminal'
set t_Co=256
endif
try
colorscheme desert
catch
endtry
set background=dark
" Set extra options when running in GUI mode
if has("gui_running")
set guioptions-=T
set guioptions-=e
set t_Co=256
set guitablabel=%M\ %t
endif
" Set utf8 as standard encoding and en_US as the standard language
set encoding=utf8
set fileencodings=ucs-bom,utf-8,cp936
set fileencoding=gb2312
set termencoding=utf-8
" Use Unix as the standard file type
set ffs=unix,dos,mac
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Files, backups and undo
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Turn backup off, since most stuff is in SVN, git etc. anyway...
set nobackup
set nowb
set noswapfile
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Text, tab and indent related
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Use spaces instead of tabs
set expandtab
" Be smart when using tabs ;)
set smarttab
" 1 tab == 4 spaces
set shiftwidth=4
set tabstop=4
" Linebreak on 500 characters
set lbr
set tw=500
set ai "Auto indent
set si "Smart indent
set wrap "Wrap lines
""""""""""""""""""""""""""""""
" => Visual mode related
""""""""""""""""""""""""""""""
" Visual mode pressing * or # searches for the current selection
" Super useful! From an idea by Michael Naumann
vnoremap <silent> * :<C-u>call VisualSelection('', '')<CR>/<C-R>=@/<CR><CR>
vnoremap <silent> # :<C-u>call VisualSelection('', '')<CR>?<C-R>=@/<CR><CR>
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Moving around, tabs, windows and buffers
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Map <Space> to / (search) and Ctrl-<Space> to ? (backwards search)
map <space> /
map <C-space> ?
" Disable highlight when <leader><cr> is pressed
map <silent> <leader><cr> :noh<cr>
" Smart way to move between windows
map <C-j> <C-W>j
map <C-k> <C-W>k
map <C-h> <C-W>h
map <C-l> <C-W>l
" Close the current buffer
map <leader>bd :Bclose<cr>:tabclose<cr>gT
" Close all the buffers
map <leader>ba :bufdo bd<cr>
map <leader>l :bnext<cr>
map <leader>h :bprevious<cr>
" Useful mappings for managing tabs
map <leader>tn :tabnew<cr>
map <leader>to :tabonly<cr>
map <leader>tc :tabclose<cr>
map <leader>tm :tabmove
map <leader>t<leader> :tabnext
" Let 'tl' toggle between this and the last accessed tab
let g:lasttab = 1
nmap <Leader>tl :exe "tabn ".g:lasttab<CR>
au TabLeave * let g:lasttab = tabpagenr()
" Opens a new tab with the current buffer's path
" Super useful when editing files in the same directory
map <leader>te :tabedit <C-r>=expand("%:p:h")<cr>/
" Switch CWD to the directory of the open buffer
map <leader>cd :cd %:p:h<cr>:pwd<cr>
" Specify the behavior when switching between buffers
try
set switchbuf=useopen,usetab,newtab
set stal=2
catch
endtry
" Return to last edit position when opening files (You want this!)
au BufReadPost * if line("'\"") > 1 && line("'\"") <= line("$") | exe "normal! g'\"" | endif
""""""""""""""""""""""""""""""
" => Status line
""""""""""""""""""""""""""""""
" Always show the status line
set laststatus=2
" Format the status line
set statusline=\ %{HasPaste()}%F%m%r%h\ %w\ \ CWD:\ %r%{getcwd()}%h\ \ \ Line:\ %l\ \ Column:\ %c
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Editing mappings
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Remap VIM 0 to first non-blank character
map 0 ^
" Move a line of text using ALT+[jk] or Command+[jk] on mac
nmap <M-j> mz:m+<cr>`z
nmap <M-k> mz:m-2<cr>`z
vmap <M-j> :m'>+<cr>`<my`>mzgv`yo`z
vmap <M-k> :m'<-2<cr>`>my`<mzgv`yo`z
if has("mac") || has("macunix")
nmap <D-j> <M-j>
nmap <D-k> <M-k>
vmap <D-j> <M-j>
vmap <D-k> <M-k>
endif
" Delete trailing white space on save, useful for some filetypes ;)
fun! CleanExtraSpaces()
let save_cursor = getpos(".")
let old_query = getreg('/')
silent! %s/\s\+$//e
call setpos('.', save_cursor)
call setreg('/', old_query)
endfun
if has("autocmd")
autocmd BufWritePre *.txt,*.js,*.py,*.wiki,*.sh,*.coffee :call CleanExtraSpaces()
endif
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Spell checking
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Pressing ,ss will toggle and untoggle spell checking
map <leader>ss :setlocal spell!<cr>
" Shortcuts using <leader>
map <leader>sn ]s
map <leader>sp [s
map <leader>sa zg
map <leader>s? z=
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Misc
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Remove the Windows ^M - when the encodings gets messed up
noremap <Leader>m mmHmt:%s/<C-V><cr>//ge<cr>'tzt'm
" Quickly open a buffer for scribble
map <leader>q :e ~/buffer<cr>
" Quickly open a markdown buffer for scribble
map <leader>x :e ~/buffer.md<cr>
" Toggle paste mode on and off
map <leader>pp :setlocal paste!<cr>
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" => Helper functions
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
" Returns true if paste mode is enabled
function! HasPaste()
if &paste
return 'PASTE MODE '
endif
return ''
endfunction
" Don't close window, when deleting a buffer
command! Bclose call <SID>BufcloseCloseIt()
function! <SID>BufcloseCloseIt()
let l:currentBufNum = bufnr("%")
let l:alternateBufNum = bufnr("#")
if buflisted(l:alternateBufNum)
buffer #
else
bnext
endif
if bufnr("%") == l:currentBufNum
new
endif
if buflisted(l:currentBufNum)
execute("bdelete! ".l:currentBufNum)
endif
endfunction
function! CmdLine(str)
call feedkeys(":" . a:str)
endfunction
function! VisualSelection(direction, extra_filter) range
let l:saved_reg = @"
execute "normal! vgvy"
let l:pattern = escape(@", "\\/.*'$^~[]")
let l:pattern = substitute(l:pattern, "\n$", "", "")
if a:direction == 'gv'
call CmdLine("Ack '" . l:pattern . "' " )
elseif a:direction == 'replace'
call CmdLine("%s" . '/'. l:pattern . '/')
endif
let @/ = l:pattern
let @" = l:saved_reg
endfunction
""""""""""""""""""""""""""""""
" => Python section
""""""""""""""""""""""""""""""
let python_highlight_all = 1
au FileType python syn keyword pythonDecorator True None False self
au BufNewFile,BufRead *.jinja set syntax=htmljinja
au BufNewFile,BufRead *.mako set ft=mako
au FileType python map <buffer> F :set foldmethod=indent<cr>
au FileType python inoremap <buffer> $r return
au FileType python inoremap <buffer> $i import
au FileType python inoremap <buffer> $p print
au FileType python inoremap <buffer> $f # --- <esc>a
au FileType python map <buffer> <leader>1 /class
au FileType python map <buffer> <leader>2 /def
au FileType python map <buffer> <leader>C ?class
au FileType python map <buffer> <leader>D ?def
""""""""""""""""""""""""""""""
" => JavaScript section
"""""""""""""""""""""""""""""""
au FileType javascript call JavaScriptFold()
au FileType javascript setl fen
au FileType javascript setl nocindent
au FileType javascript imap <C-t> $log();<esc>hi
au FileType javascript imap <C-a> alert();<esc>hi
au FileType javascript inoremap <buffer> $r return
au FileType javascript inoremap <buffer> $f // --- PH<esc>FP2xi
function! JavaScriptFold()
setl foldmethod=syntax
setl foldlevelstart=1
syn region foldBraces start=/{/ end=/}/ transparent fold keepend extend
function! FoldText()
return substitute(getline(v:foldstart), '{.*', '{...}', '')
endfunction
setl foldtext=FoldText()
endfunction
""""""""""""""""""""""""""""""
" => CoffeeScript section
"""""""""""""""""""""""""""""""
function! CoffeeScriptFold()
setl foldmethod=indent
setl foldlevelstart=1
endfunction
au FileType coffee call CoffeeScriptFold()
au FileType gitcommit call setpos('.', [0, 1, 1, 0])
""""""""""""""""""""""""""""""
" => Shell section
""""""""""""""""""""""""""""""
if exists('$TMUX')
if has('nvim')
set termguicolors
else
set term=screen-256color
endif
endif
""""""""""""""""""""""""""""""
" => Twig section
""""""""""""""""""""""""""""""
autocmd BufRead *.twig set syntax=html filetype=html
""""""""""""""""""""""""""""""
" => Markdown
""""""""""""""""""""""""""""""
let vim_markdown_folding_disabled = 1

@ -23,7 +23,7 @@ Contents
.. toctree::
:maxdepth: 1
:caption: Speech-To-Text
:caption: Speech-to-Text
asr/models_introduction
asr/data_preparation
@ -33,7 +33,7 @@ Contents
.. toctree::
:maxdepth: 1
:caption: Text-To-Speech
:caption: Text-to-Speech
tts/basic_usage
tts/advanced_usage

@ -1,11 +1,11 @@
# PaddleSpeech
## What is PaddleSpeech?
PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models.
PaddleSpeech is an open-source toolkit on the PaddlePaddle platform for two critical tasks in speech - Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS), with modules involving state-of-the-art and influential models.
## What can PaddleSpeech do?
### Speech-To-Text
### Speech-to-Text
PaddleSpeech ASR mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
@ -29,9 +29,9 @@ PaddleSpeech ASR provides you with a complete ASR pipeline, including:
- attention decoding (used in Transformer and Conformer)
- attention rescoring (used in Transformer and Conformer)
Speech-To-Text helps you training the ASR model very simply.
Speech-to-Text helps you train the ASR model very simply.
### Text-To-Speech
### Text-to-Speech
TTS mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
@ -53,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including:
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- GE2E
Text-To-Speech helps you to train TTS models with simple commands.
Text-to-Speech helps you to train TTS models with simple commands.

@ -1,16 +1,17 @@
# Released Models
## Speech-To-Text Models
## Speech-to-Text Models
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :-----------
[Ds2 Online Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [Ds2 Online Aishell S0 Example](../../examples/aishell/s0)
[Ds2 Offline Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/s0)
[Conformer Online Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1)
[Conformer Offline Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1)
[Conformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../examples/librispeech/s1)
[Transformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../examples/librispeech/s1)
[Transformer Librispeech S2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../examples/librispeech/s2)
### Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
@ -27,7 +28,7 @@ Language Model | Training Data | Token-based | Size | Descriptions
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
## Text-To-Speech Models
## Text-to-Speech Models
### Acoustic Models
Model Type | Dataset | Example Link | Pretrained Models | Static Models | Size (static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:

@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then

@ -24,7 +24,7 @@ Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.

@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction.
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
num_workers: 2
###########################################################

@ -7,7 +7,6 @@ gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz

@ -1,7 +1,7 @@
# Tacotron2 + AISHELL-3 Voice Cloning
This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in the Voice Cloning Task. We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
1. Speaker Encoder: We use speaker verification to train a speaker encoder. The datasets used in this task are different from those used in Tacotron2; because transcriptions are not needed, we can use more datasets. Refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each sentence in AISHELL-3. This embedding is a extra input of Tacotron2 which will be concated with encoder outputs.
2. Synthesizer: We use the trained speaker encoder to generate a speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of Tacotron2 which will be concatenated with the encoder outputs.
3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0).
## Get Started
@ -10,7 +10,7 @@ Assume the path to the MFA result of AISHELL-3 is `./alignment`.
Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000`
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. start a voice cloning inference.
```bash
@ -20,8 +20,8 @@ Run the command below to
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path}
```
#### generate utterance embedding
Use pretrained GE2E (speaker encoder) to generate utterance embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`.
#### generate speaker embedding
Use a pretrained GE2E (speaker encoder) to generate a speaker embedding for each sentence in AISHELL-3. The embeddings have the same file structure as the wav files and the format is `.npy`.
```bash
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
```
The computing time of utterance embedding can be x hours.
#### process wav
There is silence at the edges of AISHELL-3's wavs, and the audio amplitude is very small, so we need to remove the silence and normalize the audio. You can use silence-removal methods based on volume or energy, but the effect is not very good. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment between text and speech, then utilize the alignment results to remove the silence.
We use Montreal Forced Aligner 1.0. The labels in AISHELL-3 include pinyin, so the lexicon we provide to MFA is pinyin rather than Chinese characters, and the prosody marks (`$` and `%`) need to be removed. You should preprocess the dataset into the format MFA needs: the texts have the same names as the wavs and the suffix `.lab`.
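For illustration, a minimal sketch of this label preprocessing is shown below; the layout of the label file and the paths are assumptions for illustration, not part of this example's scripts.
```python
# Hypothetical sketch: write one MFA .lab file per utterance with prosody marks removed.
# The label-file format and paths below are assumptions for illustration only.
from pathlib import Path

PROSODY_MARKS = ("$", "%")

def write_lab_files(label_file: Path, wav_dir: Path) -> None:
    """Assumes each line of label_file looks like '<utt_id> <pinyin labels ...>'."""
    for line in label_file.read_text(encoding="utf-8").splitlines():
        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            continue
        utt_id, text = parts
        for mark in PROSODY_MARKS:
            text = text.replace(mark, "")
        # MFA expects a .lab file with the same stem as the corresponding wav.
        (wav_dir / f"{utt_id}.lab").write_text(text.strip() + "\n", encoding="utf-8")

# Example (hypothetical paths):
# write_lab_files(Path("label_train-set.txt"), Path("wav/SSB0005"))
```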
@ -73,7 +73,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
fi
```
### Train the model
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
```
@ -81,7 +81,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_
Our model removes the stop token prediction in Tacotron2, because the proportion of positive and negative samples for stop token prediction is extremely unbalanced, and it is very sensitive to how the audio silence is clipped. Instead, we use the last symbol from the highest point of attention to the encoder side as the termination condition.
In addition, to accelerate the convergence of the model, we add a `guided attention loss` to induce the alignment between encoder and decoder to become diagonal faster.
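For reference, a minimal NumPy sketch of a guided attention loss (a soft diagonal prior on the attention matrix) is given below; it illustrates the idea only and is not the exact implementation used in this example.
```python
# Sketch of a guided attention loss: penalize attention mass far from the diagonal.
# Illustration only; the actual loss in this example may differ in its details.
import numpy as np

def guided_attention_weight(T_dec: int, T_enc: int, g: float = 0.2) -> np.ndarray:
    """W[t, n] is ~0 near the diagonal and approaches 1 away from it."""
    n = np.arange(T_enc)[None, :] / T_enc   # normalized encoder positions, shape (1, T_enc)
    t = np.arange(T_dec)[:, None] / T_dec   # normalized decoder positions, shape (T_dec, 1)
    return 1.0 - np.exp(-((n - t) ** 2) / (2.0 * g * g))

def guided_attention_loss(attn: np.ndarray, g: float = 0.2) -> float:
    """attn: attention matrix of shape (T_dec, T_enc), each row summing to 1."""
    T_dec, T_enc = attn.shape
    return float(np.mean(attn * guided_attention_weight(T_dec, T_enc, g)))
```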
### Infernece
### Voice Cloning
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
```

@ -9,7 +9,7 @@ alignment=$3
ge2e_ckpt_path=$4
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../../ge2e/inference.py \
python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
--input=${input}/wav \
--output=${preprocess_path}/embed \
--checkpoint_path=${ge2e_ckpt_path}

@ -0,0 +1,129 @@
# FastSpeech2 + AISHELL-3 Voice Cloning
This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in the Voice Cloning Task. We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows (a conceptual sketch of the pipeline is given after this list):
1. Speaker Encoder: We use speaker verification to train a speaker encoder. The datasets used in this task are different from those used in `FastSpeech2`; because transcriptions are not needed, we can use more datasets. Refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
2. Synthesizer: We use the trained speaker encoder to generate a speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of `FastSpeech2` which will be concatenated with the encoder outputs.
3. Vocoder: We use [Parallel Wave GAN](http://arxiv.org/abs/1910.11480) as the neural vocoder; refer to [voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1).
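Conceptually, voice cloning at inference time chains these three components. The sketch below uses stand-in functions (not PaddleSpeech APIs) purely to show the data flow; the dimensions are taken from `conf/default.yaml` in this example.
```python
# Conceptual data flow of the voice cloning pipeline described above.
# The three stage functions are stand-ins, not PaddleSpeech APIs.
import numpy as np

def speaker_encoder(reference_wav: np.ndarray) -> np.ndarray:
    """Stand-in for GE2E: maps a reference waveform to a fixed-size speaker embedding."""
    return np.zeros(256, dtype=np.float32)            # spk_embed_dim: 256

def synthesizer(text: str, spk_emb: np.ndarray) -> np.ndarray:
    """Stand-in for FastSpeech2 conditioned on the speaker embedding."""
    return np.zeros((100, 80), dtype=np.float32)      # (frames, n_mels: 80)

def vocoder(mel: np.ndarray) -> np.ndarray:
    """Stand-in for Parallel WaveGAN: mel-spectrogram to waveform."""
    return np.zeros(mel.shape[0] * 300, dtype=np.float32)  # n_shift: 300 samples per frame

def clone_voice(reference_wav: np.ndarray, text: str) -> np.ndarray:
    spk_emb = speaker_encoder(reference_wav)  # 1. speaker encoder
    mel = synthesizer(text, spk_emb)          # 2. synthesizer with speaker-embedding input
    return vocoder(mel)                       # 3. vocoder
```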
## Dataset
### Download and Extract the dataset
Download AISHELL-3.
```bash
wget https://www.openslr.org/resources/93/data_aishell3.tgz
```
Extract AISHELL-3.
```bash
mkdir data_aishell3
tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA result of AISHELL-3 and Extract it
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
## Pretrained GE2E model
We use a pretrained GE2E model to generate a speaker embedding for each sentence.
Download the pretrained GE2E model from here: [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and `unzip` it.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Assume the path to the pretrained ge2e model is `./ge2e_ckpt_0.3`.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize waveform from `metadata.jsonl`.
5. start a voice cloning inference.
```bash
./run.sh
```
### Preprocess the dataset
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
dump
├── dev
│ ├── norm
│ └── raw
├── embed
│ ├── SSB0005
│ ├── SSB0009
│ ├── ...
│ └── ...
├── phone_id_map.txt
├── speaker_id_map.txt
├── test
│ ├── norm
│ └── raw
└── train
├── energy_stats.npy
├── norm
├── pitch_stats.npy
├── raw
└── speech_stats.npy
```
The `embed` folder contains the generated speaker embeddings for each sentence in AISHELL-3, which have the same file structure as the wav files and the format is `.npy`.
The computing time of utterance embedding can be x hours.
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains the speech, pitch, and energy features of each utterance, while the norm folder contains the normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`.
Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains the phones, text_lengths, speech_lengths, durations, the paths of the speech, pitch, and energy features, and the speaker and id of each utterance.
The preprocessing step is very similar to that of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but there is one more `ge2e/inference` step here.
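To illustrate what the preprocessing produces, a small inspection sketch is shown below; the utterance id and the exact field names are assumptions based on the description above.
```python
# Quick inspection of the dump folder described above.
# The utterance id and field names are illustrative assumptions.
import json
import numpy as np

emb = np.load("dump/embed/SSB0005/SSB00050001.npy")   # hypothetical utterance id
print("speaker embedding shape:", emb.shape)

with open("dump/train/norm/metadata.jsonl", encoding="utf-8") as f:
    first = json.loads(f.readline())                  # one JSON object per line
print(sorted(first.keys()))                           # e.g. durations, phones, speaker, ...
```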
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
The training step is very similar to that of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/train.py`.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_aishell3_ckpt_0.5.zip
```
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_aishell3_ckpt_0.5
├── default.yaml # default config used to train parallel wavegan
├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
```
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
The synthesizing step is very similar to that of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/synthesize.py`.
### Voice Cloning
Assume there are some reference audios in `./ref_audio`
```text
ref_audio
├── 001238.wav
├── LJ015-0254.wav
└── audio_self_test.mp3
```
`./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py`
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
```
## Pretrained Model
[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
The FastSpeech2 checkpoint contains the files listed below (there is no need for `speaker_id_map.txt` here).
```text
fastspeech2_nosil_aishell3_ckpt_vc1_0.5
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_96400.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```

@ -0,0 +1,105 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # sr
n_fft: 2048 # FFT size.
n_shift: 300 # Hop size.
win_length: 1200 # Window length.
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 2
###########################################################
# MODEL SETTING #
###########################################################
model:
adim: 384 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
init_type: xavier_uniform # initialization type
init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type
###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 0.001 # learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 200
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 10086

@ -0,0 +1,86 @@
#!/bin/bash
stage=0
stop_stage=100
config_path=$1
ge2e_ckpt_path=$2
# gen speaker embedding
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
--input=~/datasets/data_aishell3/train/wav/ \
--output=dump/embed \
--checkpoint_path=${ge2e_ckpt_path}
fi
# copy from tts3/preprocess
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# get durations from MFA's result
echo "Generate durations.txt from MFA results ..."
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./aishell3_alignment_tone \
--output durations.txt \
--config=${config_path}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/preprocess.py \
--dataset=aishell3 \
--rootdir=~/datasets/data_aishell3/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True \
--spk_emb_dir=dump/embed
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# get features' stats(mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="speech"
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="pitch"
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="energy"
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# normalize and convert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
--pitch-stats=dump/train/pitch_stats.npy \
--energy-stats=dump/train/energy_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
--pitch-stats=dump/train/pitch_stats.npy \
--energy-stats=dump/train/energy_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
--pitch-stats=dump/train/pitch_stats.npy \
--energy-stats=dump/train/energy_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
fi

@ -0,0 +1,19 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
--pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=${train_output_path}/test \
--phones-dict=dump/phone_id_map.txt \
--voice-cloning=True

@ -0,0 +1,13 @@
#!/bin/bash
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=2 \
--phones-dict=dump/phone_id_map.txt \
--voice-cloning=True

@ -0,0 +1,22 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
ge2e_params_path=$4
ref_audio_dir=$5
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/voice_cloning.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
--pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--ge2e_params_path=${ge2e_params_path} \
--text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
--input-dir=${ref_audio_dir} \
--output-dir=${train_output_path}/vc_syn \
--phones-dict=dump/phone_id_map.txt

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,44 @@
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz
ref_audio_dir=ref_audio
# not include ".pdparams" here
ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
# include ".pdparams" here
ge2e_params_path=${ge2e_ckpt_path}.pdparams
# With the following command, you can choose the stage range you want to run,
# such as `./run.sh --stage 0 --stop-stage 0`.
# This cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# voice cloning, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} || exit -1
fi

@ -22,7 +22,7 @@ Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.

@ -14,7 +14,7 @@ Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
@ -208,7 +208,8 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
```
## Pretrained Model
Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
Pretrained SpeedySpeech model with no silence in the edge of audios [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip).
Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip).
SpeedySpeech checkpoint contains files listed below.

@ -14,7 +14,7 @@ Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
@ -199,8 +199,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
```
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)
Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip).
Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip).
FastSpeech2 checkpoint contains files listed below.
```text

@ -13,7 +13,7 @@ Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
@ -123,7 +123,8 @@ optional arguments:
## Pretrained Models
Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
Static models can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip).
Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip).
Parallel WaveGAN checkpoint contains files listed below.

@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
batch_size: 8 # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
num_workers: 4 # Number of workers in Pytorch DataLoader.
num_workers: 2 # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.

@ -13,7 +13,7 @@ Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
@ -106,8 +106,51 @@ optional arguments:
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Finetune
Since there is no `noise` in the input of Multi Band MelGAN, the audio quality is not so good (see [espnet issue](https://github.com/espnet/espnet/issues/3536#issuecomment-916035415)). Therefore, we refer to the method proposed in [HiFiGAN](https://arxiv.org/abs/2010.05646) and finetune Multi Band MelGAN with the predicted mel-spectrograms from `FastSpeech2`.
The length of the mel-spectrograms should align with the length of the wavs, so we should generate the mels using ground truth alignment.
But since we are fine-tuning, we should use the statistics computed during the training step.
You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it.
Assume the path to the dump-dir of training step is `dump`.
Assume the path to the duration result of CSMSC is `durations.txt` (generated during training step's preprocessing).
Assume the path to the pretrained `FastSpeech2` model is `fastspeech2_nosil_baker_ckpt_0.4`.
\
The `finetune.sh` can
1. **source path**.
2. generate ground truth alignment mels.
3. link `*_wave.npy` from `dump` to `dump_finetune` (because we only use new mels, the wavs are the ones used during the training step).
4. copy features' stats from `dump` to `dump_finetune`.
5. normalize the ground truth alignment mels.
6. finetune the model.
Before finetuning, make sure that the pretrained model is in `finetune.sh`'s `${output-dir}/checkpoints`, and that there is a `records.jsonl` in it that refers to this pretrained model.
```text
exp/finetune/checkpoints
├── records.jsonl
└── snapshot_iter_1000000.pdz
```
The content of `records.jsonl` should be as follows (change `"path"` to your own ckpt path):
```
{"time": "2021-11-21 15:11:20.337311", "path": "~/PaddleSpeech/examples/csmsc/voc3/exp/finetune/checkpoints/snapshot_iter_1000000.pdz", "iteration": 1000000}↩
```
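A minimal sketch for staging the pretrained checkpoint and writing this record is shown below; the source path of the pretrained snapshot is an assumption, and the iteration number follows the example above.
```python
# Sketch only: stage the pretrained checkpoint for finetuning and write records.jsonl.
# The source checkpoint path is assumed; adjust it to where you unzipped the model.
import json
import shutil
from datetime import datetime
from pathlib import Path

ckpt_src = Path("mb_melgan_baker_ckpt_0.5/snapshot_iter_1000000.pdz")  # assumed location
ckpt_dir = Path("exp/finetune/checkpoints")
ckpt_dir.mkdir(parents=True, exist_ok=True)

ckpt_dst = ckpt_dir / ckpt_src.name
shutil.copy(ckpt_src, ckpt_dst)

record = {"time": str(datetime.now()), "path": str(ckpt_dst.resolve()), "iteration": 1000000}
(ckpt_dir / "records.jsonl").write_text(json.dumps(record) + "\n", encoding="utf-8")
```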
Run the command below
```bash
./finetune.sh
```
By default, `finetune.sh` uses `conf/finetune.yaml` as the config, `dump_finetune` as the dump dir, and `exp/finetune` as the experiment dir.
TODO:
The hyperparameters in `finetune.yaml` are not good enough; a smaller `learning_rate` should be used (and more `milestones` should be set).
## Pretrained Models
Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip).
Finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_finetune_ckpt_0.5.zip).
Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip).
Multi Band MelGAN checkpoint contains files listed below.

@ -128,7 +128,7 @@ discriminator_scheduler_params:
# INTERVAL SETTING #
###########################################################
discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
train_max_steps: 1200000 # Number of training steps.
train_max_steps: 2000000 # Number of training steps.
save_interval_steps: 1000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
@ -136,4 +136,4 @@ eval_interval_steps: 1000 # Interval steps to evaluate the network
# OTHER SETTING #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random

@ -9,20 +9,19 @@ stop_stage=100
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \
--fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--dur-file=durations.txt \
--output-dir=dump_finetune \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 local/link_wav.py \
--old-dump-dir=dump \
--dump-dir=dump_finetune
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@ -51,13 +50,13 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} \
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ${BIN_DIR}/train.py \
--train-metadata=dump_finetune/train/norm/metadata.jsonl \
--dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
--config=conf/finetune.yaml \
--output-dir=exp/finetune \
--ngpu=1
fi

@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# test a single .wav file

@ -1,6 +1,8 @@
# https://yaml.org/type/float.html
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:

@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# export ckpt avg_n
./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# # export ckpt avg_n
# ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
./local/cacu_perplexity.sh || exit -1

@ -12,7 +12,7 @@ tar xjvf LJSpeech-1.1.tar.bz2
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize mels.
```bash

@ -12,7 +12,7 @@ tar xjvf LJSpeech-1.1.tar.bz2
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.

@ -14,7 +14,7 @@ Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.

@ -13,7 +13,7 @@ Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Assume the path to the Tacotron2 generated mels is `../tts0/output/test`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs from mels.
```bash

@ -12,7 +12,7 @@ Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=ge2e
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}

@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi

@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi

@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi

@ -17,7 +17,7 @@ Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`.
Assume the path to the MFA result of VCTK is `./vctk_alignment`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.

@ -17,7 +17,7 @@ Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`.
Assume the path to the MFA result of VCTK is `./vctk_alignment`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.

@ -126,8 +126,12 @@ decoders_module = [
]
setup(
name='swig_decoders',
version='1.1',
description="""CTC decoders""",
name='paddlespeech_ctcdecoders',
version='0.0.1a',
description="CTC decoders in paddlespeech",
author="PaddlePaddle Speech and Language Team",
author_email="paddlesl@baidu.com",
url="https://github.com/PaddlePaddle/PaddleSpeech",
license='Apache 2.0',
ext_modules=decoders_module,
py_modules=['swig_decoders'], )
py_modules=['swig_decoders'])
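For clarity, the rename above only changes the distribution name: `py_modules=['swig_decoders']` keeps the importable module name the same, so existing imports keep working. A small hedged sketch (the try/except is only there so it runs whether or not the extension has been built):

```python
# Illustrative only: the pip-visible distribution is now "paddlespeech_ctcdecoders",
# but py_modules=['swig_decoders'] keeps the import name unchanged.
try:
    import swig_decoders  # available only after building/installing the extension
    print(swig_decoders.__name__)
except ImportError:
    print("swig_decoders not built yet")
```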

@ -39,6 +39,8 @@ if __name__ == "__main__":
"--export_path", type=str, help="path of the jit model to save")
parser.add_argument(
"--model_type", type=str, default='offline', help='offline/online')
parser.add_argument(
"--enable-auto-log", action="store_true", help="use auto log")
args = parser.parse_args()
print_arguments(args, globals())
print("model_type:{}".format(args.model_type))
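The new `--enable-auto-log` flag gates all of the benchmarking work in the testers below. A minimal, self-contained sketch of that pattern, using a plain timestamp list as a stand-in for the real `Autolog` object (whose API is not reproduced here):

```python
# Stand-in sketch: timing is only collected when the flag is passed, so the
# default decode path pays no benchmarking overhead.
import argparse
import time

parser = argparse.ArgumentParser()
parser.add_argument("--enable-auto-log", action="store_true", help="use auto log")
args = parser.parse_args([])          # no flag given -> args.enable_auto_log is False

timings = []
if args.enable_auto_log:
    timings.append(("start", time.time()))
# ... run inference here ...
if args.enable_auto_log:
    timings.append(("end", time.time()))
print(timings)                        # [] when auto log is disabled
```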

@ -42,7 +42,6 @@ from paddlespeech.s2t.training.trainer import Trainer
from paddlespeech.s2t.utils import error_rate
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.log import Autolog
from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import UpdateConfig
@ -339,8 +338,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
error_rate_type=cfg.error_rate_type)
def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
self.autolog.times.start()
self.autolog.times.stamp()
result_transcripts = self.model.decode(
audio,
audio_len,
@ -354,19 +351,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
cutoff_top_n=cfg.cutoff_top_n,
num_processes=cfg.num_proc_bsearch)
self.autolog.times.stamp()
self.autolog.times.stamp()
self.autolog.times.end()
return result_transcripts
@mp_tools.rank_zero_only
@paddle.no_grad()
def test(self):
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
self.autolog = Autolog(
batch_size=self.config.decoding.batch_size,
model_name="deepspeech2",
model_precision="fp32").getlog()
self.model.eval()
cfg = self.config
error_rate_type = None
@ -390,7 +380,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
msg += "Final error rate [%s] (%d/%d) = %f" % (
error_rate_type, num_ins, num_ins, errors_sum / len_refs)
logger.info(msg)
self.autolog.report()
@paddle.no_grad()
def export(self):
@ -414,6 +403,43 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
def __init__(self, config, args):
super().__init__(config, args)
self.apply_static = True
self.args = args
@mp_tools.rank_zero_only
@paddle.no_grad()
def test(self):
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
if self.args.enable_auto_log:
from paddlespeech.s2t.utils.log import Autolog
self.autolog = Autolog(
batch_size=self.config.decoding.batch_size,
model_name="deepspeech2",
model_precision="fp32").getlog()
self.model.eval()
cfg = self.config
error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0
with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
utts, audio, audio_len, texts, texts_len = batch
metrics = self.compute_metrics(utts, audio, audio_len, texts,
texts_len, fout)
errors_sum += metrics['errors_sum']
len_refs += metrics['len_refs']
num_ins += metrics['num_ins']
error_rate_type = metrics['error_rate_type']
logger.info("Error rate [%s] (%d/?) = %f" %
(error_rate_type, num_ins, errors_sum / len_refs))
# logging
msg = "Test: "
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "Final error rate [%s] (%d/%d) = %f" % (
error_rate_type, num_ins, num_ins, errors_sum / len_refs)
logger.info(msg)
if self.args.enable_auto_log:
self.autolog.report()
def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
if self.args.model_type == "online":
@ -486,8 +512,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
x_len_list = np.split(x_len_batch, batch_size, axis=0)
for x, x_len in zip(x_list, x_len_list):
self.autolog.times.start()
self.autolog.times.stamp()
if self.args.enable_auto_log:
self.autolog.times.start()
x_len = x_len[0]
assert (chunk_size <= x_len)
@ -521,6 +547,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
probs_chunk_list = []
probs_chunk_lens_list = []
if self.args.enable_auto_log:
# record the model preprocessing time
self.autolog.times.stamp()
for i in range(0, num_chunk):
start = i * chunk_stride
end = start + chunk_size
@ -576,9 +606,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
[output_probs, output_probs_padding], axis=1)
output_probs_list.append(output_probs)
output_lens_list.append(output_lens)
self.autolog.times.stamp()
self.autolog.times.stamp()
self.autolog.times.end()
if self.args.enable_auto_log:
# record the model inference time
self.autolog.times.stamp()
# record the post processing time
self.autolog.times.stamp()
self.autolog.times.end()
output_probs = np.concatenate(output_probs_list, axis=0)
output_lens = np.concatenate(output_lens_list, axis=0)
return output_probs, output_lens
@ -608,12 +641,17 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
audio_len_handle.reshape(x_len.shape)
audio_len_handle.copy_from_cpu(x_len)
self.autolog.times.start()
self.autolog.times.stamp()
if self.args.enable_auto_log:
self.autolog.times.start()
# record the prefix processing time
self.autolog.times.stamp()
self.predictor.run()
self.autolog.times.stamp()
self.autolog.times.stamp()
self.autolog.times.end()
if self.args.enable_auto_log:
# record the model inference time
self.autolog.times.stamp()
# record the post processing time
self.autolog.times.stamp()
self.autolog.times.end()
output_names = self.predictor.get_output_names()
output_handle = self.predictor.get_output_handle(output_names[0])
@ -624,11 +662,11 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
def setup_model(self):
super().setup_model()
speedyspeech_config = inference.Config(
deepspeech_config = inference.Config(
self.args.export_path + ".pdmodel",
self.args.export_path + ".pdiparams")
if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
speedyspeech_config.enable_use_gpu(100, 0)
speedyspeech_config.enable_memory_optim()
speedyspeech_predictor = inference.create_predictor(speedyspeech_config)
self.predictor = speedyspeech_predictor
deepspeech_config.enable_use_gpu(100, 0)
deepspeech_config.enable_memory_optim()
deepspeech_predictor = inference.create_predictor(deepspeech_config)
self.predictor = deepspeech_predictor

@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
# cmvn
if configs['cmvn_file'] is not None:
if 'cmvn_file' in configs and configs['cmvn_file'] is not None:
mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type'])
global_cmvn = GlobalCMVN(

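The extra `'cmvn_file' in configs` check matters because the key may be missing from a config altogether, not just set to null. A small sketch of both cases, with plain dicts standing in for the parsed YAML config:

```python
# Both of these must skip GlobalCMVN: key absent, or key present but None.
configs = {"cmvn_file_type": "json"}                 # no "cmvn_file" key at all
print("cmvn_file" in configs and configs["cmvn_file"] is not None)   # False

configs["cmvn_file"] = None                          # key present, value null
print("cmvn_file" in configs and configs["cmvn_file"] is not None)   # False

configs["cmvn_file"] = "data/mean_std.json"          # illustrative path
print("cmvn_file" in configs and configs["cmvn_file"] is not None)   # True
```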
@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples):
def fastspeech2_multi_spk_batch_fn(examples):
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"]
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"]
text = [np.array(item["text"], dtype=np.int64) for item in examples]
speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
speech_lengths = [
np.array(item["speech_lengths"], dtype=np.int64) for item in examples
]
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
text = batch_sequences(text)
pitch = batch_sequences(pitch)
@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
energy = paddle.to_tensor(energy)
text_lengths = paddle.to_tensor(text_lengths)
speech_lengths = paddle.to_tensor(speech_lengths)
spk_id = paddle.to_tensor(spk_id)
batch = {
"text": text,
@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples):
"speech": speech,
"speech_lengths": speech_lengths,
"pitch": pitch,
"energy": energy,
"spk_id": spk_id
"energy": energy
}
# spk_emb has a higher priority than spk_id
if "spk_emb" in examples[0]:
spk_emb = [
np.array(item["spk_emb"], dtype=np.float32) for item in examples
]
spk_emb = batch_sequences(spk_emb)
spk_emb = paddle.to_tensor(spk_emb)
batch["spk_emb"] = spk_emb
elif "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = paddle.to_tensor(spk_id)
batch["spk_id"] = spk_id
return batch
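A tiny sketch of the priority rule introduced above, isolated from the rest of the collate function so it can run on its own (`pick_speaker_field` is a made-up helper that mirrors the `if/elif` chain, not part of the codebase):

```python
import numpy as np

def pick_speaker_field(example):
    # mirrors the branch above: "spk_emb" has a higher priority than "spk_id"
    if "spk_emb" in example:
        return "spk_emb", np.asarray(example["spk_emb"], dtype=np.float32)
    elif "spk_id" in example:
        return "spk_id", np.asarray(example["spk_id"], dtype=np.int64)
    return None, None

print(pick_speaker_field({"spk_id": 3, "spk_emb": np.zeros(256)})[0])  # spk_emb
print(pick_speaker_field({"spk_id": 3})[0])                            # spk_id
```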

@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print("vocab_size:", vocab_size)
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
print("num_speakers:", num_speakers)
spk_num = len(spk_id)
print("spk_num:", spk_num)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(

@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print("vocab_size:", vocab_size)
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
print("num_speakers:", num_speakers)
spk_num = len(spk_id)
print("spk_num:", spk_num)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(

@ -167,6 +167,10 @@ def main():
"pitch": str(pitch_path),
"energy": str(energy_path)
}
# add spk_emb for voice cloning
if "spk_emb" in item:
record["spk_emb"] = str(item["spk_emb"])
output_metadata.append(record)
output_metadata.sort(key=itemgetter('utt_id'))
output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"

@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any],
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
cut_sil: bool=True):
cut_sil: bool=True,
spk_emb_dir: Path=None):
utt_id = fp.stem
# for vctk
if utt_id.endswith("_mic2"):
@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any],
"energy": str(energy_path),
"speaker": speaker
}
if spk_emb_dir:
if speaker in os.listdir(spk_emb_dir):
embed_name = utt_id + ".npy"
embed_path = spk_emb_dir / speaker / embed_name
if embed_path.is_file():
record["spk_emb"] = str(embed_path)
else:
return None
return record
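The lookup above implies a simple on-disk layout: one sub-directory per speaker under `--spk_emb_dir`, with one `<utt_id>.npy` file per utterance, and any utterance without an embedding file is dropped from the metadata. A hedged sketch (speaker and utterance names are invented):

```python
from pathlib import Path

def find_spk_emb(spk_emb_dir: Path, speaker: str, utt_id: str):
    # mirrors the check above: <spk_emb_dir>/<speaker>/<utt_id>.npy must exist
    embed_path = spk_emb_dir / speaker / f"{utt_id}.npy"
    return str(embed_path) if embed_path.is_file() else None

print(find_spk_emb(Path("dump/spk_emb"), "SSB0005", "SSB00050001"))  # None if absent
```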
@ -127,13 +136,14 @@ def process_sentences(config,
pitch_extractor=None,
energy_extractor=None,
nprocs: int=1,
cut_sil: bool=True):
cut_sil: bool=True,
spk_emb_dir: Path=None):
if nprocs == 1:
results = []
for fp in fps:
record = process_sentence(config, fp, sentences, output_dir,
mel_extractor, pitch_extractor,
energy_extractor, cut_sil)
energy_extractor, cut_sil, spk_emb_dir)
if record:
results.append(record)
else:
@ -144,7 +154,7 @@ def process_sentences(config,
future = pool.submit(process_sentence, config, fp,
sentences, output_dir, mel_extractor,
pitch_extractor, energy_extractor,
cut_sil)
cut_sil, spk_emb_dir)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
@ -202,6 +212,11 @@ def main():
default=True,
help="whether to cut silence (sil) at the edges of the audio")
parser.add_argument(
"--spk_emb_dir",
default=None,
type=str,
help="directory of speaker embedding files.")
args = parser.parse_args()
rootdir = Path(args.rootdir).expanduser()
@ -211,6 +226,11 @@ def main():
dumpdir.mkdir(parents=True, exist_ok=True)
dur_file = Path(args.dur_file).expanduser()
if args.spk_emb_dir:
spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
else:
spk_emb_dir = None
assert rootdir.is_dir()
assert dur_file.is_file()
@ -251,6 +271,7 @@ def main():
test_wav_files += wav_files[-sub_num_dev:]
else:
train_wav_files += wav_files
elif args.dataset == "ljspeech":
wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
# split data into 3 sections
@ -317,7 +338,8 @@ def main():
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if dev_wav_files:
process_sentences(
config,
@ -327,7 +349,8 @@ def main():
mel_extractor,
pitch_extractor,
energy_extractor,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
process_sentences(
config,
@ -338,7 +361,8 @@ def main():
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if __name__ == "__main__":

@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config):
fields = ["utt_id", "text"]
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("voice cloning!")
fields += ["spk_emb"]
else:
print("single speaker fastspeech2!")
num_speakers = None
print("num_speakers:", num_speakers)
print("spk_num:", spk_num)
test_dataset = DataTable(data=test_metadata, fields=fields)
@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(
@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config):
for datum in test_dataset:
utt_id = datum["utt_id"]
text = paddle.to_tensor(datum["text"])
if "spk_id" in datum:
spk_emb = None
spk_id = None
if args.voice_cloning and "spk_emb" in datum:
spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
elif "spk_id" in datum:
spk_id = paddle.to_tensor(datum["spk_id"])
else:
spk_id = None
with paddle.no_grad():
wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id))
wav = pwg_inference(
fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb))
sf.write(
str(output_dir / (utt_id + ".wav")),
wav.numpy(),
@ -142,6 +148,15 @@ def main():
type=str,
default=None,
help="speaker id map file for multiple speaker model.")
def str2bool(s):
    return s.lower() == 'true'
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether to train a voice cloning model.")
parser.add_argument("--test-metadata", type=str, help="test metadata.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(

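The `str2bool` helper added above is needed because argparse hands the raw command-line string to `type`, and `bool("False")` is `True`, so a plain `type=bool` could never switch the option off. A self-contained sketch:

```python
import argparse

def str2bool(s):
    # argparse passes the raw string; compare it explicitly
    return s.lower() == 'true'

parser = argparse.ArgumentParser()
parser.add_argument("--voice-cloning", type=str2bool, default=False)
print(parser.parse_args(["--voice-cloning=True"]).voice_cloning)   # True
print(parser.parse_args(["--voice-cloning=False"]).voice_cloning)  # False
print(parser.parse_args([]).voice_cloning)                         # False (default)
```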
@ -61,18 +61,24 @@ def train_sp(args, config):
"text", "text_lengths", "speech", "speech_lengths", "durations",
"pitch", "energy"
]
converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
collate_fn = fastspeech2_multi_spk_batch_fn
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("Training voice cloning!")
collate_fn = fastspeech2_multi_spk_batch_fn
fields += ["spk_emb"]
converters["spk_emb"] = np.load
else:
print("single speaker fastspeech2!")
collate_fn = fastspeech2_single_spk_batch_fn
num_speakers = None
print("num_speakers:", num_speakers)
print("spk_num:", spk_num)
# dataloader has been too verbose
logging.getLogger("DataLoader").disabled = True
@ -83,17 +89,13 @@ def train_sp(args, config):
train_dataset = DataTable(
data=train_metadata,
fields=fields,
converters={"speech": np.load,
"pitch": np.load,
"energy": np.load}, )
converters=converters, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=fields,
converters={"speech": np.load,
"pitch": np.load,
"energy": np.load}, )
converters=converters, )
# collate function and dataloader
@ -127,10 +129,7 @@ def train_sp(args, config):
odim = config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
**config["model"])
idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
if world_size > 1:
model = DataParallel(model)
print("model done!")
@ -184,6 +183,15 @@ def main():
default=None,
help="speaker id map file for multiple speaker model.")
def str2bool(s):
    return s.lower() == 'true'
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether to train a voice cloning model.")
args = parser.parse_args()
with open(args.config) as f:

@ -0,0 +1,208 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
def voice_cloning(args, fastspeech2_config, pwg_config):
# speaker encoder
p = SpeakerVerificationPreprocessor(
sampling_rate=16000,
audio_norm_target_dBFS=-30,
vad_window_length=30,
vad_moving_average_width=8,
vad_max_silence_length=6,
mel_window_length=25,
mel_window_step=10,
n_mels=40,
partial_n_frames=160,
min_pad_coverage=0.75,
partial_overlap_ratio=0.5)
print("Audio Processor Done!")
speaker_encoder = LSTMSpeakerEncoder(
n_mels=40, num_layers=3, hidden_size=256, output_size=256)
speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
speaker_encoder.eval()
print("GE2E Done!")
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"])
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
model.eval()
vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()
print("model done!")
frontend = Frontend(phone_vocab_path=args.phones_dict)
print("frontend done!")
stat = np.load(args.fastspeech2_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
fastspeech2_normalizer = ZScore(mu, std)
stat = np.load(args.pwg_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
fastspeech2_inference.eval()
pwg_inference = PWGInference(pwg_normalizer, vocoder)
pwg_inference.eval()
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
input_dir = Path(args.input_dir)
sentence = args.text
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"][0]
for name in os.listdir(input_dir):
utt_id = name.split(".")[0]
ref_audio_path = input_dir / name
mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
# print("mel_sequences: ", mel_sequences.shape)
with paddle.no_grad():
spk_emb = speaker_encoder.embed_utterance(
paddle.to_tensor(mel_sequences))
# print("spk_emb shape: ", spk_emb.shape)
with paddle.no_grad():
wav = pwg_inference(
fastspeech2_inference(phone_ids, spk_emb=spk_emb))
sf.write(
str(output_dir / (utt_id + ".wav")),
wav.numpy(),
samplerate=fastspeech2_config.fs)
print(f"{utt_id} done!")
# Randomly generate a speaker embedding with values in [0, 0.2); 256 is the dim of spk_emb
random_spk_emb = np.random.rand(256) * 0.2
random_spk_emb = paddle.to_tensor(random_spk_emb)
utt_id = "random_spk_emb"
with paddle.no_grad():
wav = pwg_inference(fastspeech2_inference(phone_ids, spk_emb=random_spk_emb))
sf.write(
str(output_dir / (utt_id + ".wav")),
wav.numpy(),
samplerate=fastspeech2_config.fs)
print(f"{utt_id} done!")
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="")
parser.add_argument(
"--fastspeech2-config", type=str, help="fastspeech2 config file.")
parser.add_argument(
"--fastspeech2-checkpoint",
type=str,
help="fastspeech2 checkpoint to load.")
parser.add_argument(
"--fastspeech2-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
)
parser.add_argument(
"--pwg-config", type=str, help="parallel wavegan config file.")
parser.add_argument(
"--pwg-checkpoint",
type=str,
help="parallel wavegan generator parameters to load.")
parser.add_argument(
"--pwg-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
)
parser.add_argument(
"--phones-dict",
type=str,
default="phone_id_map.txt",
help="phone vocabulary file.")
parser.add_argument(
"--text",
type=str,
default="每当你觉得,想要批评什么人的时候,你切要记着,这个世界上的人,并非都具备你禀有的条件。",
help="text to synthesize; a single line of text.")
parser.add_argument(
"--ge2e_params_path", type=str, help="ge2e params path.")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
parser.add_argument(
"--input-dir",
type=str,
help="input dir of *.wav; the audio will be resampled to 16k.")
parser.add_argument("--output-dir", type=str, help="output dir.")
args = parser.parse_args()
if args.ngpu == 0:
paddle.set_device("cpu")
elif args.ngpu > 0:
paddle.set_device("gpu")
else:
print("ngpu should be >= 0.")
with open(args.fastspeech2_config) as f:
fastspeech2_config = CfgNode(yaml.safe_load(f))
with open(args.pwg_config) as f:
pwg_config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(fastspeech2_config)
print(pwg_config)
voice_cloning(args, fastspeech2_config, pwg_config)
if __name__ == "__main__":
main()

@ -20,14 +20,14 @@ import paddle
import soundfile as sf
from matplotlib import pyplot as plt
from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
from paddlespeech.t2s.utils import display
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
def voice_cloning(args):

@ -98,7 +98,7 @@ class FastSpeech2(nn.Layer):
pitch_embed_dropout: float=0.5,
stop_gradient_from_pitch_predictor: bool=False,
# spk emb
num_speakers: int=None,
spk_num: int=None,
spk_embed_dim: int=None,
spk_embed_integration_type: str="add",
# tone emb
@ -148,9 +148,9 @@ class FastSpeech2(nn.Layer):
# initialize parameters
initialize(self, init_type)
if self.spk_embed_dim is not None:
if spk_num and self.spk_embed_dim:
self.spk_embedding_table = nn.Embedding(
num_embeddings=num_speakers,
num_embeddings=spk_num,
embedding_dim=self.spk_embed_dim,
padding_idx=self.padding_idx)
@ -299,7 +299,7 @@ class FastSpeech2(nn.Layer):
pitch: paddle.Tensor,
energy: paddle.Tensor,
tone_id: paddle.Tensor=None,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
spk_id: paddle.Tensor=None
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
@ -322,7 +322,7 @@ class FastSpeech2(nn.Layer):
Batch of padded token-averaged energy (B, Tmax, 1).
tone_id : Tensor, optional(int64)
Batch of padded tone ids (B, Tmax).
spembs : Tensor, optional
spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
spk_id : Tensor, optional(int64)
Batch of speaker ids (B,)
@ -366,7 +366,7 @@ class FastSpeech2(nn.Layer):
ps,
es,
is_inference=False,
spembs=spembs,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id)
# modify mod part of groundtruth
@ -387,7 +387,7 @@ class FastSpeech2(nn.Layer):
es: paddle.Tensor=None,
is_inference: bool=False,
alpha: float=1.0,
spembs=None,
spk_emb=None,
spk_id=None,
tone_id=None) -> Sequence[paddle.Tensor]:
# forward encoder
@ -397,11 +397,12 @@ class FastSpeech2(nn.Layer):
# integrate speaker embedding
if self.spk_embed_dim is not None:
if spembs is not None:
hs = self._integrate_with_spk_embed(hs, spembs)
# spk_emb has a higher priority than spk_id
if spk_emb is not None:
hs = self._integrate_with_spk_embed(hs, spk_emb)
elif spk_id is not None:
spembs = self.spk_embedding_table(spk_id)
hs = self._integrate_with_spk_embed(hs, spembs)
spk_emb = self.spk_embedding_table(spk_id)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# integrate tone embedding
if self.tone_embed_dim is not None:
@ -489,7 +490,7 @@ class FastSpeech2(nn.Layer):
energy: paddle.Tensor=None,
alpha: float=1.0,
use_teacher_forcing: bool=False,
spembs=None,
spk_emb=None,
spk_id=None,
tone_id=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@ -512,7 +513,7 @@ class FastSpeech2(nn.Layer):
use_teacher_forcing : bool, optional
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
spembs : Tensor, optional
spk_emb : Tensor, optional
Speaker embedding vector (spk_embed_dim,).
spk_id : Tensor, optional(int64)
Batch of padded spk ids (1,).
@ -527,7 +528,6 @@ class FastSpeech2(nn.Layer):
# input of embedding must be int64
x = paddle.cast(text, 'int64')
y = speech
spemb = spembs
d, p, e = durations, pitch, energy
# setup batch axis
ilens = paddle.shape(x)[0]
@ -537,8 +537,8 @@ class FastSpeech2(nn.Layer):
if y is not None:
ys = y.unsqueeze(0)
if spemb is not None:
spembs = spemb.unsqueeze(0)
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
if tone_id is not None:
tone_id = tone_id.unsqueeze(0)
@ -548,7 +548,7 @@ class FastSpeech2(nn.Layer):
ds = d.unsqueeze(0) if d is not None else None
ps = p.unsqueeze(0) if p is not None else None
es = e.unsqueeze(0) if e is not None else None
# ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
# (1, L, odim)
_, outs, d_outs, p_outs, e_outs = self._forward(
xs,
@ -557,7 +557,7 @@ class FastSpeech2(nn.Layer):
ds=ds,
ps=ps,
es=es,
spembs=spembs,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id,
is_inference=True)
@ -569,19 +569,19 @@ class FastSpeech2(nn.Layer):
ys,
is_inference=True,
alpha=alpha,
spembs=spembs,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id)
return outs[0], d_outs[0], p_outs[0], e_outs[0]
def _integrate_with_spk_embed(self, hs, spembs):
def _integrate_with_spk_embed(self, hs, spk_emb):
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
spembs : Tensor
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
@ -591,13 +591,13 @@ class FastSpeech2(nn.Layer):
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spembs = self.spk_projection(F.normalize(spembs))
hs = hs + spembs.unsqueeze(1)
spk_emb = self.spk_projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection
spembs = F.normalize(spembs).unsqueeze(1).expand(
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
shape=[-1, hs.shape[1], -1])
hs = self.spk_projection(paddle.concat([hs, spembs], axis=-1))
hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1))
else:
raise NotImplementedError("support only add or concat.")
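For readers following the rename, a shape-only sketch of the two integration modes handled above, with NumPy standing in for the projection layers (all sizes are illustrative):

```python
import numpy as np

B, Tmax, adim, spk_embed_dim = 2, 7, 384, 256
hs = np.zeros((B, Tmax, adim))
spk_emb = np.zeros((B, spk_embed_dim))

# "add": project spk_emb down to adim, then broadcast-add over the time axis
projected = np.zeros((B, adim))               # stands in for self.spk_projection(...)
hs_add = hs + projected[:, None, :]           # (B, Tmax, adim)

# "concat": repeat spk_emb along time, concatenate, then project back to adim
tiled = np.repeat(spk_emb[:, None, :], Tmax, axis=1)    # (B, Tmax, spk_embed_dim)
hs_cat = np.concatenate([hs, tiled], axis=-1)            # (B, Tmax, adim + spk_embed_dim)

print(hs_add.shape, hs_cat.shape)
```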
@ -682,9 +682,9 @@ class FastSpeech2Inference(nn.Layer):
self.normalizer = normalizer
self.acoustic_model = model
def forward(self, text, spk_id=None):
def forward(self, text, spk_id=None, spk_emb=None):
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, spk_id=spk_id)
text, spk_id=spk_id, spk_emb=spk_emb)
logmel = self.normalizer.inverse(normalized_mel)
return logmel

@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater):
losses_dict = {}
# spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
# No explicit speaker identifier labels are used during voice cloning training.
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"],
@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater):
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id)
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs,
@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
losses_dict = {}
# spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"],
@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator):
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id)
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs,

@ -391,7 +391,7 @@ class TransformerTTS(nn.Layer):
text_lengths: paddle.Tensor,
speech: paddle.Tensor,
speech_lengths: paddle.Tensor,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
@ -405,7 +405,7 @@ class TransformerTTS(nn.Layer):
Batch of padded target features (B, Lmax, odim).
speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,).
spembs : Tensor, optional
spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
Returns
@ -439,7 +439,7 @@ class TransformerTTS(nn.Layer):
# calculate transformer outputs
after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
spembs)
spk_emb)
# modify mod part of groundtruth
@ -467,7 +467,7 @@ class TransformerTTS(nn.Layer):
ilens: paddle.Tensor,
ys: paddle.Tensor,
olens: paddle.Tensor,
spembs: paddle.Tensor,
spk_emb: paddle.Tensor,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
# forward encoder
x_masks = self._source_mask(ilens)
@ -480,7 +480,7 @@ class TransformerTTS(nn.Layer):
# integrate speaker embedding
if self.spk_embed_dim is not None:
hs = self._integrate_with_spk_embed(hs, spembs)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if self.reduction_factor > 1:
@ -514,7 +514,7 @@ class TransformerTTS(nn.Layer):
self,
text: paddle.Tensor,
speech: paddle.Tensor=None,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
threshold: float=0.5,
minlenratio: float=0.0,
maxlenratio: float=10.0,
@ -528,7 +528,7 @@ class TransformerTTS(nn.Layer):
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
spembs : Tensor, optional
spk_emb : Tensor, optional
Speaker embedding vector (spk_embed_dim,).
threshold : float, optional
Threshold in inference.
@ -551,7 +551,6 @@ class TransformerTTS(nn.Layer):
"""
# input of embedding must be int64
y = speech
spemb = spembs
# add eos at the last of sequence
text = numpy.pad(
@ -564,12 +563,12 @@ class TransformerTTS(nn.Layer):
# get teacher forcing outputs
xs, ys = x.unsqueeze(0), y.unsqueeze(0)
spembs = None if spemb is None else spemb.unsqueeze(0)
spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
ilens = paddle.to_tensor(
[xs.shape[1]], dtype=paddle.int64, place=xs.place)
olens = paddle.to_tensor(
[ys.shape[1]], dtype=paddle.int64, place=ys.place)
outs, *_ = self._forward(xs, ilens, ys, olens, spembs)
outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb)
# get attention weights
att_ws = []
@ -590,9 +589,9 @@ class TransformerTTS(nn.Layer):
hs = hs + style_embs.unsqueeze(1)
# integrate speaker embedding
if self.spk_embed_dim is not None:
spembs = spemb.unsqueeze(0)
hs = self._integrate_with_spk_embed(hs, spembs)
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# set limits of length
maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor)
@ -726,14 +725,14 @@ class TransformerTTS(nn.Layer):
def _integrate_with_spk_embed(self,
hs: paddle.Tensor,
spembs: paddle.Tensor) -> paddle.Tensor:
spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
spembs : Tensor
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
@ -744,13 +743,13 @@ class TransformerTTS(nn.Layer):
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spembs = self.projection(F.normalize(spembs))
hs = hs + spembs.unsqueeze(1)
spk_emb = self.projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection
spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1],
-1)
hs = self.projection(paddle.concat([hs, spembs], axis=-1))
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1],
-1)
hs = self.projection(paddle.concat([hs, spk_emb], axis=-1))
else:
raise NotImplementedError("support only add or concat.")

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -19,7 +19,7 @@ from typing import List
import numpy as np
from tqdm import tqdm
from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor):

@ -18,9 +18,9 @@ import numpy as np
import paddle
import tqdm
from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
def embed_utterance(processor, model, fpath_or_wav):

@ -14,14 +14,13 @@
import argparse
from pathlib import Path
from audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
from paddlespeech.t2s.exps.ge2e.dataset_processors import process_aidatatang_200zh
from paddlespeech.t2s.exps.ge2e.dataset_processors import process_librispeech
from paddlespeech.t2s.exps.ge2e.dataset_processors import process_magicdata
from paddlespeech.t2s.exps.ge2e.dataset_processors import process_voxceleb1
from paddlespeech.t2s.exps.ge2e.dataset_processors import process_voxceleb2
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults
from paddlespeech.vector.exps.ge2e.dataset_processors import process_aidatatang_200zh
from paddlespeech.vector.exps.ge2e.dataset_processors import process_librispeech
from paddlespeech.vector.exps.ge2e.dataset_processors import process_magicdata
from paddlespeech.vector.exps.ge2e.dataset_processors import process_voxceleb1
from paddlespeech.vector.exps.ge2e.dataset_processors import process_voxceleb2
if __name__ == "__main__":
parser = argparse.ArgumentParser(

@ -18,7 +18,7 @@ import numpy as np
from paddle.io import BatchSampler
from paddle.io import Dataset
from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle
from paddlespeech.vector.exps.ge2e.random_cycle import random_cycle
class MultiSpeakerMelDataset(Dataset):

@ -19,13 +19,13 @@ from paddle.io import DataLoader
from paddle.nn.clip import ClipGradByGlobalNorm
from paddle.optimizer import Adam
from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.training import default_argument_parser
from paddlespeech.t2s.training import ExperimentBase
from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults
from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import Collate
from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
class Ge2eExperiment(ExperimentBase):

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -1,9 +1,11 @@
ConfigArgParse
coverage
distro
editdistance
g2p_en
g2pM
gpustat
GPUtil
h5py
inflect
jieba
@ -16,20 +18,24 @@ matplotlib
nara_wpe
nltk
numba
numpy==1.20.0
paddlespeech_ctcdecoders
paddlespeech_feat
pandas
phkit
Pillow
praatio~=4.1
pre-commit
psutil
pybind11
pynvml
pypi-kenlm
pypinyin
python-dateutil
pyworld
resampy==0.2.2
sacrebleu
scipy
sentencepiece
sentencepiece~=0.1.96
snakeviz
soundfile~=0.10
sox
@ -40,13 +46,7 @@ timer
tqdm
typeguard
unidecode
visualdl==2.2.0
visualdl
webrtcvad
yacs
yq
pypi-kenlm
GPUtil
psutil
pynvml
distro

@ -78,7 +78,6 @@ def _post_install(install_lib_dir):
print(os.getcwd())
check_call("./install_autolog.sh")
print("autolog install.")
# ctcdecoder
ctcdecoder_dir = HERE / 'paddlespeech/s2t/decoders/ctcdecoder/swig'
with pushd(ctcdecoder_dir):
@ -102,11 +101,9 @@ class DevelopCommand(develop):
class InstallCommand(install):
def run(self):
install.run(self)
# must after install.run, or pkg install by shell will not see
self.execute(_post_install, (self.install_lib, ), msg="Post Install...")
# cmd: python setup.py upload
# cmd: python setup.py upload
class UploadCommand(Command):
description = "Build and publish the package."
user_options = []
@ -133,11 +130,11 @@ class UploadCommand(Command):
setup_info = dict(
# Metadata
name='paddlespeech',
version='2.1.2',
author='PaddleSL Speech Team',
author_email='',
url='https://github.com/PaddlePaddle/DeepSpeech',
license='Apache 2',
version='0.0.1a',
author='PaddlePaddle Speech and Language Team',
author_email='paddlesl@baidu.com',
url='https://github.com/PaddlePaddle/PaddleSpeech',
license='Apache 2.0',
description='Speech tools and models based on Paddlepaddle',
long_description=read("README.md"),
long_description_content_type="text/markdown",
@ -175,11 +172,11 @@ setup_info = dict(
},
# Package info
packages=find_packages(exclude=('tests', 'tests.*', 'examples*',
packages=find_packages(exclude=('utils', 'tests', 'tests.*', 'examples*',
'paddleaudio*', 'third_party*', 'tools*')),
zip_safe=True,
classifiers=[
'Development Status :: 4 - Beta',
'Development Status :: 3 - Alpha',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
@ -189,6 +186,7 @@ setup_info = dict(
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
], )
setup(**setup_info)

@ -3,12 +3,16 @@ try:
except ImportError:
from distutils.core import setup
setup(name='python_speech_features',
version='0.6',
description='Python Speech Feature extraction',
author='James Lyons',
author_email='james.lyons0@gmail.com',
with open("requirements.txt", encoding="utf-8-sig") as f:
requirements = f.readlines()
setup(name='paddlespeech_feat',
version='0.0.1a',
description='python speech feature extraction in paddlespeech',
install_requires=requirements,
author="PaddlePaddle Speech and Language Team",
author_email="paddlesl@baidu.com",
license='MIT',
url='https://github.com/jameslyons/python_speech_features',
url='https://github.com/PaddlePaddle/PaddleSpeech',
packages=['python_speech_features'],
)
