Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSpeech into ted_en_zh_t0
commit
5b5c73f9bb
@ -1,15 +0,0 @@
|
||||
unset GREP_OPTIONS
|
||||
|
||||
# https://zhuanlan.zhihu.com/p/33050965
|
||||
alias nvs='nvidia-smi'
|
||||
alias his='history'
|
||||
alias jobs='jobs -l'
|
||||
alias ports='netstat -tulanp'
|
||||
alias wget='wget -c'
|
||||
|
||||
## Colorize the grep command output for ease of use (good for log files)##
|
||||
alias grep='grep --color=auto'
|
||||
alias egrep='egrep --color=auto'
|
||||
alias fgrep='fgrep --color=auto'
|
||||
|
||||
|
@ -1,468 +0,0 @@
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Maintainer:
|
||||
" Amir Salihefendic — @amix3k
|
||||
"
|
||||
" Awesome_version:
|
||||
" Get this config, nice color schemes and lots of plugins!
|
||||
"
|
||||
" Install the awesome version from:
|
||||
"
|
||||
" https://github.com/amix/vimrc
|
||||
"
|
||||
" Sections:
|
||||
" -> General
|
||||
" -> VIM user interface
|
||||
" -> Colors and Fonts
|
||||
" -> Files and backups
|
||||
" -> Text, tab and indent related
|
||||
" -> Visual mode related
|
||||
" -> Moving around, tabs and buffers
|
||||
" -> Status line
|
||||
" -> Editing mappings
|
||||
" -> vimgrep searching and cope displaying
|
||||
" -> Spell checking
|
||||
" -> Misc
|
||||
" -> Helper functions
|
||||
"
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => General
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Sets how many lines of history VIM has to remember
|
||||
set history=500
|
||||
|
||||
" Enable filetype plugins
|
||||
filetype plugin on
|
||||
filetype indent on
|
||||
|
||||
" Set to auto read when a file is changed from the outside
|
||||
set autoread
|
||||
au FocusGained,BufEnter * checktime
|
||||
|
||||
" With a map leader it's possible to do extra key combinations
|
||||
" like <leader>w saves the current file
|
||||
let mapleader = ","
|
||||
|
||||
" Fast saving
|
||||
nmap <leader>w :w!<cr>
|
||||
|
||||
" :W sudo saves the file
|
||||
" (useful for handling the permission-denied error)
|
||||
command! W execute 'w !sudo tee % > /dev/null' <bar> edit!
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => VIM user interface
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Set 7 lines to the cursor - when moving vertically using j/k
|
||||
set so=7
|
||||
|
||||
" Avoid garbled characters in Chinese language windows OS
|
||||
let $LANG='en'
|
||||
set langmenu=en
|
||||
source $VIMRUNTIME/delmenu.vim
|
||||
source $VIMRUNTIME/menu.vim
|
||||
|
||||
" Turn on the Wild menu
|
||||
set wildmenu
|
||||
|
||||
" Ignore compiled files
|
||||
set wildignore=*.o,*~,*.pyc
|
||||
if has("win16") || has("win32")
|
||||
set wildignore+=.git\*,.hg\*,.svn\*
|
||||
else
|
||||
set wildignore+=*/.git/*,*/.hg/*,*/.svn/*,*/.DS_Store
|
||||
endif
|
||||
|
||||
"Always show current position
|
||||
set ruler
|
||||
|
||||
" Height of the command bar
|
||||
set cmdheight=1
|
||||
|
||||
" A buffer becomes hidden when it is abandoned
|
||||
set hid
|
||||
|
||||
" Configure backspace so it acts as it should act
|
||||
set backspace=eol,start,indent
|
||||
set whichwrap+=<,>,h,l
|
||||
|
||||
" Ignore case when searching
|
||||
set ignorecase
|
||||
|
||||
" When searching try to be smart about cases
|
||||
set smartcase
|
||||
|
||||
" Highlight search results
|
||||
set hlsearch
|
||||
|
||||
" Makes search act like search in modern browsers
|
||||
set incsearch
|
||||
|
||||
" Don't redraw while executing macros (good performance config)
|
||||
set lazyredraw
|
||||
|
||||
" For regular expressions turn magic on
|
||||
set magic
|
||||
|
||||
" Show matching brackets when text indicator is over them
|
||||
set showmatch
|
||||
" How many tenths of a second to blink when matching brackets
|
||||
set mat=2
|
||||
|
||||
" No annoying sound on errors
|
||||
set noerrorbells
|
||||
set novisualbell
|
||||
set t_vb=
|
||||
set tm=500
|
||||
|
||||
" Properly disable sound on errors on MacVim
|
||||
if has("gui_macvim")
|
||||
autocmd GUIEnter * set vb t_vb=
|
||||
endif
|
||||
|
||||
|
||||
" Add a bit extra margin to the left
|
||||
set foldcolumn=1
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => Colors and Fonts
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Enable syntax highlighting
|
||||
syntax enable
|
||||
|
||||
" Enable 256 colors palette in Gnome Terminal
|
||||
if $COLORTERM == 'gnome-terminal'
|
||||
set t_Co=256
|
||||
endif
|
||||
|
||||
try
|
||||
colorscheme desert
|
||||
catch
|
||||
endtry
|
||||
|
||||
set background=dark
|
||||
|
||||
" Set extra options when running in GUI mode
|
||||
if has("gui_running")
|
||||
set guioptions-=T
|
||||
set guioptions-=e
|
||||
set t_Co=256
|
||||
set guitablabel=%M\ %t
|
||||
endif
|
||||
|
||||
" Set utf8 as standard encoding and en_US as the standard language
|
||||
set encoding=utf8
|
||||
set fileencodings=ucs-bom,utf-8,cp936
|
||||
set fileencoding=gb2312
|
||||
set termencoding=utf-8
|
||||
|
||||
" Use Unix as the standard file type
|
||||
set ffs=unix,dos,mac
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => Files, backups and undo
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Turn backup off, since most stuff is in SVN, git etc. anyway...
|
||||
set nobackup
|
||||
set nowb
|
||||
set noswapfile
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => Text, tab and indent related
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Use spaces instead of tabs
|
||||
set expandtab
|
||||
|
||||
" Be smart when using tabs ;)
|
||||
set smarttab
|
||||
|
||||
" 1 tab == 4 spaces
|
||||
set shiftwidth=4
|
||||
set tabstop=4
|
||||
|
||||
" Linebreak on 500 characters
|
||||
set lbr
|
||||
set tw=500
|
||||
|
||||
set ai "Auto indent
|
||||
set si "Smart indent
|
||||
set wrap "Wrap lines
|
||||
|
||||
|
||||
""""""""""""""""""""""""""""""
|
||||
" => Visual mode related
|
||||
""""""""""""""""""""""""""""""
|
||||
" Visual mode pressing * or # searches for the current selection
|
||||
" Super useful! From an idea by Michael Naumann
|
||||
vnoremap <silent> * :<C-u>call VisualSelection('', '')<CR>/<C-R>=@/<CR><CR>
|
||||
vnoremap <silent> # :<C-u>call VisualSelection('', '')<CR>?<C-R>=@/<CR><CR>
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => Moving around, tabs, windows and buffers
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Map <Space> to / (search) and Ctrl-<Space> to ? (backwards search)
|
||||
map <space> /
|
||||
map <C-space> ?
|
||||
|
||||
" Disable highlight when <leader><cr> is pressed
|
||||
map <silent> <leader><cr> :noh<cr>
|
||||
|
||||
" Smart way to move between windows
|
||||
map <C-j> <C-W>j
|
||||
map <C-k> <C-W>k
|
||||
map <C-h> <C-W>h
|
||||
map <C-l> <C-W>l
|
||||
|
||||
" Close the current buffer
|
||||
map <leader>bd :Bclose<cr>:tabclose<cr>gT
|
||||
|
||||
" Close all the buffers
|
||||
map <leader>ba :bufdo bd<cr>
|
||||
|
||||
map <leader>l :bnext<cr>
|
||||
map <leader>h :bprevious<cr>
|
||||
|
||||
" Useful mappings for managing tabs
|
||||
map <leader>tn :tabnew<cr>
|
||||
map <leader>to :tabonly<cr>
|
||||
map <leader>tc :tabclose<cr>
|
||||
map <leader>tm :tabmove
|
||||
map <leader>t<leader> :tabnext
|
||||
|
||||
" Let 'tl' toggle between this and the last accessed tab
|
||||
let g:lasttab = 1
|
||||
nmap <Leader>tl :exe "tabn ".g:lasttab<CR>
|
||||
au TabLeave * let g:lasttab = tabpagenr()
|
||||
|
||||
|
||||
" Opens a new tab with the current buffer's path
|
||||
" Super useful when editing files in the same directory
|
||||
map <leader>te :tabedit <C-r>=expand("%:p:h")<cr>/
|
||||
|
||||
" Switch CWD to the directory of the open buffer
|
||||
map <leader>cd :cd %:p:h<cr>:pwd<cr>
|
||||
|
||||
" Specify the behavior when switching between buffers
|
||||
try
|
||||
set switchbuf=useopen,usetab,newtab
|
||||
set stal=2
|
||||
catch
|
||||
endtry
|
||||
|
||||
" Return to last edit position when opening files (You want this!)
|
||||
au BufReadPost * if line("'\"") > 1 && line("'\"") <= line("$") | exe "normal! g'\"" | endif
|
||||
|
||||
|
||||
""""""""""""""""""""""""""""""
|
||||
" => Status line
|
||||
""""""""""""""""""""""""""""""
|
||||
" Always show the status line
|
||||
set laststatus=2
|
||||
|
||||
" Format the status line
|
||||
set statusline=\ %{HasPaste()}%F%m%r%h\ %w\ \ CWD:\ %r%{getcwd()}%h\ \ \ Line:\ %l\ \ Column:\ %c
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => Editing mappings
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Remap VIM 0 to first non-blank character
|
||||
map 0 ^
|
||||
|
||||
" Move a line of text using ALT+[jk] or Command+[jk] on mac
|
||||
nmap <M-j> mz:m+<cr>`z
|
||||
nmap <M-k> mz:m-2<cr>`z
|
||||
vmap <M-j> :m'>+<cr>`<my`>mzgv`yo`z
|
||||
vmap <M-k> :m'<-2<cr>`>my`<mzgv`yo`z
|
||||
|
||||
if has("mac") || has("macunix")
|
||||
nmap <D-j> <M-j>
|
||||
nmap <D-k> <M-k>
|
||||
vmap <D-j> <M-j>
|
||||
vmap <D-k> <M-k>
|
||||
endif
|
||||
|
||||
" Delete trailing white space on save, useful for some filetypes ;)
|
||||
fun! CleanExtraSpaces()
|
||||
let save_cursor = getpos(".")
|
||||
let old_query = getreg('/')
|
||||
silent! %s/\s\+$//e
|
||||
call setpos('.', save_cursor)
|
||||
call setreg('/', old_query)
|
||||
endfun
|
||||
|
||||
if has("autocmd")
|
||||
autocmd BufWritePre *.txt,*.js,*.py,*.wiki,*.sh,*.coffee :call CleanExtraSpaces()
|
||||
endif
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => Spell checking
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Pressing ,ss will toggle and untoggle spell checking
|
||||
map <leader>ss :setlocal spell!<cr>
|
||||
|
||||
" Shortcuts using <leader>
|
||||
map <leader>sn ]s
|
||||
map <leader>sp [s
|
||||
map <leader>sa zg
|
||||
map <leader>s? z=
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => Misc
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Remove the Windows ^M - when the encodings gets messed up
|
||||
noremap <Leader>m mmHmt:%s/<C-V><cr>//ge<cr>'tzt'm
|
||||
|
||||
" Quickly open a buffer for scribble
|
||||
map <leader>q :e ~/buffer<cr>
|
||||
|
||||
" Quickly open a markdown buffer for scribble
|
||||
map <leader>x :e ~/buffer.md<cr>
|
||||
|
||||
" Toggle paste mode on and off
|
||||
map <leader>pp :setlocal paste!<cr>
|
||||
|
||||
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" => Helper functions
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
" Returns true if paste mode is enabled
|
||||
function! HasPaste()
|
||||
if &paste
|
||||
return 'PASTE MODE '
|
||||
endif
|
||||
return ''
|
||||
endfunction
|
||||
|
||||
" Don't close window, when deleting a buffer
|
||||
command! Bclose call <SID>BufcloseCloseIt()
|
||||
function! <SID>BufcloseCloseIt()
|
||||
let l:currentBufNum = bufnr("%")
|
||||
let l:alternateBufNum = bufnr("#")
|
||||
|
||||
if buflisted(l:alternateBufNum)
|
||||
buffer #
|
||||
else
|
||||
bnext
|
||||
endif
|
||||
|
||||
if bufnr("%") == l:currentBufNum
|
||||
new
|
||||
endif
|
||||
|
||||
if buflisted(l:currentBufNum)
|
||||
execute("bdelete! ".l:currentBufNum)
|
||||
endif
|
||||
endfunction
|
||||
|
||||
function! CmdLine(str)
|
||||
call feedkeys(":" . a:str)
|
||||
endfunction
|
||||
|
||||
function! VisualSelection(direction, extra_filter) range
|
||||
let l:saved_reg = @"
|
||||
execute "normal! vgvy"
|
||||
|
||||
let l:pattern = escape(@", "\\/.*'$^~[]")
|
||||
let l:pattern = substitute(l:pattern, "\n$", "", "")
|
||||
|
||||
if a:direction == 'gv'
|
||||
call CmdLine("Ack '" . l:pattern . "' " )
|
||||
elseif a:direction == 'replace'
|
||||
call CmdLine("%s" . '/'. l:pattern . '/')
|
||||
endif
|
||||
|
||||
let @/ = l:pattern
|
||||
let @" = l:saved_reg
|
||||
endfunction
|
||||
|
||||
|
||||
""""""""""""""""""""""""""""""
|
||||
" => Python section
|
||||
""""""""""""""""""""""""""""""
|
||||
let python_highlight_all = 1
|
||||
au FileType python syn keyword pythonDecorator True None False self
|
||||
|
||||
au BufNewFile,BufRead *.jinja set syntax=htmljinja
|
||||
au BufNewFile,BufRead *.mako set ft=mako
|
||||
|
||||
au FileType python map <buffer> F :set foldmethod=indent<cr>
|
||||
|
||||
au FileType python inoremap <buffer> $r return
|
||||
au FileType python inoremap <buffer> $i import
|
||||
au FileType python inoremap <buffer> $p print
|
||||
au FileType python inoremap <buffer> $f # --- <esc>a
|
||||
au FileType python map <buffer> <leader>1 /class
|
||||
au FileType python map <buffer> <leader>2 /def
|
||||
au FileType python map <buffer> <leader>C ?class
|
||||
au FileType python map <buffer> <leader>D ?def
|
||||
|
||||
|
||||
""""""""""""""""""""""""""""""
|
||||
" => JavaScript section
|
||||
"""""""""""""""""""""""""""""""
|
||||
au FileType javascript call JavaScriptFold()
|
||||
au FileType javascript setl fen
|
||||
au FileType javascript setl nocindent
|
||||
|
||||
au FileType javascript imap <C-t> $log();<esc>hi
|
||||
au FileType javascript imap <C-a> alert();<esc>hi
|
||||
|
||||
au FileType javascript inoremap <buffer> $r return
|
||||
au FileType javascript inoremap <buffer> $f // --- PH<esc>FP2xi
|
||||
|
||||
function! JavaScriptFold()
|
||||
setl foldmethod=syntax
|
||||
setl foldlevelstart=1
|
||||
syn region foldBraces start=/{/ end=/}/ transparent fold keepend extend
|
||||
|
||||
function! FoldText()
|
||||
return substitute(getline(v:foldstart), '{.*', '{...}', '')
|
||||
endfunction
|
||||
setl foldtext=FoldText()
|
||||
endfunction
|
||||
|
||||
|
||||
""""""""""""""""""""""""""""""
|
||||
" => CoffeeScript section
|
||||
"""""""""""""""""""""""""""""""
|
||||
function! CoffeeScriptFold()
|
||||
setl foldmethod=indent
|
||||
setl foldlevelstart=1
|
||||
endfunction
|
||||
au FileType coffee call CoffeeScriptFold()
|
||||
|
||||
au FileType gitcommit call setpos('.', [0, 1, 1, 0])
|
||||
|
||||
|
||||
""""""""""""""""""""""""""""""
|
||||
" => Shell section
|
||||
""""""""""""""""""""""""""""""
|
||||
if exists('$TMUX')
|
||||
if has('nvim')
|
||||
set termguicolors
|
||||
else
|
||||
set term=screen-256color
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
""""""""""""""""""""""""""""""
|
||||
" => Twig section
|
||||
""""""""""""""""""""""""""""""
|
||||
autocmd BufRead *.twig set syntax=html filetype=html
|
||||
|
||||
|
||||
""""""""""""""""""""""""""""""
|
||||
" => Markdown
|
||||
""""""""""""""""""""""""""""""
|
||||
let vim_markdown_folding_disabled = 1
|
@ -0,0 +1,129 @@
|
||||
# FastSpeech2 + AISHELL-3 Voice Cloning
|
||||
This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used for the voice cloning task. We follow the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
|
||||
1. Speaker Encoder: We use a speaker verification task to train a speaker encoder. The datasets used in this task are different from those used in `FastSpeech2`: because transcriptions are not needed, we use more datasets. Refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
|
||||
2. Synthesizer: We use the trained speaker encoder to generate a speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of `FastSpeech2` which will be concatenated with the encoder outputs.
|
||||
3. Vocoder: We use [Parallel Wave GAN](http://arxiv.org/abs/1910.11480) as the neural Vocoder, refer to [voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1).
|
||||
|
||||
## Dataset
|
||||
### Download and Extract the dataset
|
||||
Download AISHELL-3.
|
||||
```bash
|
||||
wget https://www.openslr.org/resources/93/data_aishell3.tgz
|
||||
```
|
||||
Extract AISHELL-3.
|
||||
```bash
|
||||
mkdir data_aishell3
|
||||
tar zxvf data_aishell3.tgz -C data_aishell3
|
||||
```
|
||||
### Get MFA result of AISHELL-3 and Extract it
|
||||
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
|
||||
You can download it from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x now) in our repo.
|
||||
|
||||
## Pretrained GE2E model
|
||||
We use a pretrained GE2E model to generate a speaker embedding for each sentence.
|
||||
|
||||
Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and `unzip` it.
|
||||
|
||||
## Get Started
|
||||
Assume the path to the dataset is `~/datasets/data_aishell3`.
|
||||
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
|
||||
Assume the path to the pretrained ge2e model is `./ge2e_ckpt_0.3`.
|
||||
|
||||
Run the command below to
|
||||
1. **source path**.
|
||||
2. preprocess the dataset.
|
||||
3. train the model.
|
||||
4. synthesize waveform from `metadata.jsonl`.
|
||||
5. start a voice cloning inference.
|
||||
```bash
|
||||
./run.sh
|
||||
```
|
||||
### Preprocess the dataset
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path}
|
||||
```
|
||||
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
|
||||
```text
|
||||
dump
|
||||
├── dev
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
├── embed
|
||||
│ ├── SSB0005
|
||||
│ ├── SSB0009
|
||||
│ ├── ...
|
||||
│ └── ...
|
||||
├── phone_id_map.txt
|
||||
├── speaker_id_map.txt
|
||||
├── test
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
└── train
|
||||
├── energy_stats.npy
|
||||
├── norm
|
||||
├── pitch_stats.npy
|
||||
├── raw
|
||||
└── speech_stats.npy
|
||||
```
|
||||
The `embed` folder contains the generated speaker embedding for each sentence in AISHELL-3; it has the same file structure as the wav files, and the embeddings are stored in `.npy` format.
|
||||
|
||||
The computing time of utterance embedding can be x hours.
|
||||
|
||||
The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and a `raw` subfolder. The raw folder contains the speech, pitch and energy features of each utterance, while the norm folder contains the normalized ones. The statistics used to normalize the features are computed from the training set and are located in `dump/train/*_stats.npy`.
|
||||
|
||||
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
|
||||
|
||||
The preprocessing step is very similar to that of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but there is one more `ge2e/inference` step here.
|
||||
|
||||
### Train the model
|
||||
`./local/train.sh` calls `${BIN_DIR}/train.py`.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
|
||||
```
|
||||
The training step is very similar to that of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/train.py`.
|
||||
|
||||
### Synthesize
|
||||
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
|
||||
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
|
||||
```bash
|
||||
unzip pwg_aishell3_ckpt_0.5.zip
|
||||
```
|
||||
Parallel WaveGAN checkpoint contains files listed below.
|
||||
```text
|
||||
pwg_aishell3_ckpt_0.5
|
||||
├── default.yaml # default config used to train parallel wavegan
|
||||
├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
|
||||
└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
|
||||
```
|
||||
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
|
||||
```
|
||||
The synthesizing step is very similar to that of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/synthesize.py`.
|
||||
|
||||
### Voice Cloning
|
||||
Assume there are some reference audios in `./ref_audio`
|
||||
```text
|
||||
ref_audio
|
||||
├── 001238.wav
|
||||
├── LJ015-0254.wav
|
||||
└── audio_self_test.mp3
|
||||
```
|
||||
`./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py`
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
|
||||
```
|
||||
## Pretrained Model
|
||||
[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
|
||||
|
||||
FastSpeech2 checkpoint contains files listed below.
|
||||
(There is no need for `speaker_id_map.txt` here )
|
||||
|
||||
```text
|
||||
fastspeech2_nosil_aishell3_ckpt_vc1_0.5
|
||||
├── default.yaml # default config used to train fastspeech2
|
||||
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
|
||||
├── snapshot_iter_96400.pdz # model parameters and optimizer states
|
||||
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
|
||||
```
|
@ -0,0 +1,105 @@
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
|
||||
fs: 24000 # sr
|
||||
n_fft: 2048 # FFT size.
|
||||
n_shift: 300 # Hop size.
|
||||
win_length: 1200 # Window length.
|
||||
# If set to null, it will be the same as n_fft (the FFT size).
|
||||
window: "hann" # Window function.
|
||||
|
||||
# Only used for feats_type != raw
|
||||
|
||||
fmin: 80 # Minimum frequency of Mel basis.
|
||||
fmax: 7600 # Maximum frequency of Mel basis.
|
||||
n_mels: 80 # The number of mel basis.
|
||||
|
||||
# Only used for the model using pitch features (e.g. FastSpeech2)
|
||||
f0min: 80 # Minimum f0 for pitch extraction.
|
||||
f0max: 400 # Maximum f0 for pitch extraction.
|
||||
|
||||
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
batch_size: 64
|
||||
num_workers: 2
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model:
|
||||
adim: 384 # attention dimension
|
||||
aheads: 2 # number of attention heads
|
||||
elayers: 4 # number of encoder layers
|
||||
eunits: 1536 # number of encoder ff units
|
||||
dlayers: 4 # number of decoder layers
|
||||
dunits: 1536 # number of decoder ff units
|
||||
positionwise_layer_type: conv1d # type of position-wise layer
|
||||
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
|
||||
duration_predictor_layers: 2 # number of layers of duration predictor
|
||||
duration_predictor_chans: 256 # number of channels of duration predictor
|
||||
duration_predictor_kernel_size: 3 # filter size of duration predictor
|
||||
postnet_layers: 5 # number of layers of postnet
|
||||
postnet_filts: 5 # filter size of conv layers in postnet
|
||||
postnet_chans: 256 # number of channels of conv layers in postnet
|
||||
use_masking: True # whether to apply masking for padded part in loss calculation
|
||||
use_scaled_pos_enc: True # whether to use scaled positional encoding
|
||||
encoder_normalize_before: True # whether to perform layer normalization before the input
|
||||
decoder_normalize_before: True # whether to perform layer normalization before the input
|
||||
reduction_factor: 1 # reduction factor
|
||||
init_type: xavier_uniform # initialization type
|
||||
init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
|
||||
init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
|
||||
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
|
||||
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
|
||||
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
|
||||
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
|
||||
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
|
||||
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
|
||||
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
|
||||
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
|
||||
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
|
||||
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
|
||||
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
|
||||
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
|
||||
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
|
||||
energy_predictor_layers: 2 # number of conv layers in energy predictor
|
||||
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
|
||||
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
|
||||
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
|
||||
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
|
||||
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
|
||||
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
|
||||
spk_embed_dim: 256 # speaker embedding dimension
|
||||
spk_embed_integration_type: concat # speaker embedding integration type
|
||||
|
||||
|
||||
|
||||
###########################################################
|
||||
# UPDATER SETTING #
|
||||
###########################################################
|
||||
updater:
|
||||
use_masking: True # whether to apply masking for padded part in loss calculation
|
||||
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer:
|
||||
optim: adam # optimizer type
|
||||
learning_rate: 0.001 # learning rate
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 200
|
||||
num_snapshots: 5
|
||||
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
seed: 10086
|
@ -0,0 +1,86 @@
|
||||
#!/bin/bash
# Preprocessing driver: each step below runs only when its stage number lies
# within [stage, stop_stage].
# Positional arguments:
#   $1  config file, forwarded as --config to the duration and feature steps
#   $2  GE2E checkpoint path, forwarded as --checkpoint_path to the embedding step
|
||||
|
||||
stage=0 # first stage to run
|
||||
stop_stage=100 # last stage to run
|
||||
|
||||
config_path=$1
|
||||
ge2e_ckpt_path=$2
|
||||
|
||||
# gen speaker embedding
# Stage 0: run the GE2E inference script on the AISHELL-3 training wavs and
# write its output to dump/embed, using the checkpoint given as $2.
# NOTE(review): the input path is hard-coded and starts with `~` after `=`;
# presumably the Python script expands it — confirm.
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
|
||||
--input=~/datasets/data_aishell3/train/wav/ \
|
||||
--output=dump/embed \
|
||||
--checkpoint_path=${ge2e_ckpt_path}
|
||||
fi
|
||||
|
||||
# copy from tts3/preprocess
# Stage 1: convert the MFA alignment results in ./aishell3_alignment_tone into
# durations.txt, which stage 2 reads via --dur-file.
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# get durations from MFA's result
|
||||
echo "Generate durations.txt from MFA results ..."
|
||||
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
|
||||
--inputdir=./aishell3_alignment_tone \
|
||||
--output durations.txt \
|
||||
--config=${config_path}
|
||||
fi
|
||||
|
||||
# Stage 2: run ${BIN_DIR}/preprocess.py on the AISHELL-3 root directory, using
# durations.txt from stage 1 and the embeddings in dump/embed from stage 0;
# results go under dump.
# NOTE(review): --rootdir and --num-cpu=20 are hard-coded — adjust to the local setup.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# extract features
|
||||
echo "Extract features ..."
|
||||
python3 ${BIN_DIR}/preprocess.py \
|
||||
--dataset=aishell3 \
|
||||
--rootdir=~/datasets/data_aishell3/ \
|
||||
--dumpdir=dump \
|
||||
--dur-file=durations.txt \
|
||||
--config=${config_path} \
|
||||
--num-cpu=20 \
|
||||
--cut-sil=True \
|
||||
--spk_emb_dir=dump/embed
|
||||
fi
|
||||
|
||||
# Stage 3: call compute_statistics.py three times on the raw training metadata,
# once each for the "speech", "pitch" and "energy" fields.
# NOTE(review): no output option is passed; presumably the script writes the
# dump/train/*_stats.npy files that stage 4 reads — confirm.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
# get features' stats(mean and std)
|
||||
echo "Get features' stats ..."
|
||||
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--field-name="speech"
|
||||
|
||||
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--field-name="pitch"
|
||||
|
||||
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--field-name="energy"
|
||||
fi
|
||||
|
||||
# Stage 4: run ${BIN_DIR}/normalize.py once per split (train, dev, test).
# Every call reads the split's raw metadata, writes to the split's norm
# directory, and uses the statistics files under dump/train together with the
# shared phone and speaker id maps.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
|
||||
# normalize and convert phone/speaker to id, dev and test should use train's stats
|
||||
echo "Normalize ..."
|
||||
python3 ${BIN_DIR}/normalize.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--dumpdir=dump/train/norm \
|
||||
--speech-stats=dump/train/speech_stats.npy \
|
||||
--pitch-stats=dump/train/pitch_stats.npy \
|
||||
--energy-stats=dump/train/energy_stats.npy \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--speaker-dict=dump/speaker_id_map.txt
|
||||
|
||||
python3 ${BIN_DIR}/normalize.py \
|
||||
--metadata=dump/dev/raw/metadata.jsonl \
|
||||
--dumpdir=dump/dev/norm \
|
||||
--speech-stats=dump/train/speech_stats.npy \
|
||||
--pitch-stats=dump/train/pitch_stats.npy \
|
||||
--energy-stats=dump/train/energy_stats.npy \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--speaker-dict=dump/speaker_id_map.txt
|
||||
|
||||
python3 ${BIN_DIR}/normalize.py \
|
||||
--metadata=dump/test/raw/metadata.jsonl \
|
||||
--dumpdir=dump/test/norm \
|
||||
--speech-stats=dump/train/speech_stats.npy \
|
||||
--pitch-stats=dump/train/pitch_stats.npy \
|
||||
--energy-stats=dump/train/energy_stats.npy \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--speaker-dict=dump/speaker_id_map.txt
|
||||
fi
|
@ -0,0 +1,19 @@
|
||||
#!/bin/bash
# Runs ${BIN_DIR}/synthesize.py on the normalized test metadata with
# --voice-cloning=True; output goes to <train_output_path>/test.
# Positional arguments:
#   $1  FastSpeech2 config file
#   $2  training output directory (checkpoints are read from its checkpoints/ subdirectory)
#   $3  checkpoint file name inside <train_output_path>/checkpoints
# The Parallel WaveGAN files are read from ./pwg_aishell3_ckpt_0.5 (hard-coded).
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
# The two FLAGS_* variables are set only in the environment of this python3 call.
# NOTE(review): presumably PaddlePaddle allocator settings that keep the initial GPU memory reservation small — confirm.
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/synthesize.py \
|
||||
--fastspeech2-config=${config_path} \
|
||||
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--fastspeech2-stat=dump/train/speech_stats.npy \
|
||||
--pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
|
||||
--pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
|
||||
--pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=${train_output_path}/test \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--voice-cloning=True
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Launch train.py on the normalized train/dev metadata with voice cloning
# enabled.
#
# Usage: train.sh <config_path> <train_output_path>
#   config_path        training config file
#   train_output_path  directory passed to train.py as --output-dir
#
# BIN_DIR must already be set in the environment.

config_path=$1
train_output_path=$2

# Expansions are quoted so that paths containing spaces stay single arguments.
python3 "${BIN_DIR}/train.py" \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config="${config_path}" \
    --output-dir="${train_output_path}" \
    --ngpu=2 \
    --phones-dict=dump/phone_id_map.txt \
    --voice-cloning=True
|
@ -0,0 +1,22 @@
|
||||
#!/bin/bash
# Run voice_cloning.py: synthesize one fixed sentence once per reference
# audio file found in <ref_audio_dir>.
#
# Usage: voice_cloning.sh <config_path> <train_output_path> <ckpt_name> \
#                         <ge2e_params_path> <ref_audio_dir>
#   config_path        FastSpeech2 config file
#   train_output_path  training output dir; the checkpoint is read from
#                      <train_output_path>/checkpoints/<ckpt_name> and the
#                      results are written to <train_output_path>/vc_syn
#   ckpt_name          checkpoint file name
#   ge2e_params_path   GE2E speaker encoder parameter file
#   ref_audio_dir      directory holding the reference audio files
#
# BIN_DIR must already be set in the environment.

config_path=$1
train_output_path=$2
ckpt_name=$3
ge2e_params_path=$4
ref_audio_dir=$5

# The FLAGS_* assignments apply to this python3 invocation only.
# Expansions are quoted so that paths containing spaces stay single arguments.
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 "${BIN_DIR}/voice_cloning.py" \
    --fastspeech2-config="${config_path}" \
    --fastspeech2-checkpoint="${train_output_path}/checkpoints/${ckpt_name}" \
    --fastspeech2-stat=dump/train/speech_stats.npy \
    --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
    --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --ge2e_params_path="${ge2e_params_path}" \
    --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
    --input-dir="${ref_audio_dir}" \
    --output-dir="${train_output_path}/vc_syn" \
    --phones-dict=dump/phone_id_map.txt
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup for this recipe: exports MAIN_ROOT, PATH, locale and
# Python settings, and BIN_DIR (the directory of the ${MODEL} experiment
# scripts). run.sh loads this file with `source path.sh`.

# Repository root, three levels above the current working directory.
# ${PWD} is quoted so a working directory containing spaces is not split
# into several realpath arguments.
export MAIN_ROOT="$(realpath "${PWD}/../../../")"

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"

MODEL=fastspeech2
export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"
|
@ -0,0 +1,44 @@
|
||||
#!/bin/bash
# Recipe driver. Runs the stages whose number lies in [stage, stop_stage]:
#   0  ./local/preprocess.sh    prepare data
#   1  ./local/train.sh         train the model
#   2  ./local/synthesize.sh    synthesize the test set
#   3  ./local/voice_cloning.sh synthesize with reference audio from
#                               ${ref_audio_dir}

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz
ref_audio_dir=ref_audio

# do not include ".pdparams" here
ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000

# include ".pdparams" here
ge2e_params_path=${ge2e_ckpt_path}.pdparams

# With the following command you can choose the range of stages to run,
# e.g. `./run.sh --stage 0 --stop-stage 0`.
# These options cannot be mixed with positional arguments (`$1`, `$2`, ...).
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

# NOTE: `exit -1` is kept as in the original; bash reports it to the parent
# process as exit status 255.

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # prepare data
    CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh "${conf_path}" "${ge2e_ckpt_path}" || exit -1
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh "${conf_path}" "${train_output_path}" || exit -1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # synthesize, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # voice cloning from the reference audio, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" "${ge2e_params_path}" "${ref_audio_dir}" || exit -1
fi
|
@ -0,0 +1,208 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import soundfile as sf
|
||||
import yaml
|
||||
from yacs.config import CfgNode
|
||||
|
||||
from paddlespeech.t2s.frontend.zh_frontend import Frontend
|
||||
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
|
||||
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
|
||||
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
|
||||
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
|
||||
from paddlespeech.t2s.modules.normalizer import ZScore
|
||||
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
|
||||
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
|
||||
|
||||
|
||||
def _load_zscore(stat_path):
    """Build a ZScore normalizer from a statistics file.

    The file is read with ``np.load`` and unpacked into two entries, used as
    the mean and the standard deviation (in that order).
    """
    mu, std = np.load(stat_path)
    return ZScore(paddle.to_tensor(mu), paddle.to_tensor(std))


def voice_cloning(args, fastspeech2_config, pwg_config):
    """Synthesize ``args.text`` once per reference audio in ``args.input_dir``.

    For every entry of ``args.input_dir`` a speaker embedding is extracted
    with the GE2E speaker encoder and passed to FastSpeech2 as ``spk_emb``;
    the result is vocoded with Parallel WaveGAN and written to
    ``<args.output_dir>/<utt_id>.wav``. One extra file, ``random_spk_emb.wav``,
    is synthesized from a randomly generated embedding.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``ge2e_params_path``, ``phones_dict``, ``fastspeech2_checkpoint``,
            ``fastspeech2_stat``, ``pwg_checkpoint``, ``pwg_stat``, ``text``,
            ``input_dir`` and ``output_dir``.
        fastspeech2_config: config node providing ``n_mels``, ``fs`` and the
            ``"model"`` keyword arguments for ``FastSpeech2``.
        pwg_config: config node providing ``"generator_params"`` for
            ``PWGGenerator``.
    """
    # speaker encoder
    p = SpeakerVerificationPreprocessor(
        sampling_rate=16000,
        audio_norm_target_dBFS=-30,
        vad_window_length=30,
        vad_moving_average_width=8,
        vad_max_silence_length=6,
        mel_window_length=25,
        mel_window_step=10,
        n_mels=40,
        partial_n_frames=160,
        min_pad_coverage=0.75,
        partial_overlap_ratio=0.5)
    print("Audio Processor Done!")

    speaker_encoder = LSTMSpeakerEncoder(
        n_mels=40, num_layers=3, hidden_size=256, output_size=256)
    speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
    speaker_encoder.eval()
    print("GE2E Done!")

    # The vocabulary size is the number of lines in the phone dictionary.
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    frontend = Frontend(phone_vocab_path=args.phones_dict)
    print("frontend done!")

    fastspeech2_normalizer = _load_zscore(args.fastspeech2_stat)
    pwg_normalizer = _load_zscore(args.pwg_stat)

    fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
    fastspeech2_inference.eval()
    pwg_inference = PWGInference(pwg_normalizer, vocoder)
    pwg_inference.eval()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    input_dir = Path(args.input_dir)

    # The sentence is converted to phone ids once and reused for every speaker.
    sentence = args.text
    input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
    phone_ids = input_ids["phone_ids"][0]

    def synthesize_to_file(utt_id, spk_emb):
        # Acoustic model -> vocoder, then write <output_dir>/<utt_id>.wav.
        with paddle.no_grad():
            wav = pwg_inference(
                fastspeech2_inference(phone_ids, spk_emb=spk_emb))
        sf.write(
            str(output_dir / (utt_id + ".wav")),
            wav.numpy(),
            samplerate=fastspeech2_config.fs)
        print(f"{utt_id} done!")

    for name in os.listdir(input_dir):
        # The utterance id is the file name up to its first dot.
        utt_id = name.split(".")[0]
        ref_audio_path = input_dir / name
        mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
        with paddle.no_grad():
            spk_emb = speaker_encoder.embed_utterance(
                paddle.to_tensor(mel_sequences))
        synthesize_to_file(utt_id, spk_emb)

    # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb
    random_spk_emb = np.random.rand(256) * 0.2
    # np.random.rand returns float64; cast to float32 so the dtype matches the
    # model weights (assumes Paddle's default float32 dtype -- TODO confirm).
    random_spk_emb = paddle.to_tensor(random_spk_emb, dtype="float32")
    # Pass the random embedding itself. Reusing the loop's `spk_emb` here would
    # duplicate the last reference speaker and fail on an empty input dir.
    synthesize_to_file("random_spk_emb", random_spk_emb)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Parses the arguments, selects the Paddle device from ``--ngpu``, loads the
    FastSpeech2 and Parallel WaveGAN YAML configs, prints the arguments and
    configs, and calls ``voice_cloning``.
    """
    parser = argparse.ArgumentParser(description="")

    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = [
        ("--fastspeech2-config",
         dict(type=str, help="fastspeech2 config file.")),
        ("--fastspeech2-checkpoint",
         dict(type=str, help="fastspeech2 checkpoint to load.")),
        ("--fastspeech2-stat",
         dict(
             type=str,
             help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
         )),
        ("--pwg-config",
         dict(type=str, help="parallel wavegan config file.")),
        ("--pwg-checkpoint",
         dict(
             type=str,
             help="parallel wavegan generator parameters to load.")),
        ("--pwg-stat",
         dict(
             type=str,
             help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
         )),
        ("--phones-dict",
         dict(
             type=str,
             default="phone_id_map.txt",
             help="phone vocabulary file.")),
        ("--text",
         dict(
             type=str,
             default="每当你觉得,想要批评什么人的时候,你切要记着,这个世界上的人,并非都具备你禀有的条件。",
             help="text to synthesize, a line")),
        ("--ge2e_params_path",
         dict(type=str, help="ge2e params path.")),
        ("--ngpu",
         dict(type=int, default=1, help="if ngpu=0, use cpu.")),
        ("--input-dir",
         dict(
             type=str,
             help="input dir of *.wav, the sample rate will be resample to 16k."
         )),
        ("--output-dir",
         dict(type=str, help="output dir.")),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    # A negative ngpu only prints a message; no device is set in that case.
    if args.ngpu < 0:
        print("ngpu should >= 0 !")
    else:
        paddle.set_device("cpu" if args.ngpu == 0 else "gpu")

    # Load both YAML configs, FastSpeech2 first.
    loaded_configs = []
    for config_file in (args.fastspeech2_config, args.pwg_config):
        with open(config_file) as f:
            loaded_configs.append(CfgNode(yaml.safe_load(f)))
    fastspeech2_config, pwg_config = loaded_configs

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)
    print(pwg_config)

    voice_cloning(args, fastspeech2_config, pwg_config)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
Loading…
Reference in new issue