diff --git a/.bashrc b/.bashrc deleted file mode 100644 index 8abbb3c7..00000000 --- a/.bashrc +++ /dev/null @@ -1,15 +0,0 @@ -unset GREP_OPTIONS - -# https://zhuanlan.zhihu.com/p/33050965 -alias nvs='nvidia-smi' -alias his='history' -alias jobs='jobs -l' -alias ports='netstat -tulanp' -alias wget='wget -c' - -## Colorize the grep command output for ease of use (good for log files)## -alias grep='grep --color=auto' -alias egrep='egrep --color=auto' -alias fgrep='fgrep --color=auto' - - diff --git a/.mergify.yml b/.mergify.yml index 1e306d67..951f05c4 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -53,7 +53,7 @@ pull_request_rules: add: ["T2S"] - name: "auto add label=Audio" conditions: - - files~=^paddleaudio/ + - files~=^audio/ actions: label: add: ["Audio"] @@ -69,6 +69,12 @@ pull_request_rules: actions: label: add: ["Example"] + - name: "auto add label=Demo" + conditions: + - files~=^demos/ + actions: + label: + add: ["Demo"] - name: "auto add label=README" conditions: - files~=README.md @@ -77,7 +83,7 @@ pull_request_rules: add: ["README"] - name: "auto add label=Documentation" conditions: - - files~=^doc/ + - files~=^docs/ actions: label: add: ["Documentation"] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b3fc1c77..2f80e46b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,8 +13,8 @@ files: (?!.*paddle)^.*$ - id: end-of-file-fixer files: \.md$ - - id: trailing-whitespace - files: \.md$ + #- id: trailing-whitespace + # files: \.md$ - id: requirements-txt-fixer exclude: (?=third_party).*$ - id: check-yaml diff --git a/.vimrc b/.vimrc deleted file mode 100644 index fbb070d6..00000000 --- a/.vimrc +++ /dev/null @@ -1,468 +0,0 @@ -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Maintainer: -" Amir Salihefendic — @amix3k -" -" Awesome_version: -" Get this config, nice color schemes and lots of plugins! -" -" Install the awesome version from: -" -" https://github.com/amix/vimrc -" -" Sections: -" -> General -" -> VIM user interface -" -> Colors and Fonts -" -> Files and backups -" -> Text, tab and indent related -" -> Visual mode related -" -> Moving around, tabs and buffers -" -> Status line -" -> Editing mappings -" -> vimgrep searching and cope displaying -" -> Spell checking -" -> Misc -" -> Helper functions -" -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => General -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Sets how many lines of history VIM has to remember -set history=500 - -" Enable filetype plugins -filetype plugin on -filetype indent on - -" Set to auto read when a file is changed from the outside -set autoread -au FocusGained,BufEnter * checktime - -" With a map leader it's possible to do extra key combinations -" like w saves the current file -let mapleader = "," - -" Fast saving -nmap w :w! - -" :W sudo saves the file -" (useful for handling the permission-denied error) -command! W execute 'w !sudo tee % > /dev/null' edit! 
- - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => VIM user interface -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Set 7 lines to the cursor - when moving vertically using j/k -set so=7 - -" Avoid garbled characters in Chinese language windows OS -let $LANG='en' -set langmenu=en -source $VIMRUNTIME/delmenu.vim -source $VIMRUNTIME/menu.vim - -" Turn on the Wild menu -set wildmenu - -" Ignore compiled files -set wildignore=*.o,*~,*.pyc -if has("win16") || has("win32") - set wildignore+=.git\*,.hg\*,.svn\* -else - set wildignore+=*/.git/*,*/.hg/*,*/.svn/*,*/.DS_Store -endif - -"Always show current position -set ruler - -" Height of the command bar -set cmdheight=1 - -" A buffer becomes hidden when it is abandoned -set hid - -" Configure backspace so it acts as it should act -set backspace=eol,start,indent -set whichwrap+=<,>,h,l - -" Ignore case when searching -set ignorecase - -" When searching try to be smart about cases -set smartcase - -" Highlight search results -set hlsearch - -" Makes search act like search in modern browsers -set incsearch - -" Don't redraw while executing macros (good performance config) -set lazyredraw - -" For regular expressions turn magic on -set magic - -" Show matching brackets when text indicator is over them -set showmatch -" How many tenths of a second to blink when matching brackets -set mat=2 - -" No annoying sound on errors -set noerrorbells -set novisualbell -set t_vb= -set tm=500 - -" Properly disable sound on errors on MacVim -if has("gui_macvim") - autocmd GUIEnter * set vb t_vb= -endif - - -" Add a bit extra margin to the left -set foldcolumn=1 - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Colors and Fonts -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Enable syntax highlighting -syntax enable - -" Enable 256 colors palette in Gnome Terminal -if $COLORTERM == 'gnome-terminal' - set t_Co=256 -endif - -try - colorscheme desert -catch -endtry - -set background=dark - -" Set extra options when running in GUI mode -if has("gui_running") - set guioptions-=T - set guioptions-=e - set t_Co=256 - set guitablabel=%M\ %t -endif - -" Set utf8 as standard encoding and en_US as the standard language -set encoding=utf8 -set fileencodings=ucs-bom,utf-8,cp936 -set fileencoding=gb2312 -set termencoding=utf-8 - -" Use Unix as the standard file type -set ffs=unix,dos,mac - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Files, backups and undo -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Turn backup off, since most stuff is in SVN, git etc. anyway... -set nobackup -set nowb -set noswapfile - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Text, tab and indent related -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Use spaces instead of tabs -set expandtab - -" Be smart when using tabs ;) -set smarttab - -" 1 tab == 4 spaces -set shiftwidth=4 -set tabstop=4 - -" Linebreak on 500 characters -set lbr -set tw=500 - -set ai "Auto indent -set si "Smart indent -set wrap "Wrap lines - - -"""""""""""""""""""""""""""""" -" => Visual mode related -"""""""""""""""""""""""""""""" -" Visual mode pressing * or # searches for the current selection -" Super useful! 
From an idea by Michael Naumann -vnoremap * :call VisualSelection('', '')/=@/ -vnoremap # :call VisualSelection('', '')?=@/ - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Moving around, tabs, windows and buffers -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Map to / (search) and Ctrl- to ? (backwards search) -map / -map ? - -" Disable highlight when is pressed -map :noh - -" Smart way to move between windows -map j -map k -map h -map l - -" Close the current buffer -map bd :Bclose:tabclosegT - -" Close all the buffers -map ba :bufdo bd - -map l :bnext -map h :bprevious - -" Useful mappings for managing tabs -map tn :tabnew -map to :tabonly -map tc :tabclose -map tm :tabmove -map t :tabnext - -" Let 'tl' toggle between this and the last accessed tab -let g:lasttab = 1 -nmap tl :exe "tabn ".g:lasttab -au TabLeave * let g:lasttab = tabpagenr() - - -" Opens a new tab with the current buffer's path -" Super useful when editing files in the same directory -map te :tabedit =expand("%:p:h")/ - -" Switch CWD to the directory of the open buffer -map cd :cd %:p:h:pwd - -" Specify the behavior when switching between buffers -try - set switchbuf=useopen,usetab,newtab - set stal=2 -catch -endtry - -" Return to last edit position when opening files (You want this!) -au BufReadPost * if line("'\"") > 1 && line("'\"") <= line("$") | exe "normal! g'\"" | endif - - -"""""""""""""""""""""""""""""" -" => Status line -"""""""""""""""""""""""""""""" -" Always show the status line -set laststatus=2 - -" Format the status line -set statusline=\ %{HasPaste()}%F%m%r%h\ %w\ \ CWD:\ %r%{getcwd()}%h\ \ \ Line:\ %l\ \ Column:\ %c - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Editing mappings -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Remap VIM 0 to first non-blank character -map 0 ^ - -" Move a line of text using ALT+[jk] or Command+[jk] on mac -nmap mz:m+`z -nmap mz:m-2`z -vmap :m'>+`mzgv`yo`z -vmap :m'<-2`>my` - nmap - vmap - vmap -endif - -" Delete trailing white space on save, useful for some filetypes ;) -fun! CleanExtraSpaces() - let save_cursor = getpos(".") - let old_query = getreg('/') - silent! %s/\s\+$//e - call setpos('.', save_cursor) - call setreg('/', old_query) -endfun - -if has("autocmd") - autocmd BufWritePre *.txt,*.js,*.py,*.wiki,*.sh,*.coffee :call CleanExtraSpaces() -endif - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Spell checking -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Pressing ,ss will toggle and untoggle spell checking -map ss :setlocal spell! - -" Shortcuts using -map sn ]s -map sp [s -map sa zg -map s? z= - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Misc -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Remove the Windows ^M - when the encodings gets messed up -noremap m mmHmt:%s///ge'tzt'm - -" Quickly open a buffer for scribble -map q :e ~/buffer - -" Quickly open a markdown buffer for scribble -map x :e ~/buffer.md - -" Toggle paste mode on and off -map pp :setlocal paste! - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Helper functions -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Returns true if paste mode is enabled -function! HasPaste() - if &paste - return 'PASTE MODE ' - endif - return '' -endfunction - -" Don't close window, when deleting a buffer -command! Bclose call BufcloseCloseIt() -function! 
BufcloseCloseIt() - let l:currentBufNum = bufnr("%") - let l:alternateBufNum = bufnr("#") - - if buflisted(l:alternateBufNum) - buffer # - else - bnext - endif - - if bufnr("%") == l:currentBufNum - new - endif - - if buflisted(l:currentBufNum) - execute("bdelete! ".l:currentBufNum) - endif -endfunction - -function! CmdLine(str) - call feedkeys(":" . a:str) -endfunction - -function! VisualSelection(direction, extra_filter) range - let l:saved_reg = @" - execute "normal! vgvy" - - let l:pattern = escape(@", "\\/.*'$^~[]") - let l:pattern = substitute(l:pattern, "\n$", "", "") - - if a:direction == 'gv' - call CmdLine("Ack '" . l:pattern . "' " ) - elseif a:direction == 'replace' - call CmdLine("%s" . '/'. l:pattern . '/') - endif - - let @/ = l:pattern - let @" = l:saved_reg -endfunction - - -"""""""""""""""""""""""""""""" -" => Python section -"""""""""""""""""""""""""""""" -let python_highlight_all = 1 -au FileType python syn keyword pythonDecorator True None False self - -au BufNewFile,BufRead *.jinja set syntax=htmljinja -au BufNewFile,BufRead *.mako set ft=mako - -au FileType python map F :set foldmethod=indent - -au FileType python inoremap $r return -au FileType python inoremap $i import -au FileType python inoremap $p print -au FileType python inoremap $f # --- a -au FileType python map 1 /class -au FileType python map 2 /def -au FileType python map C ?class -au FileType python map D ?def - - -"""""""""""""""""""""""""""""" -" => JavaScript section -""""""""""""""""""""""""""""""" -au FileType javascript call JavaScriptFold() -au FileType javascript setl fen -au FileType javascript setl nocindent - -au FileType javascript imap $log();hi -au FileType javascript imap alert();hi - -au FileType javascript inoremap $r return -au FileType javascript inoremap $f // --- PHFP2xi - -function! JavaScriptFold() - setl foldmethod=syntax - setl foldlevelstart=1 - syn region foldBraces start=/{/ end=/}/ transparent fold keepend extend - - function! FoldText() - return substitute(getline(v:foldstart), '{.*', '{...}', '') - endfunction - setl foldtext=FoldText() -endfunction - - -"""""""""""""""""""""""""""""" -" => CoffeeScript section -""""""""""""""""""""""""""""""" -function! CoffeeScriptFold() - setl foldmethod=indent - setl foldlevelstart=1 -endfunction -au FileType coffee call CoffeeScriptFold() - -au FileType gitcommit call setpos('.', [0, 1, 1, 0]) - - -"""""""""""""""""""""""""""""" -" => Shell section -"""""""""""""""""""""""""""""" -if exists('$TMUX') - if has('nvim') - set termguicolors - else - set term=screen-256color - endif -endif - - -"""""""""""""""""""""""""""""" -" => Twig section -"""""""""""""""""""""""""""""" -autocmd BufRead *.twig set syntax=html filetype=html - - -"""""""""""""""""""""""""""""" -" => Markdown -"""""""""""""""""""""""""""""" -let vim_markdown_folding_disabled = 1 diff --git a/README.md b/README.md index 9a049df8..2f9d9928 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,5 @@ -English | [简体中文](README_ch.md) - -# PaddleSpeech - - -

- +

@@ -16,79 +10,122 @@ English | [简体中文](README_ch.md)
------------------------------------------------------------------------------------ + ![License](https://img.shields.io/badge/license-Apache%202-red.svg) ![python version](https://img.shields.io/badge/python-3.7+-orange.svg) ![support os](https://img.shields.io/badge/os-linux-yellow.svg) -**PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech, with state-of-art and influential models. +**PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech, with the state-of-art and influential models. -Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features at: -- **Fast and Light-weight**: we provide high-speed and ultra-lightweight models that are convenient for industrial deployment. -- **Rule-based Chinese frontend**: our frontend contains Text Normalization (TN) and Grapheme-to-Phoneme (G2P, including Polyphone and Tone Sandhi). Moreover, we use self-defined linguistic rules to adapt Chinese context. -- **Varieties of Functions that Vitalize both Industrial and Academia**: - - *Implementation of critical audio tasks*: this toolkit contains audio functions like Speech Translation (ST), Automatic Speech Recognition (ASR), Text-To-Speech Synthesis (TTS), Voice Cloning(VC), Punctuation Restoration, etc. - - *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model lists](#models-list) for more details. - - *Cross-domain application*: as an extension of the application of traditional audio tasks, we combine the aforementioned tasks with other fields like NLP. +##### Speech-to-Text -Let's install PaddleSpeech with only a few lines of code! +
+ + + + + + + + + + + + + + + + + +
Input Audio Recognition Result
+ +
+
I knocked at the door on the ancient side of the building.
+ +
+
我认为跑步最重要的就是给我带来了身体健康。
->Note: The official name is still deepspeech. 2021/10/26 +
-If you are using Ubuntu, PaddleSpeech can be set up with pip installation (with root privilege). -```shell -git clone https://github.com/PaddlePaddle/DeepSpeech.git -cd DeepSpeech -pip install -e . -``` +##### Text-to-Speech +
+ + + + + + + + + + + + + + + + + +
Input Text Synthetic Audio
Life was like a box of chocolates, you never know what you're gonna get. + +
+
早上好,今天是2020/10/29,最低温度是-3°C。 + +
+
+ +
-## Table of Contents +For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html). -The contents of this README is as follow: -- [Alternative Installation](#alternative-installation) -- [Quick Start](#quick-start) -- [Models List](#models-list) -- [Tutorials](#tutorials) -- [FAQ and Contributing](#faq-and-contributing) -- [License](#license) -- [Acknowledgement](#acknowledgement) +Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features at: +- **Fast and Light-weight**: we provide high-speed and ultra-lightweight models that are convenient for industrial deployment. +- **Rule-based Chinese frontend**: our frontend contains Text Normalization and Grapheme-to-Phoneme (G2P, including Polyphone and Tone Sandhi). Moreover, we use self-defined linguistic rules to adapt Chinese context. +- **Varieties of Functions that Vitalize both Industrial and Academia**: + - *Implementation of critical audio tasks*: this toolkit contains audio functions like Speech Translation, Automatic Speech Recognition, Text-to-Speech Synthesis, Voice Cloning, etc. + - *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details. + - *Cascaded models application*: as an extension of the application of traditional audio tasks, we combine the workflows of aforementioned tasks with other fields like Natural language processing (NLP), like Punctuation Restoration. -## Alternative Installation +## Installation The base environment in this page is - Ubuntu 16.04 - python>=3.7 -- paddlepaddle==2.1.2 +- paddlepaddle>=2.2.0 -If you want to set up PaddleSpeech in other environment, please see the [ASR installation](docs/source/asr/install.md) and [TTS installation](docs/source/tts/install.md) documents for all the alternatives. +If you want to set up PaddleSpeech in other environment, please see the [installation](./docs/source/install.md) documents for all the alternatives. ## Quick Start -> Note: the current links to `English ASR` and `English TTS` are not valid. - -Just a quick test of our functions: [English ASR](link/hubdetail?name=deepspeech2_aishell&en_category=AutomaticSpeechRecognition) and [English TTS](link/hubdetail?name=fastspeech2_baker&en_category=TextToSpeech) by typing message or upload your own audio file. Developers can have a try of our model with only a few lines of code. 
-A tiny **ASR** DeepSpeech2 model training on toy set of LibriSpeech: +A tiny DeepSpeech2 **Speech-to-Text** model training on toy set of LibriSpeech: -```bash +```shell cd examples/tiny/s0/ # source the environment source path.sh -# prepare librispeech dataset -bash local/data.sh -# evaluate your ckptfile model file -bash local/test.sh conf/deepspeech2.yaml ckptfile offline +source ../../../utils/parse_options.sh +# prepare data +bash ./local/data.sh +# train model, all `ckpt` under `exp` dir, if you use paddlepaddle-gpu, you can set CUDA_VISIBLE_DEVICES before the train script +./local/train.sh conf/deepspeech2.yaml deepspeech2 offline +# avg n best model to get the test model, in this case, n = 1 +avg.sh best exp/deepspeech2/checkpoints 1 +# evaluate the test model +./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 offline ``` -For **TTS**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC: - -```bash +For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC: +```shell cd examples/csmsc/tts3 # download the pretrained models and unaip them wget https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip @@ -110,30 +147,25 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --inference-dir=exp/default/inference \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt - ``` -If you want to try more functions like training and tuning, please see [ASR getting started](docs/source/asr/getting_started.md) and [TTS Basic Use](/docs/source/tts/basic_usage.md). +If you want to try more functions like training and tuning, please see [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-To-Speech Quick Start](./docs/source/tts/quick_start.md). -## Models List +## Model List PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_model.md) with available pretrained models. -ASR module contains *Acoustic Model* and *Language Model*, with the following details: +Speech-to-Text module contains *Acoustic Model* and *Language Model*, with the following details: -> Note: The `Link` should be code path rather than download links. - - - +
- + @@ -141,76 +173,61 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle - - - - - - - - - - - + + + - + - - - - - - + + - - - - - - - - - + + + + + + + + + - + +
ASR Module TypeSpeech-to-Text Module Type Dataset Model Type Link
Acoustic ModelAishell2 Conv + 5 LSTM layers with only forward direction - Ds2 Online Aishell Model -
2 Conv + 3 bidirectional GRU layers - Ds2 Offline Aishell Model -
Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTCAcoustic ModelAishellDeepSpeech2 RNN + Conv based Models - Conformer Offline Aishell Model + deepspeech2-aishell
Encoder:Conformer, Decoder:Transformer, Decoding method: AttentionTransformer based Attention Models - Conformer Librispeech Model + u2.transformer.conformer-aishell
LibrispeechEncoder:Conformer, Decoder:Transformer, Decoding method: Attention Conformer Librispeech Model
Encoder:Transformer, Decoder:Transformer, Decoding method: Attention LibrispeechTransformer based Attention Models - Transformer Librispeech Model + deepspeech2-librispeech / transformer.conformer.u2-librispeech / transformer.conformer.u2-kaldi-librispeech
Language ModelCommonCrawl(en.00)English Language Model - English Language Model
Baidu Internal CorpusMandarin Language Model Small
AlignmentTHCHS30MFA + mfa-thchs30 +
Language ModelNgram Language Model - Mandarin Language Model Small + kenlm
Mandarin Language Model LargeTIMITUnified Streaming & Non-streaming Two-pass - Mandarin Language Model Large + u2-timit
- -PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model* and *Vocoder*. Acoustic Model and Vocoder models are listed as follow: +PaddleSpeech Text-to-Speech mainly contains three modules: *Text Frontend*, *Acoustic Model* and *Vocoder*. Acoustic Model and Vocoder models are listed as follow: - - - - + + + + @@ -218,15 +235,15 @@ PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model - + @@ -243,28 +260,14 @@ PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model - - + + - - - - - - - - - - - - - + - - - - - - + + - - - - @@ -309,30 +300,48 @@ PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model
TTS Module TypeModel TypeDatasetLink Text-to-Speech Module Type Model Type Dataset Link
Text Frontend - chinese-fronted + tn / g2p
Acoustic ModelAcoustic Model Tacotron2 LJSpeech - tacotron2-vctk + tacotron2-ljspeech
FastSpeech2AISHELL-3FastSpeech2AISHELL-3 / VCTK / LJSpeech / CSMSC - fastspeech2-aishell3 -
VCTK fastspeech2-vctk
LJSpeech fastspeech2-ljspeech
CSMSC - fastspeech2-csmsc + fastspeech2-aishell3 / fastspeech2-vctk / fastspeech2-ljspeech / fastspeech2-csmsc
VocoderVocoder WaveFlow LJSpeech @@ -272,22 +275,10 @@ PaddleSpeech TTS mainly contains three modules: *Text Frontend*, *Acoustic Model
Parallel WaveGANLJSpeech - PWGAN-ljspeech -
VCTKParallel WaveGANLJSpeech / VCTK / CSMSC - PWGAN-vctk -
CSMSC - PWGAN-csmsc + PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc
- ## Tutorials -Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an overview of the hot academic topics in speech. If you want to focus on the two tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas. - -The original ASR module is based on [Baidu's DeepSpeech](https://arxiv.org/abs/1412.5567) which is an independent product named [DeepSpeech](https://deepspeech.readthedocs.io). However, the toolkit aligns almost all the SoTA modules in the pipeline. Specifically, these modules are +Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an overview of the hot academic topics in speech. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas. + +- [Overview](./docs/source/introduction.md) +- Quick Start + - [Dependencies](./docs/source/dependencies.md) and [Installation](./docs/source/install.md) + - [Quick Start of Speech-to-Text](./docs/source/asr/quick_start.md) + - [Quick Start of Text-to-Speech](./docs/source/tts/quick_start.md) +- Speech-to-Text + - [Models Introduction](./docs/source/asr/models_introduction.md) + - [Data Preparation](./docs/source/asr/data_preparation.md) + - [Data Augmentation Pipeline](./docs/source/asr/augmentation.md) + - [Features](./docs/source/asr/feature_list.md) + - [Ngram LM](./docs/source/asr/ngram_lm.md) +- Text-to-Speech + - [Introduction](./docs/source/tts/models_introduction.md) + - [Advanced Usage](./docs/source/tts/advanced_usage.md) + - [Chinese Rule Based Text Frontend](./docs/source/tts/zh_text_frontend.md) + - [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) and [PaddleSpeech VS. Espnet](https://paddlespeech.readthedocs.io/en/latest/tts/demo_2.html) +- [Released Models](./docs/source/released_model.md) + +The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://paddleparakeet.readthedocs.io/en/latest/released_models.html) is a good guideline for the pipeline components. -* [Data Prepration](docs/source/asr/data_preparation.md) -* [Data Augmentation](docs/source/asr/augmentation.md) -* [Ngram LM](docs/source/asr/ngram_lm.md) -* [Benchmark](docs/source/asr/benchmark.md) -* [Relased Model](docs/source/asr/released_model.md) - -The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://paddleparakeet.readthedocs.io/en/latest/released_models.html) is a good guideline for the pipeline components. +## FAQ and Contributing +You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/PaddleSpeech/issues)! Also, we highly appreciate if you would like to contribute to this project! -## FAQ and Contributing +## Citation -You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/DeepSpeech/issues)! 
Also, we highly appreciate if you would like to contribute to this project! +To cite PaddleSpeech for research, please use the following format. +```tex +@misc{ppspeech2021, +title={PaddleSpeech, a toolkit for audio processing based on PaddlePaddle.}, +author={PaddlePaddle Authors}, +howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}}, +year={2021} +} +``` -## License +## License and Acknowledge PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE). -## Acknowledgement - -PaddleSpeech depends on a lot of open source repos. See [references](docs/source/reference.md) for more information. +PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. diff --git a/demos/asr_hub/README.md b/demos/asr_hub/README.md new file mode 100644 index 00000000..19e83f9a --- /dev/null +++ b/demos/asr_hub/README.md @@ -0,0 +1,5 @@ +# ASR + +```shell +CUDA_VISIBLE_DEVICES=0 ./run.sh +``` diff --git a/demos/asr_hub/hub_infer.py b/demos/asr_hub/hub_infer.py new file mode 100644 index 00000000..b540be1d --- /dev/null +++ b/demos/asr_hub/hub_infer.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os + +import paddle +import paddlehub as hub + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu']) +parser.add_argument("--wav_en", type=str) +parser.add_argument("--wav_zh", type=str) +args = parser.parse_args() +# yapf: enable + +if __name__ == '__main__': + paddle.set_device(args.device) + + s2t_en_model = hub.Module(name='u2_conformer_librispeech') + s2t_zh_model = hub.Module(name='u2_conformer_aishell') + + args.wav_en = os.path.abspath(os.path.expanduser(args.wav_en)) + args.wav_zh = os.path.abspath(os.path.expanduser(args.wav_zh)) + + assert os.path.isfile(args.wav_en) and os.path.isfile( + args.wav_zh), 'Wav files not exist.' + + print('[S2T][en]Wav: {}'.format(args.wav_en)) + text_en = s2t_en_model.speech_recognize(args.wav_en) + print('[S2T][en]Text: {}'.format(text_en)) + + print('[S2T][zh]Wav: {}'.format(args.wav_zh)) + text_zh = s2t_zh_model.speech_recognize(args.wav_zh) + print('[S2T][zh]Text: {}'.format(text_zh)) diff --git a/demos/asr_hub/run.sh b/demos/asr_hub/run.sh new file mode 100755 index 00000000..040fc493 --- /dev/null +++ b/demos/asr_hub/run.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +if python -c "import paddlehub" &> /dev/null; then + echo 'PaddleHub has already been installed.' +else + echo 'Installing PaddleHub...' 
+ pip install paddlehub -U +fi + +mkdir -p data +wav_en=data/en.wav +wav_zh=data/zh.wav +test -e ${wav_en} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data +test -e ${wav_zh} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +if [ ${ngpu} == 0 ];then + device=cpu +else + device=gpu +fi + +echo "using ${device}..." + +python3 -u hub_infer.py \ +--device ${device} \ +--wav_en ${wav_en} \ +--wav_zh ${wav_zh} + +exit 0 diff --git a/demos/echo_hub/.gitignore b/demos/echo_hub/.gitignore new file mode 100644 index 00000000..1269488f --- /dev/null +++ b/demos/echo_hub/.gitignore @@ -0,0 +1 @@ +data diff --git a/demos/echo_hub/README.md b/demos/echo_hub/README.md new file mode 100644 index 00000000..3248f517 --- /dev/null +++ b/demos/echo_hub/README.md @@ -0,0 +1,13 @@ +# echo system + +ASR + TTS + +中文: +```shell +CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh +``` + +英文: +```shell +CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en +``` diff --git a/demos/echo_hub/hub_infer.py b/demos/echo_hub/hub_infer.py new file mode 100644 index 00000000..abeb409d --- /dev/null +++ b/demos/echo_hub/hub_infer.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
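+# Echo demo: synthesize the input --text with a FastSpeech2 PaddleHub module
+# (fastspeech2_baker for --lang zh, fastspeech2_ljspeech for --lang en), resample
+# the generated wav to 16 kHz with librosa/soundfile, then feed it back into the
+# matching u2_conformer ASR module and print the recognized text.
+# Typical usage (see run.sh): python3 hub_infer.py --lang zh --device gpu --text "..." --output_dir output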
+import argparse +import os + +import librosa +import paddle +import paddlehub as hub +import soundfile as sf + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en']) +parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu']) +parser.add_argument("--text", type=str, nargs='+') +parser.add_argument("--output_dir", type=str) +args = parser.parse_args() +# yapf: enable + +if __name__ == '__main__': + paddle.set_device(args.device) + + output_dir = os.path.abspath(os.path.expanduser(args.output_dir)) + if args.lang == 'zh': + t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir) + s2t_model = hub.Module(name='u2_conformer_aishell') + else: + t2s_model = hub.Module( + name='fastspeech2_ljspeech', output_dir=output_dir) + s2t_model = hub.Module(name='u2_conformer_librispeech') + + if isinstance(args.text, list): + args.text = ' '.join(args.text) + + wavs = t2s_model.generate([args.text], device=args.device) + print('[T2S]Wav file has been generated: {}'.format(wavs[0])) + # convert sr to 16k + x, sr = librosa.load(wavs[0]) + y = librosa.resample(x, sr, 16000) + wav_16k = wavs[0].replace('.wav', '_16k.wav') + sf.write(wav_16k, y, 16000) + print('[S2T]Resample to 16k: {}'.format(wav_16k)) + text = s2t_model.speech_recognize(wav_16k) + print('[S2T]Text recognized from wav file: {}'.format(text)) diff --git a/demos/echo_hub/run.sh b/demos/echo_hub/run.sh new file mode 100755 index 00000000..f3e87f2e --- /dev/null +++ b/demos/echo_hub/run.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +if python -c "import paddlehub" &> /dev/null; then + echo 'PaddleHub has already been installed.' +else + echo 'Installing PaddleHub...' + pip install paddlehub -U +fi + +if [ $# != 2 -a $# != 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +if [ ${ngpu} == 0 ];then + device=cpu +else + device=gpu +fi + +echo "using ${device}..." + +text=$1 +output_dir=$2 +if [ $# == 3 ];then + lang=$3 +else + lang=zh +fi + +if [ ! -d $output_dir ];then + mkdir -p $output_dir +fi + +python3 -u hub_infer.py \ +--lang ${lang} \ +--device ${device} \ +--text \"${text}\" \ +--output_dir ${output_dir} + +exit 0 diff --git a/demos/metaverse/Lamarr.png b/demos/metaverse/Lamarr.png new file mode 100644 index 00000000..fba369af Binary files /dev/null and b/demos/metaverse/Lamarr.png differ diff --git a/demos/metaverse/path.sh b/demos/metaverse/path.sh new file mode 100755 index 00000000..169446d3 --- /dev/null +++ b/demos/metaverse/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/demos/metaverse/run.sh b/demos/metaverse/run.sh new file mode 100755 index 00000000..ea7f683c --- /dev/null +++ b/demos/metaverse/run.sh @@ -0,0 +1,61 @@ +#!/bin/bash +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +# with the following command, you can choice the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +mkdir -p download + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # install PaddleGAN + git clone https://github.com/PaddlePaddle/PaddleGAN.git + pip install -e PaddleGAN/ +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # download pretrained PaddleGAN model + wget -P download https://paddlegan.bj.bcebos.com/models/wav2lip_hq.pdparams +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # download pretrained tts models and unzip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + unzip -d download download/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # run tts + CUDA_VISIBLE_DEVICES=${gpus} \ + python3 ${BIN_DIR}/synthesize_e2e.py \ + --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ + --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + --text=sentences.txt \ + --output-dir=output/wavs \ + --inference-dir=output/inference \ + --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + # output/inference is not needed here, which save the static models + rm -rf output/inference +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # We only test one audio here, cause it's slow + CUDA_VISIBLE_DEVICES=${gpus} \ + python3 PaddleGAN/applications/tools/wav2lip.py \ + --checkpoint_path download/wav2lip_hq.pdparams \ + --face Lamarr.png \ + --audio output/wavs/000.wav \ + --outfile output/tts_lips.mp4 \ + --face_enhancement +fi diff --git a/demos/metaverse/sentences.txt b/demos/metaverse/sentences.txt new file mode 100644 index 00000000..91210dee --- /dev/null +++ b/demos/metaverse/sentences.txt @@ -0,0 +1 @@ +000 谁知青蛙一落地,竟变成了一位英俊的王子。于是遵照国王的意思,他做了公主的亲密伴侣。 diff --git a/demos/story_talker/imgs/000.jpg b/demos/story_talker/imgs/000.jpg new file mode 100644 index 00000000..cdbac0c9 Binary files /dev/null and b/demos/story_talker/imgs/000.jpg differ diff --git a/demos/story_talker/ocr.py b/demos/story_talker/ocr.py new file mode 100644 index 00000000..c479bf10 --- /dev/null +++ b/demos/story_talker/ocr.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
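+# OCR front end of the story_talker demo: run PaddleOCR on every image in
+# --img-dir, save the visualized recognition results to <output-dir>/imgs,
+# keep only the Chinese characters (and ,。?、 punctuation) of the recognized
+# text, and write one "utt_id sentence" line per image to
+# <output-dir>/sentences.txt, which run.sh then passes to synthesize_e2e.py.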
+import argparse +import os +import re +from pathlib import Path + +import paddle +from paddleocr import draw_ocr +from paddleocr import PaddleOCR +from PIL import Image + + +def evaluate(args, ocr): + img_dir = Path(args.img_dir) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + img_out_dir = output_dir / "imgs" + img_out_dir.mkdir(parents=True, exist_ok=True) + with open(output_dir / "sentences.txt", "w") as wf: + for name in os.listdir(img_dir): + id = name.split(".")[0] + img_path = img_dir / name + result = ocr.ocr(str(img_path), cls=True) + # draw result + image = Image.open(img_path).convert('RGB') + boxes = [line[0] for line in result] + txts = [line[1][0] for line in result] + scores = [line[1][1] for line in result] + im_show = draw_ocr( + image, boxes, txts, scores, font_path=args.font_path) + im_show = Image.fromarray(im_show) + paragraph = "".join(txts) + # 过滤出中文结果 + pattern = re.compile(r'[^(\u4e00-\u9fa5)+,。?、]') + sentence = re.sub(pattern, '', paragraph) + im_show.save(img_out_dir / name) + wf.write(id + " " + sentence + "\n") + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with fastspeech2 & parallel wavegan.") + parser.add_argument("--img-dir", default="imgs", type=str, help="img_dir.") + parser.add_argument( + "--output-dir", + type=str, + default="output", + help="output sentences path.") + parser.add_argument( + "--font-path", type=str, default="simfang.ttf", help="font path") + args = parser.parse_args() + + paddle.set_device("gpu") + # need to run only once to download and load model into memory + ocr = PaddleOCR(use_angle_cls=True, lang='ch') + + evaluate(args, ocr) + + +if __name__ == "__main__": + main() diff --git a/demos/story_talker/path.sh b/demos/story_talker/path.sh new file mode 100755 index 00000000..169446d3 --- /dev/null +++ b/demos/story_talker/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/demos/story_talker/run.sh b/demos/story_talker/run.sh new file mode 100755 index 00000000..069ec12e --- /dev/null +++ b/demos/story_talker/run.sh @@ -0,0 +1,50 @@ +#!/bin/bash +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +# with the following command, you can choice the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +mkdir -p download + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # install PaddleOCR + pip install "paddleocr>=2.0.1" +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # download pretrained tts models and unzip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + unzip -d download download/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # run ocr + CUDA_VISIBLE_DEVICES=${gpus} \ + python3 ocr.py --img-dir=imgs --output-dir=output --font-path=simfang.ttf +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # run tts + CUDA_VISIBLE_DEVICES=${gpus} \ + python3 ${BIN_DIR}/synthesize_e2e.py \ + --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ + --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + --text=output/sentences.txt \ + --output-dir=output/wavs \ + --inference-dir=output/inference \ + --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + # output/inference is not needed here, which save the static models + rm -rf output/inference +fi diff --git a/demos/story_talker/simfang.ttf b/demos/story_talker/simfang.ttf new file mode 100644 index 00000000..2b59eae4 Binary files /dev/null and b/demos/story_talker/simfang.ttf differ diff --git a/demos/style_fs2/path.sh b/demos/style_fs2/path.sh new file mode 100755 index 00000000..c8401926 --- /dev/null +++ b/demos/style_fs2/path.sh @@ -0,0 +1,12 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/demos/style_fs2/run.sh b/demos/style_fs2/run.sh new file mode 100755 index 00000000..f035dd1b --- /dev/null +++ b/demos/style_fs2/run.sh @@ -0,0 +1,38 @@ +#!/bin/bash +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +# with the following command, you can choice the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +mkdir -p download + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # download pretrained tts models and unzip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + unzip -d download download/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # run tts + CUDA_VISIBLE_DEVICES=${gpus} \ + python3 style_syn.py \ + --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --fastspeech2-pitch-stat=download/fastspeech2_nosil_baker_ckpt_0.4/pitch_stats.npy \ + --fastspeech2-energy-stat=download/fastspeech2_nosil_baker_ckpt_0.4/energy_stats.npy \ + --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ + --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + --text=${BIN_DIR}/../sentences.txt \ + --output-dir=output \ + --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +fi diff --git a/demos/style_fs2/sentences.txt b/demos/style_fs2/sentences.txt new file mode 100644 index 00000000..91210dee --- /dev/null +++ b/demos/style_fs2/sentences.txt @@ -0,0 +1 @@ +000 谁知青蛙一落地,竟变成了一位英俊的王子。于是遵照国王的意思,他做了公主的亲密伴侣。 diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py new file mode 100644 index 00000000..5b8ce351 --- /dev/null +++ b/demos/style_fs2/style_syn.py @@ -0,0 +1,210 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
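+# Style-control demo: build the TTS inference pipeline (zh text frontend ->
+# FastSpeech2 acoustic model -> Parallel WaveGAN vocoder) from the downloaded
+# checkpoints, then synthesize every input sentence in several styles
+# ("normal", "robot", "1.2xspeed", "0.8xspeed", "child_voice") by overriding
+# the predicted durations and pitch (e.g. durations_scale=1/1.2,
+# pitch_scale=1.3), writing one wav per sentence under <output-dir>/<style>/.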
+import argparse +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference +from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator +from paddlespeech.t2s.models.parallel_wavegan import PWGInference +from paddlespeech.t2s.modules.normalizer import ZScore + + +def evaluate(args, fastspeech2_config, pwg_config): + + # construct dataset for evaluation + sentences = [] + with open(args.text, 'rt') as f: + for line in f: + utt_id, sentence = line.strip().split() + sentences.append((utt_id, sentence)) + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + odim = fastspeech2_config.n_mels + model = FastSpeech2( + idim=vocab_size, odim=odim, **fastspeech2_config["model"]) + + model.set_state_dict( + paddle.load(args.fastspeech2_checkpoint)["main_params"]) + model.eval() + + vocoder = PWGGenerator(**pwg_config["generator_params"]) + vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"]) + vocoder.remove_weight_norm() + vocoder.eval() + print("model done!") + + frontend = Frontend(phone_vocab_path=args.phones_dict) + print("frontend done!") + + stat = np.load(args.fastspeech2_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + fastspeech2_normalizer = ZScore(mu, std) + + stat = np.load(args.pwg_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + pwg_normalizer = ZScore(mu, std) + + fastspeech2_inference = StyleFastSpeech2Inference( + fastspeech2_normalizer, model, args.fastspeech2_pitch_stat, + args.fastspeech2_energy_stat) + fastspeech2_inference.eval() + + pwg_inference = PWGInference(pwg_normalizer, vocoder) + pwg_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + styles = ["normal", "robot", "1.2xspeed", "0.8xspeed", "child_voice"] + for style in styles: + robot = False + durations = None + durations_scale = None + durations_bias = None + pitch = None + pitch_scale = None + pitch_bias = None + energy = None + energy_scale = None + energy_bias = None + if style == "robot": + # all tones in phones be `1` + # all pitch should be the same, we use mean here + robot = True + if style == "1.2xspeed": + durations_scale = 1 / 1.2 + if style == "0.8xspeed": + durations_scale = 1 / 0.8 + if style == "child_voice": + pitch_scale = 1.3 + sub_output_dir = output_dir / style + sub_output_dir.mkdir(parents=True, exist_ok=True) + for utt_id, sentence in sentences: + input_ids = frontend.get_input_ids( + sentence, merge_sentences=True, robot=robot) + phone_ids = input_ids["phone_ids"][0] + + with paddle.no_grad(): + mel = fastspeech2_inference( + phone_ids, + durations=durations, + durations_scale=durations_scale, + durations_bias=durations_bias, + pitch=pitch, + pitch_scale=pitch_scale, + pitch_bias=pitch_bias, + energy=energy, + energy_scale=energy_scale, + energy_bias=energy_bias, + robot=robot) + wav = pwg_inference(mel) + + sf.write( + str(sub_output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=fastspeech2_config.fs) + print(f"{style}_{utt_id} done!") + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + 
description="Synthesize with fastspeech2 & parallel wavegan.") + parser.add_argument( + "--fastspeech2-config", type=str, help="fastspeech2 config file.") + parser.add_argument( + "--fastspeech2-checkpoint", + type=str, + help="fastspeech2 checkpoint to load.") + parser.add_argument( + "--fastspeech2-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training fastspeech2." + ) + parser.add_argument( + "--fastspeech2-pitch-stat", + type=str, + help="mean and standard deviation used to normalize pitch when training fastspeech2" + ) + parser.add_argument( + "--fastspeech2-energy-stat", + type=str, + help="mean and standard deviation used to normalize energy when training fastspeech2." + ) + parser.add_argument( + "--pwg-config", type=str, help="parallel wavegan config file.") + parser.add_argument( + "--pwg-checkpoint", + type=str, + help="parallel wavegan generator parameters to load.") + parser.add_argument( + "--pwg-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training parallel wavegan." + ) + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.fastspeech2_config) as f: + fastspeech2_config = CfgNode(yaml.safe_load(f)) + with open(args.pwg_config) as f: + pwg_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(fastspeech2_config) + print(pwg_config) + + evaluate(args, fastspeech2_config, pwg_config) + + +if __name__ == "__main__": + main() diff --git a/demos/tts_hub/README.md b/demos/tts_hub/README.md new file mode 100644 index 00000000..f5fa599a --- /dev/null +++ b/demos/tts_hub/README.md @@ -0,0 +1,11 @@ +# TTS + +中文: +```shell +CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh +``` + +英文: +```shell +CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en +``` diff --git a/demos/tts_hub/hub_infer.py b/demos/tts_hub/hub_infer.py new file mode 100644 index 00000000..2430400e --- /dev/null +++ b/demos/tts_hub/hub_infer.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import os + +import paddle +import paddlehub as hub + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en']) +parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu']) +parser.add_argument("--text", type=str, nargs='+') +parser.add_argument("--output_dir", type=str) +args = parser.parse_args() +# yapf: enable + +if __name__ == '__main__': + paddle.set_device(args.device) + + output_dir = os.path.abspath(os.path.expanduser(args.output_dir)) + if args.lang == 'zh': + t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir) + else: + t2s_model = hub.Module( + name='fastspeech2_ljspeech', output_dir=output_dir) + + if isinstance(args.text, list): + args.text = ' '.join(args.text) + + wavs = t2s_model.generate([args.text], device=args.device) + print('[T2S]Wav file has been generated: {}'.format(wavs[0])) diff --git a/demos/tts_hub/run.sh b/demos/tts_hub/run.sh new file mode 100755 index 00000000..f3e87f2e --- /dev/null +++ b/demos/tts_hub/run.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +if python -c "import paddlehub" &> /dev/null; then + echo 'PaddleHub has already been installed.' +else + echo 'Installing PaddleHub...' + pip install paddlehub -U +fi + +if [ $# != 2 -a $# != 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +if [ ${ngpu} == 0 ];then + device=cpu +else + device=gpu +fi + +echo "using ${device}..." + +text=$1 +output_dir=$2 +if [ $# == 3 ];then + lang=$3 +else + lang=zh +fi + +if [ ! -d $output_dir ];then + mkdir -p $output_dir +fi + +python3 -u hub_infer.py \ +--lang ${lang} \ +--device ${device} \ +--text \"${text}\" \ +--output_dir ${output_dir} + +exit 0 diff --git a/docs/images/PaddleSpeech_log.png b/docs/images/PaddleSpeech_logo.png similarity index 100% rename from docs/images/PaddleSpeech_log.png rename to docs/images/PaddleSpeech_logo.png diff --git a/docs/source/asr/models_introduction.md b/docs/source/asr/models_introduction.md index 7843a349..eab895ed 100644 --- a/docs/source/asr/models_introduction.md +++ b/docs/source/asr/models_introduction.md @@ -13,7 +13,7 @@ In addition, the training process and the testing process are also introduced. The arcitecture of the model is shown in Fig.1.

- +
Fig.1 The Architecture of the deepspeech2 online model

@@ -160,7 +160,7 @@ The deepspeech2 offline model is similar to the deepspeech2 online model. The The architecture of the model is shown in Fig.2.

- +
Fig.2 The Architecture of the deepspeech2 offline model

diff --git a/docs/source/dependencies.md b/docs/source/dependencies.md new file mode 100644 index 00000000..905730e4 --- /dev/null +++ b/docs/source/dependencies.md @@ -0,0 +1,36 @@ +# The Dependencies + +## By apt-get + +### The base dependencies: + +``` +bc flac jq vim tig tree pkg-config libsndfile1 libflac-dev libvorbis-dev libboost-dev swig python3-dev +``` + +### The dependencies of kenlm: + +``` +build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev gcc-5 g++-5 +``` + +### The dependencies of sox: + +``` +libvorbis-dev libmp3lame-dev libmad-ocaml-dev +``` + + +## By make or setup + +``` +kenlm +sox +mfa +openblas +kaldi +sctk +AutoLog +swig-decoder +python_kaldi_features +``` diff --git a/docs/source/index.rst b/docs/source/index.rst index 53e5d15d..ea2599ab 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,7 +23,7 @@ Contents .. toctree:: :maxdepth: 1 - :caption: Speech-To-Text + :caption: Speech-to-Text asr/models_introduction asr/data_preparation @@ -33,7 +33,7 @@ Contents .. toctree:: :maxdepth: 1 - :caption: Text-To-Speech + :caption: Text-to-Speech tts/basic_usage tts/advanced_usage diff --git a/docs/source/install.md b/docs/source/install.md index f7c8c1c0..d68b990d 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -6,52 +6,38 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin - Python >= 3.7 - PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) -## Setup (Important) - -- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox`, and `swig`, e.g. installing them via `apt-get`: - -```bash -sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev -``` -The version of `swig` should >= 3.0 - -or, installing them via `yum`: - -```bash -sudo yum install pkgconfig libogg-devel libvorbis-devel boost-devel python3-devel -wget https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.1.tar.xz -xz -d flac-1.3.1.tar.xz -tar -xvf flac-1.3.1.tar -cd flac-1.3.1 -./configure -make -make install -``` +## Simple Setup -- Run the setup script for the remaining dependencies +For user who working on `Ubuntu` with `root` privilege. -```bash +```python git clone https://github.com/PaddlePaddle/DeepSpeech.git cd DeepSpeech -pushd tools; make virtualenv.done:; popd -source tools/venv/bin/activate pip install -e . ``` -- Source venv before do experiment. +For user who only needs the basic function of paddlespeech, using conda to do installing is recommended. +You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) to select a version and install it by yourself, or you can use the scripts below to install the last miniconda version. -```bash -source tools/venv/bin/activate +```python +pushd tools +bash extras/install_miniconda.sh +popd +bash ``` -## Simple Setup - +After installing the conda, run the setup.sh to complete the installing process. ```python -git clone https://github.com/PaddlePaddle/DeepSpeech.git -cd DeepSpeech -pip install -e . +bash setup.sh ``` + +## Setup (Other Platform) + +- Make sure these libraries or tools in [dependencies](./dependencies.md) installed. More information please see: `setup.py `and ` tools/Makefile`. 
+- The version of `swig` should be >= 3.0. +- We will do more to simplify the installation process. + ## Running in Docker Container (optional) Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed. This Docker image requires the support of NVIDIA GPU, so please make sure it is available and that [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed. diff --git a/docs/source/introduction.md b/docs/source/introduction.md index 2f71b104..e3fc8b9e 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -1,14 +1,37 @@ # PaddleSpeech ## What is PaddleSpeech? -PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models. +PaddleSpeech is an open-source toolkit on the PaddlePaddle platform for two critical tasks in Speech - Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS), with modules involving state-of-the-art and influential models. ## What can PaddleSpeech do? -### Speech-To-Text -(An introduce of ASR in PaddleSpeech is needed here!) +### Speech-to-Text +PaddleSpeech ASR mainly consists of components below: +- Implementation of models and commonly used neural network layers. +- Dataset abstraction and common data preprocessing pipelines. +- Ready-to-run experiments. + +PaddleSpeech ASR provides you with a complete ASR pipeline, including: +- Data Preparation + - Build vocabulary + - Compute Cepstral mean and variance normalization (CMVN) + - Feature extraction + - linear + - fbank (also supports kaldi features) + - mfcc +- Acoustic Models + - Deepspeech2 (Streaming and Non-Streaming) + - Transformer (Streaming and Non-Streaming) + - Conformer (Streaming and Non-Streaming) +- Decoder + - ctc greedy search (used in DeepSpeech2, Transformer and Conformer) + - ctc beam search (used in DeepSpeech2, Transformer and Conformer) + - attention decoding (used in Transformer and Conformer) + - attention rescoring (used in Transformer and Conformer) + +Speech-to-Text helps you train ASR models with simple commands. -### Text-To-Speech +### Text-to-Speech TTS mainly consists of components below: - Implementation of models and commonly used neural network layers. - Dataset abstraction and common data preprocessing pipelines. @@ -30,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including: - Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis - GE2E -Text-To-Speech helps you to train TTS models with simple commands. +Text-to-Speech helps you to train TTS models with simple commands. diff --git a/docs/source/reference.md b/docs/source/reference.md index 39c7afe5..01bc5ddc 100644 --- a/docs/source/reference.md +++ b/docs/source/reference.md @@ -1,6 +1,6 @@ # Reference -We borrowed a lot of code from these repos to build `model` and `engine`, thank for these great work: +We borrowed a lot of code from these repos to build `model` and `engine`; thanks for the great work and the open-source community!
* [espnet](https://github.com/espnet/espnet/blob/master/LICENSE) - Apache-2.0 License diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 3b60f15a..78f5c92f 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,15 +1,17 @@ + # Released Models -## Speech-To-Text Models +## Speech-to-Text Models ### Acoustic Model Released in paddle 2.X -Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech -:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- -[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h -[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h -[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h -[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h -[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h -[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h +Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link +:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :----------- +[Ds2 Online Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/s0) +[Ds2 Offline Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/s0) +[Conformer Online Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1) +[Conformer Offline Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1) 
+[Conformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1) +[Transformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1) +[Transformer Librispeech S2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2) ### Acoustic Model Transformed from paddle 1.8 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech @@ -26,30 +28,31 @@ Language Model | Training Data | Token-based | Size | Descriptions [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings -## Text-To-Speech Models +## Text-to-Speech Models ### Acoustic Models -Model Type | Dataset| Example Link | Pretrained Models -:-------------:| :------------:| :-----: | :----- -Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip) -TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip) -SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip) -FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) -FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) -FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) -FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip) - +Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) +:-------------:| :------------:| :-----: | :-----:| :-----:| :-----: +Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)||| +TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)||| +SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB| +FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB| +FastSpeech2| AISHELL-3 
|[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| +FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| +FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders -Model Type | Dataset| Example Link | Pretrained Models -:-------------:| :------------:| :-----: | :----- -WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) -Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip.](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) -Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) -Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip) +Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static) +:-------------:| :------------:| :-----: | :-----:| :-----:| :-----: +WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)||| +Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB| +Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)||| +Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)||| +Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)||| +|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB| ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models -:-------------:| 
:------------:| :-----: | :----- -GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip) -GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip) +:-------------:| :------------:| :-----: | :-----: +GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip) +GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip) diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst index 09c4d25a..f47c0892 100644 --- a/docs/source/tts/demo.rst +++ b/docs/source/tts/demo.rst @@ -642,8 +642,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog Multi-Speaker TTS ------------------- -PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generated by FastSpeech2 + ParallelWaveGAN, we use AISHELL-3 Multi-Speaker TTS dataset. - +PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generated by FastSpeech2 + ParallelWaveGAN, we use AISHELL-3 Multi-Speaker TTS dataset. Each line is a different person. .. raw:: html @@ -651,19 +650,381 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
+ [Multi-Speaker TTS audio demo table: Text | Origin | Target Timbre | Generated]


-Duration control in FastSpeech2 +Style control in FastSpeech2 -------------------------------------- -In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``, we provide the audio demos of duration control here. ``duration`` means the duration of phonemes, when we reduce duration, the speed of audios will increase, and when we increase ``duration``, the speed of audios will reduce. +In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``. + +We provide the audio demos of duration control here. ``duration`` means the duration of phonemes; when we reduce ``duration``, the speed of the audio increases, and when we increase ``duration``, the speed of the audio decreases. The ``duration`` of different phonemes in a sentence can have different scale ratios (when you want to slow down one word and keep the other words' speed in a sentence). Here we use a fixed scale ratio for different phonemes to control the ``speed`` of audios. @@ -892,6 +1253,174 @@ The duration control in FastSpeech2 can control the speed of audios will keep the

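A minimal NumPy sketch of the mechanism behind this duration control (an illustrative, framework-agnostic sketch; the function and argument names such as ``length_regulate`` and ``duration_scale`` are assumptions, not the PaddleSpeech API): the predicted per-phoneme durations are multiplied by a scale ratio before the length regulator expands the encoder outputs, so a ratio above 1.0 slows the audio down and a ratio below 1.0 speeds it up.

```python
# Illustrative sketch only: duration-based speed control via a length regulator.
import numpy as np

def length_regulate(encoder_outputs: np.ndarray, durations: np.ndarray, duration_scale: float = 1.0) -> np.ndarray:
    """Expand per-phoneme encoder outputs according to (scaled) durations.

    encoder_outputs: (num_phonemes, hidden_dim) hidden states from the encoder.
    durations: predicted frames per phoneme, shape (num_phonemes,).
    duration_scale: > 1.0 lengthens phonemes (slower speech), < 1.0 shortens them (faster speech).
    """
    scaled = np.maximum(np.round(durations * duration_scale).astype(int), 1)
    # Repeat each phoneme's hidden vector once per output frame.
    return np.repeat(encoder_outputs, scaled, axis=0)

# Toy example: 3 phonemes with 4-dim hidden states and predicted durations of 2/3/1 frames.
hidden = np.random.rand(3, 4)
durations = np.array([2, 3, 1])
print(length_regulate(hidden, durations, 1.0).shape)  # (6, 4)  normal speed
print(length_regulate(hidden, durations, 1.5).shape)  # (9, 4)  slower speech
```

A per-phoneme array of scale ratios can be used in place of the single ``duration_scale`` to slow down only one word while keeping the rest of the sentence unchanged.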
+We provide the audio demos of pitch control here. + +When we set the pitch of a whole sentence to its mean value and set the ``tones`` of the phones to ``1``, we get a ``robot-style`` timbre. + +When we raise the pitch of an adult female voice (with a fixed scale ratio), we get a ``child-style`` timbre. + +The ``pitch`` of different phonemes in a sentence can also have different scale ratios. + +The normal audios are in the second column of the previous table. A minimal sketch of these pitch manipulations is given after the table below. + +.. raw:: html + +
+ [Pitch-control audio demo table: Robot | Child]
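As referenced above, here is a minimal NumPy sketch of the two pitch manipulations, under the assumption that the pitch contour is an F0 array in Hz with unvoiced frames stored as 0; the helper names are illustrative and this is not the PaddleSpeech API. Flattening the contour to its mean gives the robot-style timbre, and scaling it by a fixed ratio shifts an adult female voice toward a child-style timbre.

```python
# Illustrative sketch only: pitch-contour manipulations for robot- and child-style timbres.
import numpy as np

def robot_pitch(f0: np.ndarray) -> np.ndarray:
    """Replace every voiced frame with the utterance mean pitch -> monotone, robot-like voice."""
    voiced = f0 > 0  # assume unvoiced frames are stored as 0
    flat = f0.copy()
    flat[voiced] = f0[voiced].mean()
    return flat

def child_pitch(f0: np.ndarray, ratio: float = 1.3) -> np.ndarray:
    """Raise the whole contour by a fixed ratio -> higher, child-like voice."""
    return np.where(f0 > 0, f0 * ratio, f0)

f0 = np.array([0.0, 210.0, 220.0, 230.0, 0.0, 190.0])  # toy F0 contour in Hz
print(robot_pitch(f0))  # voiced frames all become 212.5 Hz
print(child_pitch(f0))  # voiced frames raised by 30 percent
```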
+ Chinese TTS with/without text frontend -------------------------------------- diff --git a/docs/source/tts/gan_vocoder.md b/docs/source/tts/gan_vocoder.md index 4931f307..f4af3a90 100644 --- a/docs/source/tts/gan_vocoder.md +++ b/docs/source/tts/gan_vocoder.md @@ -6,4 +6,4 @@ Model | Generator Loss |Discriminator Loss Parallel Wave GAN| adversarial loss
Feature Matching | Multi-Scale Discriminator | Mel GAN |adversial loss
Multi-resolution STFT loss | adversial loss| Multi-Band Mel GAN | adversial loss
full band Multi-resolution STFT loss
sub band Multi-resolution STFT loss |Multi-Scale Discriminator| -HiFi GAN |adversial loss
Feature Matching
Mel-Spectrogram Loss | Multi-Scale Discriminator
Multi-Period Discriminato | +HiFi GAN |adversial loss
Feature Matching
Mel-Spectrogram Loss | Multi-Scale Discriminator
Multi-Period Discriminator| diff --git a/docs/source/tts/models_introduction.md b/docs/source/tts/models_introduction.md index b1329758..edf6f312 100644 --- a/docs/source/tts/models_introduction.md +++ b/docs/source/tts/models_introduction.md @@ -27,14 +27,14 @@ At present, there are two mainstream acoustic model structures. - Acoustic decoder (N Frames - > N Frames).
-
+
- Sequence to sequence acoustic model: - M Tokens - > N Frames.
-
+
### Tacotron2 @@ -54,7 +54,7 @@ At present, there are two mainstream acoustic model structures. - CBHG postprocess. - Vocoder: Griffin-Lim.
-
+
**Advantage of Tacotron:** @@ -89,10 +89,10 @@ At present, there are two mainstream acoustic model structures. - The alignment matrix of previous time is considered at the step `t` of decoder.
-
+
-You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0). +You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0). ### TransformerTTS **Disadvantages of the Tacotrons:** @@ -118,7 +118,7 @@ Transformer TTS is a combination of Tacotron2 and Transformer. - Positional Encoding.
-
+
#### Transformer TTS @@ -138,7 +138,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2. - Uniform scale position encoding may have a negative impact on input or output sequences.
-
+
**Disadvantages of Transformer TTS:** @@ -146,7 +146,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2. - The ability to perceive local information is weak, and local information is more related to pronunciation. - Stability is worse than Tacotron2. -You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1). +You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1). ### FastSpeech2 @@ -184,14 +184,14 @@ Instead of using the encoder-attention-decoder based architecture as adopted by • Can be generated in parallel (decoding time is less affected by sequence length)
-
+
#### FastPitch [FastPitch](https://arxiv.org/abs/2006.06873) follows FastSpeech. A single pitch value is predicted for every temporal location, which improves the overall quality of synthesized speech.
-
+
#### FastSpeech2 @@ -209,10 +209,10 @@ Instead of using the encoder-attention-decoder based architecture as adopted by FastSpeech2 is similar to FastPitch but introduces more variation information of speech.
-
+
-You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3), We use token-averaged pitch and energy values introduced in FastPitch rather than frame level ones in FastSpeech2. +You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3). We use token-averaged pitch and energy values introduced in FastPitch rather than frame-level ones in FastSpeech2. ### SpeedySpeech [SpeedySpeech](https://arxiv.org/abs/2008.03802) simplifies the teacher-student architecture of FastSpeech and provides a fast and stable training procedure. @@ -223,10 +223,10 @@ You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example - Describes a simple data augmentation technique that can be used early in training to make the teacher network robust to sequential error propagation.
-
+
-You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2). +You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2). ## Vocoders In speech synthesis, the main task of the vocoder is to convert the spectral parameters predicted by the acoustic model into the final speech waveform. @@ -276,7 +276,7 @@ Here, we introduce a Flow-based vocoder WaveFlow and a GAN-based vocoder Paralle - It is a small-footprint flow-based model for raw audio. It has only 5.9M parameters, which is 15x smalller than WaveGlow (87.9M). - It is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in [Parallel WaveNet](https://arxiv.org/abs/1711.10433) and [ClariNet](https://openreview.net/pdf?id=HklY120cYm), which simplifies the training pipeline and reduces the cost of development. -You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0). +You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0). ### Parallel WaveGAN [Parallel WaveGAN](https://arxiv.org/abs/1910.11480) trains a non-autoregressive WaveNet variant as a generator in a GAN based training method. @@ -289,7 +289,7 @@ You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examp - Multi-resolution STFT loss.
-
+
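The multi-resolution STFT loss listed above can be illustrated with a hedged, framework-agnostic sketch: a single-resolution term combines a spectral-convergence distance and a log-STFT-magnitude distance, and the multi-resolution loss simply averages this term over several `(fft_size, hop, win)` settings. The helper names below are assumptions for illustration, not the PaddleSpeech implementation.

```python
# Illustrative sketch only: single-resolution STFT loss (spectral convergence + log magnitude).
import numpy as np

def stft_magnitude(x: np.ndarray, fft_size: int = 1024, hop: int = 256, win: int = 1024) -> np.ndarray:
    """Frame the signal, apply a Hann window and return the magnitude spectrogram."""
    window = np.hanning(win)
    frames = [x[i:i + win] * window for i in range(0, len(x) - win, hop)]
    return np.abs(np.fft.rfft(np.stack(frames), n=fft_size, axis=1)) + 1e-7

def stft_loss(generated: np.ndarray, reference: np.ndarray, fft_size: int = 1024, hop: int = 256, win: int = 1024) -> float:
    P = stft_magnitude(generated, fft_size, hop, win)
    T = stft_magnitude(reference, fft_size, hop, win)
    spectral_convergence = np.linalg.norm(T - P) / np.linalg.norm(T)   # Frobenius norms
    log_magnitude = np.mean(np.abs(np.log(T) - np.log(P)))             # L1 on log magnitudes
    return float(spectral_convergence + log_magnitude)

# A multi-resolution version averages stft_loss over several (fft_size, hop, win) settings.
reference = np.random.randn(8192)
generated = reference + 0.05 * np.random.randn(8192)
print(stft_loss(generated, reference))
```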
-You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1). +You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1). diff --git a/docs/source/tts/quick_start.md b/docs/source/tts/quick_start.md index 9fd92338..ba783194 100644 --- a/docs/source/tts/quick_start.md +++ b/docs/source/tts/quick_start.md @@ -18,7 +18,7 @@ The models in PaddleSpeech TTS have the following mapping relationship: ## Quick Start -Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. (./examples/csmsc/)(https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc) +Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. (./examples/csmsc/)(https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc) ### Train Parallel WaveGAN with CSMSC - Go to directory diff --git a/docs/source/tts/zh_text_frontend.md b/docs/source/tts/zh_text_frontend.md index 26255ad3..b619e4a1 100644 --- a/docs/source/tts/zh_text_frontend.md +++ b/docs/source/tts/zh_text_frontend.md @@ -1,5 +1,5 @@ # Chinese Rule Based Text Frontend -A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS, see exapmle in [examples/other/text_frontend/](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/text_frontend). +A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS, see exapmles in [examples/other/tn](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tn) and [examples/other/g2p](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/g2p). 
A text frontend module mainly includes: - Text Segmentation diff --git a/docs/topic/ctc/ctc_loss.ipynb b/docs/topic/ctc/ctc_loss.ipynb new file mode 100644 index 00000000..c0da1f32 --- /dev/null +++ b/docs/topic/ctc/ctc_loss.ipynb @@ -0,0 +1,407 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "piano-accent", + "metadata": {}, + "source": [ + "# Derivative of CTC Loss\n", + "\n", + "关于CTC的介绍已经有很多不错的教程了,但是完整的描述CTCLoss的前向和反向过程的很少,而且有些公式推导省略和错误。本文主要关注CTC Loss的梯度是如何计算的,关于CTC的介绍这里不做过多赘述,具体参看文末参考。\n", + "\n", + "CTC主要应用于语音和OCR中,已语音[Deepspeech2](https://arxiv.org/abs/1512.02595)模型为例,CTC的网络一般如下图所示,包含softmax和CTCLoss两部分。反向传播需要求得loss L相对于logits $u^i$​的梯度。下面先介绍CTCLoss的前向计算。\n", + "\n", + "> 图片来源于文末参考\n", + "\n", + "![img](./img/ctc_loss_backward_1.png)\n", + "\n", + "## CTC Loss 的计算\n", + "\n", + "CTC中path的定义与概率的计算如下:\n", + "\n", + "\"image-20211104200811966\"\n", + "\n", + "path 是 $ L'^T$​​的元素,用 $ \\pi $​​表示。 $ \\textbf{x} $​​ 是输入特征,$\\textbf{y}$​​ 是输出label, 都是序列。 $ L $​​ 是输出的 vocab, L‘ 是 $ L \\cup {blank}$​​。 $y_{\\pi_{t}}^t$​​ 表示在t时刻,$\\pi_{t}$​​ label时的观察概率。其中$\\pi_{t}$​​ 表示 $\\pi$​​ path在t时刻的label。$\\pi$​​ 是 $\\textbf{y}$​​ 与 $ \\textbf{x}$​​ 的一个alignment,长度是$T$​​,取值空间为$L'$​​​。path也称为alignment。\n", + "\n", + "公式(2)解释了给定输入 $\\textbf{x}$​ ,输出 $ \\pi $​ path 的概率,即从时间t=1到T每个时间点的概率 $y_{\\pi_{t}}^t$​ 相乘。\n", + "\n", + "求出单条path后,就可以计算$p(l \\mid x)$​ 的概率,计算如下:\n", + "\n", + "\"image-20211104202358513\"\n", + "\n", + "这里边 $\\mathcal{B}$ 就是映射, 即所有多对一的映射(many-to-one mapping )的集合。 这样就算出来对应一个真正的 label $\\textbf{l}$ 的概率了,这里是求和。 求和的原因就是 aab 和 abb 都是对应成ab, 所以 aab 的概率 + abb 的概率才是生成ab的概率。 \n", + "\n", + "公式(3)解释了给定输入 $\\mathbf{x}$​​​​​​ ,求输出$\\mathbf{l}$​​​​​​ 的概率, 即所有集合 $\\mathcal{B}^{-1} (\\mathbf{l})$​​​​​​​​​​ 中 path的概率和。\n", + "\n", + "### CTC forward-backward 算法\n", + "\n", + "CTC的优化采用算最大似然估计[MLE (maximum likelihood estimation)](https://en.wikipedia.org/wiki/Maximum_likelihood_estimation), 这个和神经网络本身的训练过程是一致的。\n", + "\n", + "这个CTC 计算过程类似HMM的 [forward-backward algorithm](https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm),下面就是这个算法的推导过程:\n", + "\n", + "\"image-20211104203040307\"\n", + "\n", + "上图中的定义很清楚, 但是$ \\alpha_{t-1}(s) $ and $ \\alpha_{t-1}(s-1)$ 和 $\\alpha_t(s)$ 的关系也不那么好看出来,下图给出了具体的关于 $\\alpha_t(s)$ 的推导过程:\n", + "\n", + "\"image-20211108155714843\"\n", + "\n", + "\"image-20211109153011816\"\n", + "\n", + "这里的公式比较适合用下面的图来理解,$\\alpha_1(1)$​​​​ 其实对应的就是下图中左上角白色的圆圈。 就是上来第一个是blank 的概率, 而 $\\alpha_1(2)$​​​​是label l 的第一个字母。 这里边我们假设每个字母之间都插入了空白,即label l扩展成l',例如,l=[a, b, b, c], l'=[-, a, -, b, -, b, -, c, -]。 然后对于其他圆点,在时间是1 的情况下概率都是 0. Figure 3中横轴是时间 t,从左到右是1到T;纵轴是s(sequence),从上到下是 1 到 $\\mathbf{\\mid l' \\mid}$​​​​.\n", + "\n", + "\"image-20211108155918442\"\n", + "\n", + "接下来我们分析递归公式 (resursion),更多介绍可以参看 [2]. 公式6分情况考虑:\n", + "\n", + "* 第一种情况就是当前的label是blank, 或者 $\\mathbf{l'}_{s}= \\mathbf{l'}_{s-2}$​​​​​​​(相邻是重复字符):\n", + "\n", + " ![img](https://distill.pub/2017/ctc/assets/cost_no_skip.svg)\n", + "\n", + " 这个时候他的概率来自于过去t-1的两个label 概率, 也就是 $a_{t-1} (s)$​​ 和 $a_{t-1} (s-1)$​​​ 。\n", + "\n", + " $ a_{t-1} (s)$​​ 就是说当前的 sequence 已经是s 了,figure 3中表现为横跳, blank -->blank(例如t=3, s=3);\n", + "\n", + " 而 $a_{t-1} (s-1) $是说明当前的字符还不够, 需要再加一个, 所以在figure 3中就是斜跳,从黑色圆圈到白色圆圈(例如,t=3, s=5)。\n", + "\n", + " 仔细观察figure 3, 除了第一排的白色圆圈, 其他白色圆圈都有两个输入, 就是上述的两种情况。 当然判断blank 的方法也可以是判断$I'_{s-2} = I'_{s}$​. 
这种情况也是说明$I'_{s}$​​​ 是blank, 因为每一个字符必须用 blank 隔开, 即使是相同字符。\n", + "\n", + "* 第二章情况 也可以用类似逻辑得出, 只不过当前的状态s 是黑色圆圈, 有三种情况输入。\n", + "\n", + " ![img](https://distill.pub/2017/ctc/assets/cost_regular.svg)\n", + "\n", + "最终的概率就如公式8 所示, 这个计算过程就是 CTC forward algroirthm, 基于 Fig. 3 的左边的初始条件。\n", + "\n", + "\"image-20211108162544982\"\n", + "\n", + "基于Fig. 3 右边的初始条件,我们还是可以计算出一个概率, 那个就是 **CTC backward**. 这里我就不详细介绍了, 直接截图。\n", + "\n", + "\"image-20211108162859876\"\n", + "\n", + "这样一直做乘法, 数字值越来越小,很快就会underflow。 这个时候就需要做 scaling.\n", + "\n", + "\"image-20211108163526616\"\n", + "\n", + "算出了forward probability 和 backward probability 有什么用呢, 解释如下图。\n", + "\n", + "\"image-20211108164110404\"\n", + "\n", + "上图是说 forward probability and backward probability 的乘积, 代表了这个 sequence $\\mathbf{l}$ t时刻,是s label 的 所有paths 的概率。 这样的话 我们就计算了 Fig. 3 中的每个圆圈的概率。为什么$\\alpha_t(s)\\beta_t(s)$ 中多出一个 $y^t_{\\mathbf{l'_s}}$ ,这是因为它在 $\\alpha$ 和 $\\beta$ 中都包含该项,合并公式后就多出一项。\n", + "\n", + "\"image-20211109143104052\"\n", + "\n", + "$p(\\mathbf{l}|\\mathbf{x})$​ 可以通过任意时刻 t 的所有 s 的 foward-backward 概率计算得来。取负对数后就是单个样本的NLL(Negative Log Likelihood)。\n", + "\n", + "### 总结\n", + "\n", + "总结一下,根据前向概率计算CTCLoss函数,可以得出如下结论:\n", + "\n", + "1. 对于时序长度为T的输入序列x和输出序列z,前向概率:\n", + " $$\n", + " \\begin{split}\n", + " \\alpha_t(s) &= \\sum_{ \\underset{\\pi_t=l'_s}{\\pi \\in \\mathcal{B}^{-1}(z)} } p(\\pi_{1:t}|x) \\newline\n", + " \\alpha_1(1) &= y_{-}^1 ; \\quad \\alpha_1(2)=y^1_{l'_2}, \\quad \\alpha_1(s)=0, \\forall s > 2 \\newline\n", + " \\alpha_t(s) &= 0, \\quad \\forall s < |l'| - 2(T-t) - 1 ,\\quad \\text{or} \\quad \\forall s < 1 \\newline\n", + " \\alpha_t(s) &=\n", + " \\begin{cases}\n", + " (\\alpha_{t-1}(s) + \\alpha_{t-1}(s-1) ) y^t_{l'_s} & \\text{if $l'_s=b$ or $l'_{s-2} = l'_s$​} \\newline\n", + " (\\alpha_{t-1}(s) + \\alpha_{t-1}(s-1) + \\alpha_{t-1}(s-2))y^t_{l'_s} & \\text{otherwise}\\newline\n", + " \\end{cases} \n", + " \\end{split}\n", + " $$\n", + "\n", + "2. 利用 $\\alpha_t(s)$ 计算CTCLoss:\n", + " $$\n", + " -ln(p(l \\mid x)) = -ln(\\alpha_{T}(|l'|)+\\alpha_{T}(|l'|-1))\n", + " $$\n", + "\n", + "根据后向概率计算CTCLoss函数,可以得出如下结论:\n", + "\n", + "1. 对于时序长度为T的输入序列x和输出序列z,后向概率: \n", + " $$\n", + " \\begin{split}\n", + " \\beta_t(s) &= \\sum_{ \\underset{\\pi_t=l'_s}{\\pi \\in \\mathcal{B}^{-1}(z)} } p(\\pi_{t:T}|x) \\newline\n", + " \\beta_T(|l'|) &= y_{-}^T ; \\quad \\beta_T(|l'|-1)=y^T_{l'_{|l'|-1}}, \\quad \\beta_T(s)=0, \\forall s < |l'| - 1 \\newline\n", + " \\beta_t(s) &= 0, \\text{$\\forall s > 2t$ or $\\forall s < |l'|$} \\newline\n", + " \\beta_t(s) &=\n", + " \\begin{cases}\n", + " (\\beta_{t+1}(s) + \\beta_{t+1}(s+1) ) y^t_{l'_s} & \\text{if $l'_s=b$ or $l'_{s+2} = l'_s$} \\newline\n", + " (\\beta_{t+1}(s) + \\beta_{t+1}(s+1) + \\beta_{t+1}(s+2))y^t_{l'_s} & \\text{otherwise}\\newline\n", + " \\end{cases}\n", + " \\end{split}\n", + " $$\n", + "\n", + " 2. 利用 $\\beta_t(s)$计算CTCLoss:\n", + "\n", + "$$\n", + "-ln(p(l \\mid x)) = -ln(\\beta_{1}(1)+\\beta_{1}(2)) \\newline\n", + "$$\n", + "\n", + "根据任意时刻的前向概率和后向概率计算CTC Loss函数,得到如下结论:\n", + "\n", + "1. 
对于任意时刻t,利用前向概率和后向概率计算CTCLoss:\n", + "\n", + "$$\n", + "p(l \\mid x) = \\sum_{s=1}^{|l'|} \\frac{\\alpha_t(s)\\beta_t(s)}{y_{l'_s}^t} \\newline\n", + "-ln(p(l \\mid x)) = -ln( \\sum_{s=1}^{|l'|} \\frac{\\alpha_t(s) \\beta_t(s)}{y_{l'_s}^t} )\n", + "$$\n", + "我们已经得到CTCLoss的计算方法,接下来对其进行求导。\n" + ] + }, + { + "cell_type": "markdown", + "id": "viral-fitting", + "metadata": {}, + "source": [ + "## CTC梯度计算\n", + "\n", + "### 微分公式\n", + "\n", + "在计算梯度前,我们先回顾下基本的微分公式: \n", + "$$\n", + "C' = 0 \\\\\n", + "x' = 1 \\newline\n", + "x^n = n \\cdot x^{n-1} \\newline\n", + "(e^x)' = e^x \\newline\n", + "log(x)' = \\frac{1}{x} \\newline\n", + "(u + v)' = u' + v' \\newline\n", + "(\\frac{u}{v})' = \\frac{u'v-uv'}{v^2} \\newline\n", + "\\frac{\\mathrm{d}f(g(x))}{\\mathrm{d}x} = \\frac{\\mathrm{d}f(g(x))}{\\mathrm{d}g(x)} \\cdot \\frac{\\mathrm{d}g(x)}{\\mathrm{d}x}\n", + "$$\n" + ] + }, + { + "cell_type": "markdown", + "id": "starting-sender", + "metadata": {}, + "source": [ + "### CTC梯度\n", + "\n", + "最大似然估计训练就是最大化训练集中每一个分类的对数概率,即最小化Eq. 12。\n", + "\n", + "\"image-20211108164206136\"\n", + "\n", + "最后就是算微分了, 整个推导过程就是加法和乘法, 都可以微分。 $\\mathit{O}^{ML}$关于神经网络的输出 $y^t_k$的梯度见Eq. 13。因为训练样本是相互独立的,所以可以单独考虑每个样本,公式如Eq.13。\n", + "\n", + "下面是CTCLoss的梯度计算:\n", + "\n", + "\"image-20211109143622448\"\n" + ] + }, + { + "cell_type": "markdown", + "id": "stretch-order", + "metadata": {}, + "source": [ + "### CTC梯度推导\n", + "\n", + "回顾下之前的公式,便于理解后续推导过程。 \n", + "\n", + "$$\n", + "p(l \\mid x) = \\sum_{s=1}^{|l'|} \\frac{\\alpha_t(s)\\beta_t(s)}{y_{l'_s}^t} \\\\\n", + "\\begin{equation}\n", + "\\alpha_t(s) \\beta_t(s) = \\sum_{ \\underset{\\pi_t=l'_s}{\\pi \\in \\mathcal{B}^{-1}(l):} } y^t_{l'_s} \\prod_{t=1}^T y^t_{\\pi_t}\n", + "\\end{equation}\n", + "$$\n", + "\n", + "其中Eq. 15的计算过程如下: \n", + "\n", + "$$\n", + "\\begin{align*}\n", + "\\frac{\\partial p(\n", + "l \\mid x)}{\\partial y_k^t}\n", + " & = \\sum_{s \\in lab(z,k)} \\frac{ \\partial \\frac{ \\alpha_t(s) \\beta_t(s)}{y_{k}^t}}{\\partial y_k^t} \n", + " \\newline\n", + " & = \\sum_{s \\in lab(z,k)} \\frac{(\\alpha_t(s)\\beta_t(s))’y_k^t - \\alpha_t(s)\\beta_t(s){y_k^t}'}{{y_k^t}^2}\n", + " \\newline\n", + " &= \\sum_{s \\in lab(z,k)} \\frac{( \\prod_{t'=1}^{t-1} y^{t'}_{\\pi_{t'}} \\cdot y_k^t \\cdot y_k^t \\cdot \\prod_{t'=t+1}^{T} y^{t'}_{\\pi_{t'}} )’ y_k^t - \\alpha_t(s)\\beta_t(s){y_k^t}'}{{y_k^t}^2}\n", + " \\newline\n", + " &= \\sum_{s \\in lab(z,k)} \\frac{2\\alpha_t(s)\\beta_t(s) - \\alpha_t(s)\\beta_t(s)}{{y_k^t}^2}\n", + " \\newline\n", + " &= \\sum_{s \\in lab(z,k)} \\frac{\\alpha_t(s)\\beta_t(s)}{{y_k^t}^2}\n", + " \\newline\n", + " &= \\frac{1}{{y_k^t}^2} \\sum_{s \\in lab(z,k)} \\alpha_t(s)\\beta_t(s) \\tag{1} \\newline\n", + "\\end{align*}\n", + "$$\n", + "\n", + "\n", + "NLL的公式推导如下:\n", + "$$\n", + "\\begin{split}\n", + "\\frac{\\partial {ln(p(l \\mid x))} }{ \\partial y^t_k }\n", + " &= \\frac{1}{p(l \\mid x)} \\frac{ \\partial{p(l \\mid x)} }{ \\partial y_k^t } \\newline\n", + " &= \\frac{1}{p(l \\mid x) {y^t_k}^2 } \\sum_{s \\in lab(z,k)} \\alpha_t(s)\\beta_t(s) \n", + "\\end{split}\n", + "\\tag{2}\n", + "$$\n", + "\n", + "\n", + "已经算出了CTCLoss对于 $y_k^t$​ 的梯度,接下来我们需要计算 CTCLoss对于$u^t_k$​(logits)的梯度。套用链式法则,并替换$y^t_k$​ 为 $y^t_{k'}$​,结果如下图。图中 $k'$​ 表示vocab中的某一个token,$K$​​ 是vocab的大小。\n", + "\n", + "![](./img/ctc_loss_backward_2.png)\n", + "\n", + "图中公式4根据链式法则得到:\n", + "$$\n", + "- \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial u^t_k }\n", + " = - \\sum_{k'=1}^{K} \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial y^t_{k'} } \\frac{ \\partial y^t_{k'} }{ 
\\partial u^t_k } \\tag{4}\n", + "$$\n", + "图中公式3是softmax的梯度,参考 [4],计算过程如下:\n", + "$$\n", + "softmax(j) = S_j = \\frac{ e^{a_j} }{ \\sum_{k=1}^K e^{a_k} }, \\enspace \\forall j \\in 1 \\dots K\n", + "$$\n", + "\n", + "$$\n", + "\\begin{split}\n", + "\\frac{ \\partial S_i }{ \\partial a_j}\n", + " &= \\frac{ \\partial (\\frac{ e^{ a_i } }{ \\sum_k e^{ a_k } }) } { \\partial a_j }\n", + " \\newline\n", + " &= \n", + " \\begin{cases}\n", + " \t\\frac{ e^a_i \\sum - e^a_j e^a_i }{ \\sum^2 } \n", + " \t&= \\frac{ e^a_i }{ \\sum } \\frac{ \\sum - e^a_j }{ \\sum } \\newline\n", + " &= S_i(1-S_j) & \\text{i = j, $\\sum$ stands for $\\sum_{k=1}^K e^a_k$} \n", + " \t\\newline\n", + " \t\\frac{ 0 - e^a_j e^a_i }{ \\sum^2 } \n", + " \t&= - \\frac{ e^a_j }{ \\sum } \\frac{ e^a_i }{ \\sum } \\newline\n", + " &= -S_j S_i & \\text{i $\\neq$ j, $\\sum$ stands for $\\sum_{k=1}^K e^a_k$}\n", + " \\end{cases}\n", + " \\newline\n", + " &= \n", + " \\begin{cases}\n", + " S_i(1 - S_j) & \\text{$i = j$} \n", + " \\newline\n", + " -S_j S_i = S_i (0 - S_j) & \\text{$i \\neq j$}\n", + " \\end{cases}\n", + " \\newline\n", + " &= S_i (\\delta_{ij} - S_j )\n", + "\\end{split}\n", + "\\tag{3}\n", + "$$\n", + "$$\n", + "\\delta_{ij} =\n", + " \\begin{cases}\n", + " 1 & \\text{if i = j} \\newline\n", + " 0 & \\text{otherwise}\n", + " \\end{cases}\n", + "$$\n", + "\n", + "\n", + "\n", + "下图中黄色框中的部分表示公式(1),即遍历所有的vocab中的token,其结果是$p(l \\mid x)$​。这是因为label $l$​ 中的token一定在vocab中,且 $s \\in lab(l, k')$​ 可以是空集。当 $k'$​ 在 l 中,s 则为label中token是$k'$​的概率;当$k'$​​​不在l中,s为空,概率为0。\n", + "\n", + "![img](./img/ctc_loss_backward_3.png)\n", + "\n", + "公式(2),(3)带入(4),并结合公式(1)的结果如上图右边,即:\n", + "$$\n", + "\\begin{split}\n", + "- \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial u^t_k } &= \n", + "\t- \\sum_{k'=1}^K \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial y^t_{k'} } \\frac{ \\partial y^t_{k'}}{ \\partial u^t_k } \\newline\n", + "\t&= - \\sum_{k'=1}^K \\frac{ y^t_{k'}( \\delta_{kk'} - y^t_k ) }{ p(l \\mid x) {y^t_{k'}}^2 } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n", + "\t&= - \\sum_{k'=1}^K \\frac{ \\delta_{kk'} - y^t_k }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n", + "\t&= \\sum_{k'=1}^K \\frac{ y^t_k - \\delta_{kk'} }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n", + "\t&= \\sum_{k'=1}^K \\frac{ y^t }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) - \\sum_{k'=1}^K \\frac{ \\delta_{kk'} }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n", + "\t&= \\frac{ y^t_k }{ p(l \\mid x) } ( \\sum_{k'=1}^K \\frac{1}{y^t_{k'}} \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) ) - \\sum_{k'=1}^K \\frac{ \\delta_{kk'} }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n", + "\t&= \\frac{ y^t_k }{ p(l \\mid x) } p(l \\mid x) - \\sum_{k'=1}^K \\frac{ \\delta_{kk'} }{ p(l \\mid x) y^t_{k'} } \\sum_{s \\in lab(l, k') } \\alpha_t(s) \\beta_t(s) \\newline\n", + "\t&= y^t_k - \\frac{ 1 }{ p(l \\mid x) y^t_k } \\sum_{s \\in lab(l, k)} \\alpha_t(s) \\beta_t(s) \\newline\n", + "\\end{split}\n", + "$$\n", + "最终,为了通过softmax层传播CTCLoss的梯度,需要计算目标函数与 logits $u^t_k$ 的偏微分,即Eq. 
16: \n", + " $$\n", + " \\begin{align*}\n", + " \\hat{\\alpha}_t(s) & \\overset{def}{=} \\frac{ \\alpha_t(s) }{ C_t } ,\\enspace C_t \\overset{def}{=} \\sum_s \\alpha_t(s) \n", + " \\newline\n", + " \\hat{\\beta}_t(s) & \\overset{def}{=} \\frac{ \\beta_t(s) }{ D_t } ,\\enspace D_t \\overset{def}{=} \\sum_s \\beta_t(s) \n", + " \\newline\n", + " - \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial u^t_k } &= y^t_k - \\frac{1}{y^t_k \\sum_{s=1}^{\\mid l' \\mid} \\frac{ \\hat{\\alpha}_t(s) \\hat{\\beta}_t(s) }{ y^t_{l'_s} } } \\sum_{s \\in lab(l, k)} \\hat{\\alpha}_t(s) \\hat{\\beta}_t(s) \\tag{16} \n", + " \\newline\n", + " \\end{align*}\n", + " $$" + ] + }, + { + "cell_type": "markdown", + "id": "informative-maria", + "metadata": {}, + "source": [ + "### 总结\n", + "\n", + "* 通过动态规划算法计算$\\alpha_t(s)$ 和 $\\beta_t(s)$\n", + "\n", + "* 通过$\\alpha_t(s)$ 计算 $p(l \\mid x)=\\alpha_T(\\mid l' \\mid) + \\alpha_T(\\mid l' \\mid -1)$\n", + "\n", + "* 通过$\\alpha_t(s)$ 和 $\\beta_t(s)$\n", + "\n", + "* 计算CTcLoss函数的导数: \n", + " $$\n", + " \\begin{split}\n", + " - \\frac{ \\partial ln(p(l \\mid x)) }{ \\partial u^t_k } \n", + " &= y^t_k - \\frac{ 1 }{ p(l \\mid x) y^t_k } \\sum_{s \\in lab(l, k)} \\alpha_t(s) \\beta_t(s) \n", + " \\newline\n", + " &= y^t_k - \\frac{1}{y^t_k \\sum_{s=1}^{\\mid l' \\mid} \\frac{ \\hat{\\alpha}_t(s) \\hat{\\beta}_t(s) }{ y^t_{l'_s} } } \\sum_{s \\in lab(l, k)} \\hat{\\alpha}_t(s) \\hat{\\beta}_t(s) \n", + " \\newline\n", + " \\end{split}\n", + " \\tag{16}\n", + " $$" + ] + }, + { + "cell_type": "markdown", + "id": "coordinated-music", + "metadata": {}, + "source": [ + "## Reference\n", + "\n", + "[[1] A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber. Connectionist Temporal lassification: Labeling Unsegmented Sequence Data with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, pp. 
369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)\n", + "\n", + "[[2] Sequence ModelingWith CTC](https://distill.pub/2017/ctc/)\n", + "\n", + "[[3] NLP 之 CTC Loss 的工作原理](https://www.jianshu.com/p/e073c9d91b20)\n", + "\n", + "[[4] The Softmax function and its derivative](https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/)\n", + "\n", + "[[5] CTC Algorithm Explained Part 1:Training the Network(CTC算法详解之训练篇)](https://xiaodu.io/ctc-explained/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "closing-candy", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/topic/ctc/img/ctc_loss_alpha_definition.png b/docs/topic/ctc/img/ctc_loss_alpha_definition.png new file mode 100644 index 00000000..c6c35cd3 Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_alpha_definition.png differ diff --git a/docs/topic/ctc/img/ctc_loss_alpha_recurse.png b/docs/topic/ctc/img/ctc_loss_alpha_recurse.png new file mode 100644 index 00000000..de7555cb Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_alpha_recurse.png differ diff --git a/docs/topic/ctc/img/ctc_loss_alpha_recurse_2.png b/docs/topic/ctc/img/ctc_loss_alpha_recurse_2.png new file mode 100644 index 00000000..a8a67f04 Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_alpha_recurse_2.png differ diff --git a/docs/topic/ctc/img/ctc_loss_backward_1.png b/docs/topic/ctc/img/ctc_loss_backward_1.png new file mode 100644 index 00000000..558f8145 Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_backward_1.png differ diff --git a/docs/topic/ctc/img/ctc_loss_backward_2.png b/docs/topic/ctc/img/ctc_loss_backward_2.png new file mode 100644 index 00000000..d81c060a Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_backward_2.png differ diff --git a/docs/topic/ctc/img/ctc_loss_backward_3.png b/docs/topic/ctc/img/ctc_loss_backward_3.png new file mode 100644 index 00000000..8486904d Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_backward_3.png differ diff --git a/docs/topic/ctc/img/ctc_loss_backward_recurse.png b/docs/topic/ctc/img/ctc_loss_backward_recurse.png new file mode 100644 index 00000000..4ee4bb19 Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_backward_recurse.png differ diff --git a/docs/topic/ctc/img/ctc_loss_cat_lattice.png b/docs/topic/ctc/img/ctc_loss_cat_lattice.png new file mode 100644 index 00000000..969758ee Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_cat_lattice.png differ diff --git a/docs/topic/ctc/img/ctc_loss_forward_backward.png b/docs/topic/ctc/img/ctc_loss_forward_backward.png new file mode 100644 index 00000000..3194e502 Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_forward_backward.png differ diff --git a/docs/topic/ctc/img/ctc_loss_forward_backward_to_loss.png 
b/docs/topic/ctc/img/ctc_loss_forward_backward_to_loss.png new file mode 100644 index 00000000..50136283 Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_forward_backward_to_loss.png differ diff --git a/docs/topic/ctc/img/ctc_loss_forward_loss.png b/docs/topic/ctc/img/ctc_loss_forward_loss.png new file mode 100644 index 00000000..3941baeb Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_forward_loss.png differ diff --git a/docs/topic/ctc/img/ctc_loss_gradient_of_y_hat.png b/docs/topic/ctc/img/ctc_loss_gradient_of_y_hat.png new file mode 100644 index 00000000..024da319 Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_gradient_of_y_hat.png differ diff --git a/docs/topic/ctc/img/ctc_loss_gradient_with_y.png b/docs/topic/ctc/img/ctc_loss_gradient_with_y.png new file mode 100644 index 00000000..8a99873f Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_gradient_with_y.png differ diff --git a/docs/topic/ctc/img/ctc_loss_prob_l_x.png b/docs/topic/ctc/img/ctc_loss_prob_l_x.png new file mode 100644 index 00000000..90fb6234 Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_prob_l_x.png differ diff --git a/docs/topic/ctc/img/ctc_loss_prob_pi_x.png b/docs/topic/ctc/img/ctc_loss_prob_pi_x.png new file mode 100644 index 00000000..46af9775 Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_prob_pi_x.png differ diff --git a/docs/topic/ctc/img/ctc_loss_rescale_loss.png b/docs/topic/ctc/img/ctc_loss_rescale_loss.png new file mode 100644 index 00000000..aa761c8f Binary files /dev/null and b/docs/topic/ctc/img/ctc_loss_rescale_loss.png differ diff --git a/docs/tutorial/tts/source/fastpitch.png b/docs/tutorial/tts/source/fastpitch.png new file mode 100644 index 00000000..3e55b1e3 Binary files /dev/null and b/docs/tutorial/tts/source/fastpitch.png differ diff --git a/docs/tutorial/tts/source/fastspeech2.png b/docs/tutorial/tts/source/fastspeech2.png new file mode 100644 index 00000000..08ff611c Binary files /dev/null and b/docs/tutorial/tts/source/fastspeech2.png differ diff --git a/docs/tutorial/tts/source/frog_prince.jpg b/docs/tutorial/tts/source/frog_prince.jpg new file mode 100644 index 00000000..cdbac0c9 Binary files /dev/null and b/docs/tutorial/tts/source/frog_prince.jpg differ diff --git a/docs/tutorial/tts/source/ocr.wav b/docs/tutorial/tts/source/ocr.wav new file mode 100644 index 00000000..3a2ea19c Binary files /dev/null and b/docs/tutorial/tts/source/ocr.wav differ diff --git a/docs/tutorial/tts/source/ocr_result.jpg b/docs/tutorial/tts/source/ocr_result.jpg new file mode 100644 index 00000000..d55b6137 Binary files /dev/null and b/docs/tutorial/tts/source/ocr_result.jpg differ diff --git a/docs/tutorial/tts/source/pwgan.png b/docs/tutorial/tts/source/pwgan.png new file mode 100644 index 00000000..4dd2e58f Binary files /dev/null and b/docs/tutorial/tts/source/pwgan.png differ diff --git a/docs/tutorial/tts/source/signal_pipeline.png b/docs/tutorial/tts/source/signal_pipeline.png new file mode 100644 index 00000000..29284008 Binary files /dev/null and b/docs/tutorial/tts/source/signal_pipeline.png differ diff --git a/docs/tutorial/tts/source/text_frontend_struct.png b/docs/tutorial/tts/source/text_frontend_struct.png new file mode 100644 index 00000000..1f2c78a6 Binary files /dev/null and b/docs/tutorial/tts/source/text_frontend_struct.png differ diff --git a/docs/tutorial/tts/source/tts_lips.mp4 b/docs/tutorial/tts/source/tts_lips.mp4 new file mode 100644 index 00000000..4db45b7a Binary files /dev/null and b/docs/tutorial/tts/source/tts_lips.mp4 differ 
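The CTC forward computation derived in `docs/topic/ctc/ctc_loss.ipynb` above can be checked numerically with a short NumPy sketch of the alpha recursion and the resulting negative log-likelihood. This is an unscaled, illustrative implementation with assumed names; it is not the notebook's code and it omits the rescaling step, so it is only suitable for short toy inputs.

```python
# Illustrative sketch only: CTC forward (alpha) recursion and NLL, without rescaling.
import numpy as np

def ctc_forward_nll(probs: np.ndarray, labels, blank: int = 0) -> float:
    """probs: (T, K) per-frame softmax outputs y^t_k; labels: target token ids without blanks."""
    T = probs.shape[0]
    # Extended label l': a blank before, between and after every label symbol.
    ext = [blank]
    for s in labels:
        ext += [s, blank]
    S = len(ext)

    alpha = np.zeros((T, S))
    # Initialization: the first frame may emit the leading blank or the first label symbol.
    alpha[0, 0] = probs[0, blank]
    if S > 1:
        alpha[0, 1] = probs[0, ext[1]]

    for t in range(1, T):
        for s in range(S):
            a = alpha[t - 1, s]                       # stay on the same state
            if s >= 1:
                a += alpha[t - 1, s - 1]              # move from the previous state
            if s >= 2 and ext[s] != blank and ext[s] != ext[s - 2]:
                a += alpha[t - 1, s - 2]              # skip a blank between different symbols
            alpha[t, s] = a * probs[t, ext[s]]

    # p(l|x) = alpha_T(|l'|) + alpha_T(|l'| - 1); the NLL is its negative log.
    p = alpha[T - 1, S - 1] + (alpha[T - 1, S - 2] if S > 1 else 0.0)
    return float(-np.log(p))

# Toy example: 4 frames, vocab {blank, a, b}, target label sequence "a b".
rng = np.random.default_rng(0)
logits = rng.random((4, 3))
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
print(ctc_forward_nll(probs, labels=[1, 2]))
```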
diff --git a/docs/tutorial/tts/source/tts_pipeline.png b/docs/tutorial/tts/source/tts_pipeline.png new file mode 100644 index 00000000..167dfe81 Binary files /dev/null and b/docs/tutorial/tts/source/tts_pipeline.png differ diff --git a/docs/tutorial/tts/tts_tutorial.ipynb b/docs/tutorial/tts/tts_tutorial.ipynb new file mode 100644 index 00000000..2d617395 --- /dev/null +++ b/docs/tutorial/tts/tts_tutorial.ipynb @@ -0,0 +1,958 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Story Talker" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 用 PaddleOCR 识别图片中的文字" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "img_path = 'source/frog_prince.jpg'\n", + "im = Image.open(img_path)\n", + "im.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 使用 TTS 合成的音频" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import IPython.display as dp\n", + "dp.Audio(\"source/ocr.wav\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "具体实现代码请参考: https://github.com/DeepSpeech/demos/story_talker/run.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 元宇宙来袭,构造你的虚拟人!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 使用 PaddleGAN 合成的唇形视频" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from IPython.display import HTML\n", + "html_str = '''\n", + "\n", + "'''.format(\"output/tts_lips.mp4\")\n", + "dp.display(HTML(html_str))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "具体实现代码请参考: https://github.com/DeepSpeech/demos/metaverse/run.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 前言\n", + "

\n", + "近年来,随着深度学习算法上的进步以及不断丰厚的硬件资源条件,**文本转语音(Text-To-Speech, TTS)** 技术在智能语音助手、虚拟娱乐等领域得到了广泛的应用。本教程将结合背景知识,让用户能够使用PaddlePaddle完成文本转语音任务,并结合光学字符识别(Optical Character Recognition,OCR)、自然语言处理(Natural Language Processing,NLP)等技术“听”书、让名人开口说话。\n", + "\n", + "

\n", + "## 背景知识\n", + "

\n", + "为了更好地了解文本转语音任务的要素,我们先简要地回顾一下文本转语音的发展历史。如果你对此已经有所了解,或希望能尽快使用代码实现,请跳至第二章。\n", + "\n", + "

\n", + "### 定义\n", + "

\n", + " \n", + " 文本转语音,又称语音合成(Speech Sysnthesis),指的是将一段文本按照一定需求转化成对应的音频,这种特性决定了的输出数据比输入输入长得多。文本转语音是一项包含了语义学、声学、数字信号处理以及机器学习的等多项学科的交叉任务。虽然辨识低质量音频文件的内容对人类来说很容易,但这对计算机来说并非易事。\n", + "\n", + "\n", + "> Note: 这里可以提供一下资料出处嘛? 2021/11/09\n", + "

\n", + "\n", + "按照不同的应用需求,更广义的语音合成研究包括:\n", + "- 语音转换(Voice Transformation/Conversion)\n", + " - 说话人转换\n", + " - 语音到歌唱转换(Speech to Singing)\n", + " - 语音情感转换\n", + " - 口音转换\n", + "- 歌唱合成 (Singing Synthesis)\n", + " - 歌词到歌唱转换(Text/Lyric to Singing)\n", + "- 可视语音合成(Visual Speech Synthesis)\n", + "\n", + "

\n", + "### 发展历史\n", + "

\n", + "\n", + "#### 机械式语音合成(19世纪及以前)\n", + "在第二次工业革命之前,语音的合成主要以机械式的音素合成为主。1779年,德裔丹麦科学家 Christian Gottlieb Kratzenstein 建造了人类的声道模型,使其可以产生五个长元音。1791年, Wolfgang von Kempelen 添加了唇和舌的模型,使其能够发出辅音和元音。\n", + "#### 电子语音合成(20世纪30年代)\n", + "贝尔实验室于20世纪30年代发明了声码器(Vocoder),将语音自动分解为音调和共振,此项技术由 Homer Dudley 改进为键盘式合成器并于 1939年纽约世界博览会展出。\n", + "#### 电子语音合成\n", + "第一台基于计算机的语音合成系统起源于 20 世纪 50 年代。1961 年,IBM 的 John Larry Kelly,以及 Louis Gerstman 使用 IBM 704 计算机合成语音,成为贝尔实验室最著名的成就之一。 1975年,第一代语音合成系统之一 —— MUSA(MUltichannel Speaking Automation)问世,其由一个独立的硬件和配套的软件组成。1978年发行的第二个版本也可以进行无伴奏演唱。90 年代的主流是采用 MIT 和贝尔实验室的系统,并结合自然语言处理模型。\n", + "> Note: 这里插一张timeline图\n", + "#### 当前的主流方法\n", + "\n", + "- 基于统计参数的语音合成\n", + " - 隐马尔可夫模型(Hidden Markov Model,HMM)\n", + " - 深度学习网络(Deep Neural Network,DNN)\n", + "- 波形拼接语音合成\n", + " \n", + "- 混合方法\n", + " - 参数轨迹指导的波形拼接\n", + "- 端到端神经网络语音合成\n", + " - 声学模型 + 声码器\n", + " - “完全”端到端方法\n", + "\n", + "

\n", + "## 基于深度学习的语音合成技术\n", + "

\n", + "### 语音合成基本知识\n", + "

\n", + "![信号处理流水线](source/signal_pipeline.png)\n", + "

\n", + "语音合成流水线包含 **文本前端(Text Frontend)****声学模型(Acoustic Model)****声码器(Vocoder)** 三个主要模块:\n", + "- 通过文本前端模块将原始文本转换为字符/音素。\n", + "- 通过声学模型将字符/音素转换为声学特征,如线性频谱图、mel 频谱图、LPC 特征等。\n", + "- 通过声码器将声学特征转换为波形。\n", + "

\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 实践\n", + "

\n", + "环境安装请参考: https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md\n", + "\n", + "

\n", + "\n", + "使用 **PaddleSpeech** 提供的预训练模型合成一句中文。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step 0 准备" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取预训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!mkdir download\n", + "!wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip\n", + "!unzip -d download download/pwg_baker_ckpt_0.4.zip\n", + "!wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip\n", + "!unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 查看预训练模型的结构" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!tree download/pwg_baker_ckpt_0.4\n", + "!tree download/fastspeech2_nosil_baker_ckpt_0.4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 导入 Python 包" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import logging\n", + "import sys\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "# PaddleSpeech 项目根目录放到 python 路径中\n", + "sys.path.insert(0,\"../../../\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import argparse\n", + "import os\n", + "from pathlib import Path\n", + "import IPython.display as dp\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import paddle\n", + "import soundfile as sf\n", + "import yaml\n", + "from paddlespeech.t2s.frontend.zh_frontend import Frontend\n", + "from paddlespeech.t2s.models.fastspeech2 import FastSpeech2\n", + "from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference\n", + "from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator\n", + "from paddlespeech.t2s.models.parallel_wavegan import PWGInference\n", + "from paddlespeech.t2s.modules.normalizer import ZScore\n", + "from yacs.config import CfgNode" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 设置预训练模型的路径" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "fastspeech2_config = \"download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml\"\n", + "fastspeech2_checkpoint = \"download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz\"\n", + "fastspeech2_stat = \"download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy\"\n", + "pwg_config = \"download/pwg_baker_ckpt_0.4/pwg_default.yaml\"\n", + "pwg_checkpoint = \"download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz\"\n", + "pwg_stat = \"download/pwg_baker_ckpt_0.4/pwg_stats.npy\"\n", + "phones_dict = \"download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt\"\n", + "# 读取 conf 文件并结构化\n", + "with open(fastspeech2_config) as f:\n", + " fastspeech2_config = CfgNode(yaml.safe_load(f))\n", + "with open(pwg_config) as f:\n", + " pwg_config = CfgNode(yaml.safe_load(f))\n", + "print(\"========Config========\")\n", + "print(fastspeech2_config)\n", + "print(\"---------------------\")\n", + "print(pwg_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step1 
文本前端\n", + "

\n", + "\n", + "一个文本前端模块主要包含:\n", + "- 分段(Text Segmentation)\n", + "\n", + "- 文本正则化(Text Normalization, TN)\n", + "\n", + "- 分词(Word Segmentation, 主要是在中文中)\n", + "\n", + "- 词性标注(Part-of-Speech, PoS)\n", + "- 韵律预测(Prosody)\n", + "- 字音转换(Grapheme-to-Phoneme,G2P)\n", + "

\n", + "(Grapheme: **语言**书写系统的最小有意义单位; Phoneme: 区分单词的最小**语音**单位)\n", + " - 多音字(Polyphone)\n", + " - 变调(Tone Sandhi)\n", + " - “一”、“不”变调\n", + " - 三声变调\n", + " - 轻声变调\n", + " - 儿化音\n", + " - 方言\n", + "- ...\n", + "

\n", + "\n", + "(输入给声学模型之前,还需要把音素序列转换为 id)\n", + "\n", + "

\n", + "其中最重要的模块是 文本正则化 模块和 字音转换(TTS 中更常用 G2P代指) 模块。\n", + "\n", + "

\n", + "\n", + "各模块输出示例:\n", + "```text\n", + "• Text: 全国一共有112所211高校\n", + "• Text Normalization: 全国一共有一百一十二所二一一高校\n", + "• Word Segmentation: 全国/一共/有/一百一十二/所/二一一/高校/\n", + "• G2P(注意此句中“一”的读音):\n", + " quan2 guo2 yi2 gong4 you3 yi4 bai3 yi1 shi2 er4 suo3 er4 yao1 yao1 gao1 xiao4\n", + " (可以进一步把声母和韵母分开)\n", + " q uan2 g uo2 y i2 g ong4 y ou3 y i4 b ai3 y i1 sh i2 er4 s uo3 er4 y ao1 y ao1 g ao1 x iao4\n", + " (把音调和声韵母分开)\n", + " q uan g uo y i g ong y ou y i b ai y i sh i er s uo er y ao y ao g ao x iao\n", + " 0 2 0 2 0 2 0 4 0 3 ...\n", + "• Prosody (prosodic words #1, prosodic phrases #2, intonation phrases #3, sentence #4):\n", + " 全国#2一共有#2一百#1一十二所#2二一一#1高校#4\n", + " (分词的结果一般是固定的,但是不同人习惯不同,可能有不同的韵律)\n", + "```\n", + "\n", + "

\n", + "文本前端模块的设计需要融入很多专业的或经验性的知识,人类在读文本的时候可以自然而然地读出正确的发音,但是这些计算机都是不知道的!\n", + "\n", + "

\n", + "分词:\n", + "```text\n", + "我也想过过过儿过过的生活\n", + "我也想/过过/过儿/过过的/生活\n", + "\n", + "货拉拉拉不拉拉布拉多\n", + "货拉拉/拉不拉/拉布拉多\n", + "\n", + "南京市长江大桥\n", + "南京市长/江大桥\n", + "南京市/长江大桥\n", + "```\n", + "变调和儿化音:\n", + "```\n", + "你要不要和我们一起出去玩?\n", + "你要不(2声)要和我们一(4声)起出去玩(儿)?\n", + "\n", + "不好,我要一个人出去。\n", + "不(4声)好,我要一(2声)个人出去。\n", + "\n", + "(以下每个词的所有字都是三声的,请你读一读,体会一下在读的时候,是否每个字都被读成了三声?)\n", + "纸老虎、虎骨酒、展览馆、岂有此理、手表厂有五种好产品\n", + "```\n", + "多音字(通常需要先正确分词):\n", + "```text\n", + "人要行,干一行行一行,一行行行行行;\n", + "人要是不行,干一行不行一行,一行不行行行不行。\n", + "\n", + "佟大为妻子产下一女\n", + "\n", + "海水朝朝朝朝朝朝朝落\n", + "浮云长长长长长长长消\n", + "```\n", + "

\n", + "\n", + "PaddleSpeech TTS 文本前端解决方案:\n", + "- 文本正则: 规则\n", + "- G2P:\n", + " - 多音字模块: pypinyin/g2pM\n", + " - 变调模块: 用分词 + 规则\n", + "\n", + "

\n", + "相关 examples:\n", + " \n", + "https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tn\n", + "https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/g2p\n", + "\n", + "

\n", + "(未来计划推出基于深度学习的文本前端模块)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 构造文本前端对象" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# 传入 phones_dict 会把相应的 phones 转换成 phone_ids\n", + "frontend = Frontend(phone_vocab_path=phones_dict)\n", + "print(\"Frontend done!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 调用文本前端" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "input = \"你好,欢迎使用百度飞桨框架进行深度学习研究!\"\n", + "# input = \"我每天中午12:00起床\"\n", + "# input = \"我出生于2005/11/08,那天的最低气温达到-10°C\"\n", + "# text norm 时会进行分句,merge_sentences 表示把分句的结果合成一条\n", + "# 可以把 merge_sentences 设置为 False, 多个子句并行调用声学模型和声码器提升合成速度\n", + "input_ids = frontend.get_input_ids(input, merge_sentences=True, print_info=True)\n", + "# 由于 merge_sentences=True, input_ids[\"phone_ids\"][0] 即表示整句的 phone_ids\n", + "phone_ids = input_ids[\"phone_ids\"][0]\n", + "print(\"phone_ids:\")\n", + "print(phone_ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step1+ 文本前端深度学习化\n", + "

\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step2 声学模型\n", + "

\n", + "声学模型将字符/音素转换为声学特征,如线性频谱图、mel 频谱图、LPC 特征等,声学特征以 “帧” 为单位,一般一帧是 10ms 左右,一个音素一般对应 5~20 帧左右, 声学模型需要解决的是 “不等长序列间的映射问题”,“不等长”是指,同一个人发不同音素的持续时间不同,同一个人在不同时刻说同一句话的语速可能不同,对应各个音素的持续时间不同,不同人说话的特色不同,对应各个音素的持续时间不同。这是一个困难的“一对多”问题。\n", + "```\n", + "# 卡尔普陪外孙玩滑梯\n", + "000001|baker_corpus|sil 20 k 12 a2 4 er2 10 p 12 u3 12 p 9 ei2 9 uai4 15 s 11 uen1 12 uan2 14 h 10 ua2 11 t 15 i1 16 sil 20\n", + "```\n", + "\n", + "声学模型主要分为自回归模型和非自回归模型,其中自回归模型在 `t` 时刻的预测需要依赖 `t-1` 时刻的输出作为输入,预测时间长,但是音质相对较好,非自回归模型不存在预测上的依赖关系,预测时间快,音质相对较差。\n", + "\n", + "

\n", + "主流声学模型发展的脉络:\n", + "- 自回归模型:\n", + " - Tacotron\n", + " - Tacotron2\n", + " - Transformer TTS\n", + "- 非自回归模型:\n", + " - FastSpeech\n", + " - SpeedySpeech\n", + " - FastPitch\n", + " - FastSpeech2\n", + " - ...\n", + " \n", + "

\n", + "在本教程中,我们使用 `FastSpeech2` 作为声学模型。\n", + "![FastSpeech2](source/fastspeech2.png)\n", + "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似)。\n", + "![FastPitch](source/fastpitch.png)\n", + "更多关于声学模型的发展及改进的介绍: https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 初始化声学模型 FastSpeech2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "with open(phones_dict, \"r\") as f:\n", + " phn_id = [line.strip().split() for line in f.readlines()]\n", + "vocab_size = len(phn_id)\n", + "print(\"vocab_size:\", vocab_size)\n", + "odim = fastspeech2_config.n_mels\n", + "model = FastSpeech2(\n", + " idim=vocab_size, odim=odim, **fastspeech2_config[\"model\"])\n", + "# 预训练好的参数赋值给模型\n", + "model.set_state_dict(paddle.load(fastspeech2_checkpoint)[\"main_params\"])\n", + "# 推理阶段不启用 batch norm 和 dropout\n", + "model.eval()\n", + "# 读取数据预处理阶段数据集的均值和标准差\n", + "stat = np.load(fastspeech2_stat)\n", + "mu, std = stat\n", + "mu = paddle.to_tensor(mu)\n", + "std = paddle.to_tensor(std)\n", + "fastspeech2_normalizer = ZScore(mu, std)\n", + "# 构造包含 normalize 的新模型\n", + "fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)\n", + "fastspeech2_inference.eval()\n", + "print(\"FastSpeech2 done!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 调用声学模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "with paddle.no_grad():\n", + " mel = fastspeech2_inference(phone_ids)\n", + "print(\"shepe of mel (n_frames x n_mels):\")\n", + "print(mel.shape)\n", + "# 绘制声学模型输出的 mel 频谱\n", + "fig, ax = plt.subplots(figsize=(9, 6))\n", + "im = ax.imshow(mel.T, aspect='auto',origin='lower')\n", + "fig.colorbar(im, ax=ax)\n", + "plt.title('Mel Spectrogram')\n", + "plt.xlabel('Time')\n", + "plt.ylabel('Frequency')\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step3 声码器\n", + "

\n", + "声码器将声学特征转换为波形。声码器需要解决的是 “信息缺失的补全问题”。信息缺失是指,在音频波形转换为频谱图的时候,存在**相位信息**的缺失,在频谱图转换为 mel 频谱图的时候,存在**频域压缩**导致的信息缺失;假设音频的采样率是16kHZ, 一帧的音频有 10ms,也就是说,1s 的音频有 16000 个采样点,而 1s 中包含 100 帧,每一帧有 160 个采样点,声码器的作用就是将一个频谱帧变成音频波形的 160 个采样点,所以声码器中一般会包含**上采样**模块。\n", + " \n", + "

\n", + "与声学模型类似,声码器也分为自回归模型和非自回归模型, 更细致的分类如下:\n", + "\n", + "- Autoregression\n", + " - WaveNet\n", + " - WaveRNN\n", + " - LPCNet\n", + "- Flow\n", + " - WaveFlow\n", + " - WaveGlow\n", + " - FloWaveNet\n", + " - Parallel WaveNet\n", + "- GAN\n", + " - WaveGAN\n", + " - arallel WaveGAN\n", + " - MelGAN\n", + " - HiFi-GAN\n", + "- VAE\n", + " - Wave-VAE\n", + "- Diffusion\n", + " - WaveGrad\n", + " - DiffWave\n", + "\n", + "

\n", + "PaddleSpeech TTS 主要实现了百度的 `WaveFlow` 和一些主流的 GAN Vocoder, 在本教程中,我们使用 `Parallel WaveGAN` 作为声码器。\n", + "\n", + "

\n", + " \n", + "\n", + "

\n", + "各 GAN Vocoder 的生成器和判别器的 Loss 的区别如下表格所示:\n", + " \n", + "Model | Generator Loss |Discriminator Loss\n", + ":-------------:| :------------:| :-----\n", + "Parallel Wave GAN| adversial loss
Feature Matching | Multi-Scale Discriminator |\n", + "Mel GAN |adversial loss
Multi-resolution STFT loss | adversial loss|\n", + "Multi-Band Mel GAN | adversial loss
full band Multi-resolution STFT loss
sub band Multi-resolution STFT loss |Multi-Scale Discriminator|\n", + "HiFi GAN |adversial loss
Feature Matching
Mel-Spectrogram Loss | Multi-Scale Discriminator
Multi-Period Discriminator|\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 初始化声码器 Parallel WaveGAN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "vocoder = PWGGenerator(**pwg_config[\"generator_params\"])\n", + "# 预训练好的参数赋值给模型\n", + "vocoder.set_state_dict(paddle.load(pwg_checkpoint)[\"generator_params\"])\n", + "vocoder.remove_weight_norm()\n", + "# 推理阶段不启用 batch norm 和 dropout\n", + "vocoder.eval()\n", + "# 读取数据预处理阶段数据集的均值和标准差\n", + "stat = np.load(pwg_stat)\n", + "mu, std = stat\n", + "mu = paddle.to_tensor(mu)\n", + "std = paddle.to_tensor(std)\n", + "pwg_normalizer = ZScore(mu, std)\n", + "# 构造包含 normalize 的新模型\n", + "pwg_inference = PWGInference(pwg_normalizer, vocoder)\n", + "pwg_inference.eval()\n", + "print(\"Parallel WaveGAN done!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 调用声码器" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "with paddle.no_grad():\n", + " wav = pwg_inference(mel)\n", + "print(\"shepe of wav (time x n_channels):\")\n", + "print(wav.shape)\n", + "# 绘制声码器输出的波形图\n", + "wave_data = wav.numpy().T\n", + "time = np.arange(0, wave_data.shape[1]) * (1.0 / fastspeech2_config.fs)\n", + "fig, ax = plt.subplots(figsize=(9, 6))\n", + "plt.plot(time, wave_data[0])\n", + "plt.title('Waveform')\n", + "plt.xlabel('Time (seconds)')\n", + "plt.ylabel('Amplitude (normed)')\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 播放音频" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "dp.Audio(wav.numpy().T, rate=fastspeech2_config.fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 保存音频" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!mkdir output\n", + "sf.write(\n", + " \"output/output.wav\",\n", + " wav.numpy(),\n", + " samplerate=fastspeech2_config.fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step4 FastSpeech2 进阶 —— 个性化调节\n", + "

\n", + "FastSpeech2 模型可以个性化地调节音素时长、音调和能量,通过一些简单的调节就可以获得一些有意思的效果" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 不要听信别人的谗言,我不是什么克隆人。\n", + "print(\"原始音频\")\n", + "dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_001.wav\"))\n", + "print(\"speed x 1.2\")\n", + "dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_001.wav\"))\n", + "print(\"speed x 0.8\")\n", + "dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_001.wav\"))\n", + "print(\"pitch x 1.3(童声)\")\n", + "dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/001.wav\"))\n", + "print(\"robot\")\n", + "dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/001.wav\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "具体实现代码请参考: https://github.com/DeepSpeech/demos/style_fs2/run.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

\n", + "# 用 PaddleSpeech 训练 TTS 模型\n", + "

\n", + "PaddleSpeech 的 examples 是按照 数据集/模型 的结构安排的:\n", + "```text\n", + "examples \n", + "|-- aishell3\n", + "| |-- README.md\n", + "| |-- tts3\n", + "| `-- vc0\n", + "|-- csmsc\n", + "| |-- README.md\n", + "| |-- tts2\n", + "| |-- tts3\n", + "| |-- voc1\n", + "| `-- voc3\n", + "```\n", + "我们在每个数据集的 README.md 介绍了子目录和模型的对应关系, 在 TTS 中有如下对应关系:\n", + "```text\n", + "tts0 - Tactron2\n", + "tts1 - TransformerTTS\n", + "tts2 - SpeedySpeech\n", + "tts3 - FastSpeech2\n", + "voc0 - WaveFlow\n", + "voc1 - Parallel WaveGAN\n", + "voc2 - MelGAN\n", + "voc3 - MultiBand MelGAN\n", + "```\n", + "

\n", + "## 基于 CSMCS 数据集训练 FastSpeech2 模型\n", + "```bash\n", + "git clone https://github.com/PaddlePaddle/PaddleSpeech.git\n", + "cd examples/csmsc/tts\n", + "```\n", + "根据 README.md, 下载 CSMCS 数据集和其对应的强制对齐文件, 并放置在对应的位置\n", + "```bash\n", + "./run.sh\n", + "```\n", + "`run.sh` 中包含预处理、训练、合成、静态图推理等步骤:\n", + "\n", + "```bash\n", + "#!/bin/bash\n", + "set -e\n", + "source path.sh\n", + "gpus=0,1\n", + "stage=0\n", + "stop_stage=100\n", + "conf_path=conf/default.yaml\n", + "train_output_path=exp/default\n", + "ckpt_name=snapshot_iter_153.pdz\n", + "\n", + "# with the following command, you can choice the stage range you want to run\n", + "# such as `./run.sh --stage 0 --stop-stage 0`\n", + "# this can not be mixed use with `$1`, `$2` ...\n", + "source ${MAIN_ROOT}/utils/parse_options.sh || exit 1\n", + "\n", + "if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then\n", + " # prepare data\n", + " bash ./local/preprocess.sh ${conf_path} || exit -1\n", + "fi\n", + "if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then\n", + " # train model, all `ckpt` under `train_output_path/checkpoints/` dir\n", + " CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1\n", + "fi\n", + "if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then\n", + " # synthesize, vocoder is pwgan\n", + " CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1\n", + "fi\n", + "if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then\n", + " # synthesize_e2e, vocoder is pwgan\n", + " CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1\n", + "fi\n", + "if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then\n", + " # inference with static model\n", + " CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1\n", + "fi\n", + "```\n", + "

\n", + "## 基于 CSMCS 数据集训练 Parallel WaveGAN 模型\n", + "```bash\n", + "git clone https://github.com/PaddlePaddle/PaddleSpeech.git\n", + "cd examples/csmsc/voc1\n", + "```\n", + "根据 README.md, 下载 CSMCS 数据集和其对应的强制对齐文件, 并放置在对应的位置\n", + "```bash\n", + "./run.sh\n", + "```\n", + "`run.sh` 中包含预处理、训练、合成等步骤:\n", + "```bash\n", + "#!/bin/bash\n", + "set -e\n", + "source path.sh\n", + "gpus=0,1\n", + "stage=0\n", + "stop_stage=100\n", + "conf_path=conf/default.yaml\n", + "train_output_path=exp/default\n", + "ckpt_name=snapshot_iter_5000.pdz\n", + "\n", + "# with the following command, you can choice the stage range you want to run\n", + "# such as `./run.sh --stage 0 --stop-stage 0`\n", + "# this can not be mixed use with `$1`, `$2` ...\n", + "source ${MAIN_ROOT}/utils/parse_options.sh || exit 1\n", + "\n", + "if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then\n", + " # prepare data\n", + " ./local/preprocess.sh ${conf_path} || exit -1\n", + "fi\n", + "if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then\n", + " # train model, all `ckpt` under `train_output_path/checkpoints/` dir\n", + " CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1\n", + "fi\n", + "if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then\n", + " # synthesize\n", + " CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1\n", + "fi\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FAQ\n", + "\n", + "- 需要注意的问题\n", + "- 经验与分享\n", + "- 用户的其他问题" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 作业\n", + "在 CSMSC 数据集上利用 FastSpeech2 和 Parallel WaveGAN 实现一个中文 TTS 系统" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 关注 PaddleSpeech\n", + "https://github.com/PaddlePaddle/PaddleSpeech/" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7.0 64-bit ('yt_py37_develop': venv)", + "language": "python", + "name": "python37064bitytpy37developvenv88cd689abeac41d886f9210a708a170b" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "263.594px" + }, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml index c15e71a3..29ec2379 100644 --- a/examples/aishell/s0/conf/deepspeech2_online.yaml +++ b/examples/aishell/s0/conf/deepspeech2_online.yaml @@ -46,10 +46,10 @@ model: ctc_grad_norm_type: null training: - n_epoch: 50 + n_epoch: 65 accum_grad: 1 - lr: 2e-3 - lr_decay: 0.9 # 0.83 + lr: 5e-4 + lr_decay: 0.93 weight_decay: 1e-06 global_grad_clip: 3.0 log_interval: 100 @@ -63,7 +63,7 @@ decoding: decoding_method: ctc_beam_search lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm alpha: 2.2 #1.9 - beta: 5.0 + beta: 4.3 beam_size: 300 cutoff_prob: 0.99 cutoff_top_n: 40 diff --git a/examples/aishell/s0/local/download_lm_ch.sh 
b/examples/aishell/s0/local/download_lm_ch.sh index ac27a907..7d20297f 100755 --- a/examples/aishell/s0/local/download_lm_ch.sh +++ b/examples/aishell/s0/local/download_lm_ch.sh @@ -9,12 +9,13 @@ URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm' MD5="29e02312deb2e59b3c8686c7966d4fe3" TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm - -echo "Download language model ..." -download $URL $MD5 $TARGET +echo "Start downloading the language model. The language model is large, please wait for a moment ..." +download $URL $MD5 $TARGET > /dev/null 2>&1 if [ $? -ne 0 ]; then echo "Fail to download the language model!" exit 1 +else + echo "Download the language model sucessfully" fi diff --git a/examples/aishell/s0/local/test.sh b/examples/aishell/s0/local/test.sh index d539ac49..2ae0740b 100755 --- a/examples/aishell/s0/local/test.sh +++ b/examples/aishell/s0/local/test.sh @@ -13,7 +13,7 @@ ckpt_prefix=$2 model_type=$3 # download language model -bash local/download_lm_ch.sh > /dev/null 2>&1 +bash local/download_lm_ch.sh if [ $? -ne 0 ]; then exit 1 fi diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index 83846ada..59b7bfe3 100755 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ -5,9 +5,9 @@ source path.sh gpus=0,1,2,3 stage=0 stop_stage=100 -conf_path=conf/deepspeech2.yaml +conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml avg_num=1 -model_type=offline +model_type=offline # offline or online source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh index 94c2c4df..0b40e064 100644 --- a/examples/aishell/s1/run.sh +++ b/examples/aishell/s1/run.sh @@ -2,6 +2,7 @@ source path.sh set -e +gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/conformer.yaml @@ -22,7 +23,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -40,18 +41,19 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi + +# Optionally, you can add LM and test it with runtime. +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi - # Optionally, you can add LM and test it with runtime. - if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + echo "warning: deps on kaldi and srilm, please make sure installed." 
# train lm and build TLG ./local/tlg.sh --corpus aishell --lmtype srilm - fi - -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index c313d922..82b69ad8 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA result of AISHELL-3 and Extract it We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. -You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. @@ -67,8 +67,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -81,8 +81,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -92,23 +91,22 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. -7. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. +6. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2. ### Synthesize -We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. 
+We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it. ```bash -unzip pwg_baker_ckpt_0.4.zip +unzip pwg_aishell3_ckpt_0.5.zip ``` Parallel WaveGAN checkpoint contains files listed below. ```text -pwg_baker_ckpt_0.4 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +pwg_aishell3_ckpt_0.5 +├── default.yaml # default config used to train parallel wavegan +├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan ``` `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash @@ -122,7 +120,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -149,8 +147,8 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --verbose VERBOSE verbose. + --ngpu NGPU if ngpu == 0, use cpu. + --verbose VERBOSE verbose ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file. ```bash @@ -166,7 +164,7 @@ usage: multi_spk_synthesize_e2e.py [-h] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--text TEXT] - [--output-dir OUTPUT_DIR] [--device DEVICE] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -193,7 +191,7 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` 1. `--fastspeech2-config`, `--fastspeech2-checkpoint`, `--fastspeech2-stat`, `--phones-dict` and `--speaker-dict` are arguments for fastspeech2, which correspond to the 5 files in the fastspeech2 pretrained model. @@ -201,7 +199,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) @@ -226,15 +224,12 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ --fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \ --fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \ --fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \ - --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \ + --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \ --speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt ``` -## Future work -A multi-speaker vocoder is needed. diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 90816e7d..1dd782db 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction. # DATA SETTING # ########################################################### batch_size: 64 -num_workers: 4 +num_workers: 2 ########################################################### diff --git a/examples/aishell3/tts3/local/synthesize.sh b/examples/aishell3/tts3/local/synthesize.sh index 64361983..db079410 100755 --- a/examples/aishell3/tts3/local/synthesize.sh +++ b/examples/aishell3/tts3/local/synthesize.sh @@ -10,11 +10,10 @@ python3 ${BIN_DIR}/synthesize.py \ --fastspeech2-config=${config_path} \ --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ - --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \ + --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh index 8a979844..1a6e47e7 100755 --- a/examples/aishell3/tts3/local/synthesize_e2e.sh +++ b/examples/aishell3/tts3/local/synthesize_e2e.sh @@ -10,11 +10,10 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ --fastspeech2-config=${config_path} \ --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ - --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \ + --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git 
a/examples/aishell3/tts3/local/train.sh b/examples/aishell3/tts3/local/train.sh index be6051c9..1da72f11 100755 --- a/examples/aishell3/tts3/local/train.sh +++ b/examples/aishell3/tts3/local/train.sh @@ -8,6 +8,6 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=2 \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh index 65671076..95e4d38f 100755 --- a/examples/aishell3/tts3/run.sh +++ b/examples/aishell3/tts3/run.sh @@ -7,7 +7,6 @@ gpus=0,1 stage=0 stop_stage=100 - conf_path=conf/default.yaml train_output_path=exp/default ckpt_name=snapshot_iter_482.pdz diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 9a269ed5..28fea629 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -1,8 +1,8 @@ # Tacotron2 + AISHELL-3 Voice Cloning This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) . The general steps are as follows: -1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e). +1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). 2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each sentence in AISHELL-3. This embedding is a extra input of Tacotron2 which will be concated with encoder outputs. -3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0). +3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0). ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. @@ -28,7 +28,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${BIN_DIR}/../ge2e/inference.py \ --input=${input} \ --output=${preprocess_path}/embed \ - --device="gpu" \ + --ngpu=1 \ --checkpoint_path=${ge2e_ckpt_path} fi ``` @@ -39,9 +39,9 @@ There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is ve We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$` and `%`) need to be removed. You shoud preprocess the dataset into the format which MFA needs, the texts have the same name with wavs and have the suffix `.lab`. -We use [lexicon.txt](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. 
+We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. -You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. ```bash if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh index 87cfab32..5bf88066 100755 --- a/examples/aishell3/vc0/local/preprocess.sh +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -9,10 +9,9 @@ alignment=$3 ge2e_ckpt_path=$4 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${BIN_DIR}/../../ge2e/inference.py \ - --input=${input} \ + python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \ + --input=${input}/wav \ --output=${preprocess_path}/embed \ - --device="gpu" \ --checkpoint_path=${ge2e_ckpt_path} fi diff --git a/examples/aishell3/vc0/local/train.sh b/examples/aishell3/vc0/local/train.sh index eb968b5f..f062cbbf 100755 --- a/examples/aishell3/vc0/local/train.sh +++ b/examples/aishell3/vc0/local/train.sh @@ -6,4 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device="gpu" \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md new file mode 100644 index 00000000..8c0aec3a --- /dev/null +++ b/examples/aishell3/vc1/README.md @@ -0,0 +1,89 @@ +# FastSpeech2 + AISHELL-3 Voice Cloning +This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) . The general steps are as follows: +1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). +2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each sentence in AISHELL-3. This embedding is a extra input of Tacotron2 which will be concated with encoder outputs. +3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0). + +## Get Started +Assume the path to the dataset is `~/datasets/data_aishell3`. +Assume the path to the MFA result of AISHELL-3 is `./alignment`. +Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000` +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. 
start a voice cloning inference. +```bash +./run.sh +``` +### Preprocess the dataset +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} +``` +#### generate utterance embedding + Use pretrained GE2E (speaker encoder) to generate utterance embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`. + +```bash +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../ge2e/inference.py \ + --input=${input} \ + --output=${preprocess_path}/embed \ + --ngpu=1 \ + --checkpoint_path=${ge2e_ckpt_path} +fi +``` + +The computing time of utterance embedding can be x hours. +#### process wav +There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is very small, so, we need to remove the silence and normalize the audio. You can the silence remove method based on volume or energy, but the effect is not very good, We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence. + +We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$` and `%`) need to be removed. You shoud preprocess the dataset into the format which MFA needs, the texts have the same name with wavs and have the suffix `.lab`. + +We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. + +You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. + +```bash +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "Process wav ..." + python3 ${BIN_DIR}/process_wav.py \ + --input=${input}/wav \ + --output=${preprocess_path}/normalized_wav \ + --alignment=${alignment} +fi +``` + +#### preprocess transcription +We revert the transcription into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA, we separated the tones. This is a processing method, of course, you can only segment initials and vowels. + +```bash +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/preprocess_transcription.py \ + --input=${input} \ + --output=${preprocess_path} +fi +``` +The default input is `~/datasets/data_aishell3/train`,which contains `label_train-set.txt`, the processed results are `metadata.yaml` and `metadata.pickle`. the former is a text format for easy viewing, and the latter is a binary format for direct reading. +#### extract mel +```python +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/extract_mel.py \ + --input=${preprocess_path}/normalized_wav \ + --output=${preprocess_path}/mel +fi +``` + +### Train the model +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} +``` + +Our model remve stop token prediction in Tacotron2, because of the problem of extremely unbalanced proportion of positive and negative samples of stop token prediction, and it's very sensitive to the clip of audio silence. 
We use the last symbol from the highest point of attention to the encoder side as the termination condition. + +In addition, in order to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster. +### Infernece +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} +``` +## Pretrained Model +[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip). diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml new file mode 100644 index 00000000..bdd2a765 --- /dev/null +++ b/examples/aishell3/vc1/conf/default.yaml @@ -0,0 +1,105 @@ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size. +n_shift: 300 # Hop size. +win_length: 1200 # Window length. + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +# Only used for the model using pitch features (e.g. FastSpeech2) +f0min: 80 # Maximum f0 for pitch extraction. +f0max: 400 # Minimum f0 for pitch extraction. + + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 64 +num_workers: 2 + + +########################################################### +# MODEL SETTING # +########################################################### +model: + adim: 384 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1536 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1536 # number of decoder ff units + positionwise_layer_type: conv1d # type of position-wise layer + positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer + duration_predictor_layers: 2 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + postnet_layers: 5 # number of layers of postnset + postnet_filts: 5 # filter size of conv layers in postnet + postnet_chans: 256 # number of channels of conv layers in postnet + use_masking: True # whether to apply masking for padded part in loss calculation + use_scaled_pos_enc: True # whether to use scaled positional encoding + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + init_type: xavier_uniform # initialization type + init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding + init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding + transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer + transformer_dec_dropout_rate: 0.2 # dropout rate for transformer 
decoder layer + transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding + transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + spk_embed_dim: 256 # speaker embedding dimension + spk_embed_integration_type: concat # speaker embedding integration type + + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 200 +num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/aishell3/vc1/local/preprocess.sh b/examples/aishell3/vc1/local/preprocess.sh new file mode 100755 index 00000000..5f939a1a --- /dev/null +++ b/examples/aishell3/vc1/local/preprocess.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 +ge2e_ckpt_path=$2 + +# gen speaker embedding +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \ + --input=~/datasets/data_aishell3/train/wav/ \ + --output=dump/embed \ + --checkpoint_path=${ge2e_ckpt_path} +fi + +# copy from tts3/preprocess +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./aishell3_alignment_tone \ + --output durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # extract features + echo "Extract features ..." 
+ python3 ${BIN_DIR}/preprocess.py \ + --dataset=aishell3 \ + --rootdir=~/datasets/data_aishell3/ \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --num-cpu=20 \ + --cut-sil=True \ + --spk_emb_dir=dump/embed +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="speech" + + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="pitch" + + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="energy" +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # normalize and covert phone/speaker to id, dev and test should use train's stats + echo "Normalize ..." + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --pitch-stats=dump/train/pitch_stats.npy \ + --energy-stats=dump/train/energy_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --pitch-stats=dump/train/pitch_stats.npy \ + --energy-stats=dump/train/energy_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --pitch-stats=dump/train/pitch_stats.npy \ + --energy-stats=dump/train/energy_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt +fi diff --git a/examples/aishell3/vc1/local/synthesize.sh b/examples/aishell3/vc1/local/synthesize.sh new file mode 100755 index 00000000..35478c78 --- /dev/null +++ b/examples/aishell3/vc1/local/synthesize.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --fastspeech2-stat=dump/train/speech_stats.npy \ + --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \ + --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --phones-dict=dump/phone_id_map.txt \ + --voice-cloning=True diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh new file mode 100755 index 00000000..c775fcad --- /dev/null +++ b/examples/aishell3/vc1/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=2 \ + --phones-dict=dump/phone_id_map.txt \ + --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc1/local/voice_cloning.sh b/examples/aishell3/vc1/local/voice_cloning.sh new file mode 100755 index 00000000..55bdd761 --- /dev/null +++ 
b/examples/aishell3/vc1/local/voice_cloning.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +ge2e_params_path=$4 +ref_audio_dir=$5 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/voice_cloning.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --fastspeech2-stat=dump/train/speech_stats.npy \ + --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \ + --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ + --ge2e_params_path=${ge2e_params_path} \ + --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \ + --input-dir=${ref_audio_dir} \ + --output-dir=${train_output_path}/vc_syn \ + --phones-dict=dump/phone_id_map.txt diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh new file mode 100755 index 00000000..fb7e8411 --- /dev/null +++ b/examples/aishell3/vc1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/run.sh b/examples/aishell3/vc1/run.sh new file mode 100755 index 00000000..4eae1bdd --- /dev/null +++ b/examples/aishell3/vc1/run.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_482.pdz +ref_audio_dir=ref_audio + +# not include ".pdparams" here +ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000 + +# include ".pdparams" here +ge2e_params_path=${ge2e_ckpt_path}.pdparams + +# with the following command, you can choice the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} || exit -1 +fi diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md new file mode 100644 index 00000000..d9e8ce59 --- /dev/null +++ b/examples/aishell3/voc1/README.md @@ -0,0 +1,146 @@ +# Parallel WaveGAN with AISHELL-3 +This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). 
+ +AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus which could be used to train multi-speaker Text-to-Speech (TTS) systems. +## Dataset +### Download and Extract the datasaet +Download AISHELL-3. +```bash +wget https://www.openslr.org/resources/93/data_aishell3.tgz +``` +Extract AISHELL-3. +```bash +mkdir data_aishell3 +tar zxvf data_aishell3.tgz -C data_aishell3 +``` +### Get MFA result of AISHELL-3 and Extract it +We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/data_aishell3`. +Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +### Preprocess the dataset +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` + +The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains log magnitude of mel spectrogram of each utterances, while the norm folder contains normalized spectrogram. The statistics used to normalize the spectrogram is computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance. + +### Train the model +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] + +Train a ParallelWaveGAN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --verbose VERBOSE verbose. + +benchmark: + arguments related to benchmark. + + --batch-size BATCH_SIZE + batch size. + --max-iter MAX_ITER train max steps. + --run-benchmark RUN_BENCHMARK + runing benchmark or not, if True, use the --batch-size + and --max-iter. + --profiler_options PROFILER_OPTIONS + The option of profiler, which should be in format + "key1=value1;key2=value2;key3=value3". +``` + +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. 
`--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesize +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] + [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--verbose VERBOSE] + +Synthesize with parallel wavegan. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG parallel wavegan config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --verbose VERBOSE verbose. +``` + +1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `snapshot_iter_1000000.pdz `. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Models +Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip). + +Parallel WaveGAN checkpoint contains files listed below. + +```text +pwg_aishell3_ckpt_0.5 +├── default.yaml # default config used to train parallel wavegan +├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan +``` +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml new file mode 100644 index 00000000..ba2d9f2e --- /dev/null +++ b/examples/aishell3/voc1/conf/default.yaml @@ -0,0 +1,115 @@ +# This is the hyperparameter configuration file for Parallel WaveGAN. +# Please make sure this is adjusted for the VCTK corpus. If you want to +# apply to the other dataset, you might need to carefully change some parameters. +# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size. (in samples) +n_shift: 300 # Hop size. (in samples) +win_length: 1200 # Window length. (in samples) + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. 
(Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Kernel size of dilated convolution. + layers: 30 # Number of residual block layers. + stacks: 3 # Number of stacks i.e., dilation cycles. + residual_channels: 64 # Number of channels in residual conv. + gate_channels: 128 # Number of channels in gated conv. + skip_channels: 64 # Number of channels in skip conv. + aux_channels: 80 # Number of channels for auxiliary feature conv. + # Must be the same as num_mels. + aux_context_window: 2 # Context window size for auxiliary feature. + # If set to 2, previous 2 and future 2 frames will be considered. + dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. + use_weight_norm: true # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Number of output channels. + layers: 10 # Number of conv layers. + conv_channels: 64 # Number of chnn layers. + bias: true # Whether to use bias parameter in conv. + use_weight_norm: true # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. + nonlinear_activation_params: # Nonlinear function parameters + negative_slope: 0.2 # Alpha in LeakyReLU. + +########################################################### +# STFT LOSS SETTING # +########################################################### +stft_loss_params: + fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. + hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss + win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_adv: 4.0 # Loss balancing coefficient. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 8 # Batch size. +batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by hop_size. +pin_memory: true # Whether to pin memory in Pytorch DataLoader. +num_workers: 4 # Number of workers in Pytorch DataLoader. +remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. +allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-6 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 0.0001 # Generator's learning rate. 
+ step_size: 200000 # Generator's scheduler step size. + gamma: 0.5 # Generator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +generator_grad_norm: 10 # Generator's gradient norm. +discriminator_optimizer_params: + epsilon: 1.0e-6 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 0.00005 # Discriminator's learning rate. + step_size: 200000 # Discriminator's scheduler step size. + gamma: 0.5 # Discriminator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +discriminator_grad_norm: 1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. +train_max_steps: 1000000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh new file mode 100755 index 00000000..44cc3dbe --- /dev/null +++ b/examples/aishell3/voc1/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./aishell3_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/data_aishell3/ \ + --dataset=aishell3 \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." 
+
+    python3 ${BIN_DIR}/../normalize.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --dumpdir=dump/train/norm \
+        --stats=dump/train/feats_stats.npy
+    python3 ${BIN_DIR}/../normalize.py \
+        --metadata=dump/dev/raw/metadata.jsonl \
+        --dumpdir=dump/dev/norm \
+        --stats=dump/train/feats_stats.npy
+
+    python3 ${BIN_DIR}/../normalize.py \
+        --metadata=dump/test/raw/metadata.jsonl \
+        --dumpdir=dump/test/norm \
+        --stats=dump/train/feats_stats.npy
+fi
diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh
new file mode 100755
index 00000000..9f904ac0
--- /dev/null
+++ b/examples/aishell3/voc1/local/synthesize.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/synthesize.py \
+    --config=${config_path} \
+    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
+    --test-metadata=dump/test/norm/metadata.jsonl \
+    --output-dir=${train_output_path}/test
diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh
new file mode 100755
index 00000000..9695631e
--- /dev/null
+++ b/examples/aishell3/voc1/local/train.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+FLAGS_cudnn_exhaustive_search=true \
+FLAGS_conv_workspace_size_limit=4000 \
+python ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --ngpu=1
diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh
new file mode 100755
index 00000000..1e6647b8
--- /dev/null
+++ b/examples/aishell3/voc1/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=parallelwave_gan
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
\ No newline at end of file
diff --git a/examples/aishell3/voc1/run.sh b/examples/aishell3/voc1/run.sh
new file mode 100755
index 00000000..7d0fdb21
--- /dev/null
+++ b/examples/aishell3/voc1/run.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_5000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh index 305021f1..e9be3d03 100644 --- a/examples/callcenter/asr1/run.sh +++ b/examples/callcenter/asr1/run.sh @@ -2,6 +2,7 @@ set -e source path.sh +gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/conformer.yaml @@ -20,7 +21,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -30,7 +31,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=4 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index de9e488c..2088ed15 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -1,5 +1,5 @@ -# Speedyspeech with CSMSC -This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner). +# SpeedySpeech with CSMSC +This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner). ## Dataset ### Download and Extract the datasaet @@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind ### Get MFA result of CSMSC and Extract it We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. 
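As a quick reference, fetching and unpacking the alignment archive linked above could look like the sketch below; the URL is taken from the README, while the resulting `baker_alignment_tone/` directory name is an assumption about how the tarball unpacks.

```bash
# Grab the pre-computed MFA alignments and unpack them beside the recipe,
# so the preprocessing stage can point at the extracted TextGrid directory.
wget https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz
tar zxvf baker_alignment_tone.tar.gz
```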
## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -55,10 +55,10 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] - [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--use-relative-path USE_RELATIVE_PATH] - [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--verbose VERBOSE] + [--use-relative-path USE_RELATIVE_PATH] + [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] Train a Speedyspeech model with sigle speaker dataset. @@ -71,8 +71,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. --use-relative-path USE_RELATIVE_PATH whether use relative path in metadata @@ -85,13 +84,12 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. -7. `--tones-dict` is the path of the tone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. +6. `--tones-dict` is the path of the tone vocabulary file. ### Synthesize -We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. ```bash unzip pwg_baker_ckpt_0.4.zip @@ -115,7 +113,7 @@ usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--inference-dir INFERENCE_DIR] [--device DEVICE] + [--inference-dir INFERENCE_DIR] [--ngpu NGPU] [--verbose VERBOSE] Synthesize with speedyspeech & parallel wavegan. @@ -145,7 +143,7 @@ optional arguments: output dir --inference-dir INFERENCE_DIR dir to save inference models - --device DEVICE device type to use + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. 
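A recurring change in these README hunks is that the old `--device`/`--nprocs` pair is folded into a single `--ngpu` flag. A minimal sketch of what that means at the command line (device selection only; `<other args>` stands for the recipe arguments left unchanged):

```bash
# Train on two GPUs: list them and pass --ngpu=2
# (this replaces the old --device=gpu --nprocs=2 combination).
CUDA_VISIBLE_DEVICES=0,1 python3 ${BIN_DIR}/train.py --ngpu=2 <other args>

# CPU-only run: --ngpu=0 takes the place of --device=cpu.
python3 ${BIN_DIR}/train.py --ngpu=0 <other args>
```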
@@ -161,8 +159,8 @@ usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] [--pwg-stat PWG_STAT] [--text TEXT] [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] [--output-dir OUTPUT_DIR] - [--inference-dir INFERENCE_DIR] [--device DEVICE] - [--verbose VERBOSE] + [--inference-dir INFERENCE_DIR] [--verbose VERBOSE] + [--ngpu NGPU] Synthesize with speedyspeech & parallel wavegan. @@ -190,15 +188,15 @@ optional arguments: output dir --inference-dir INFERENCE_DIR dir to save inference models - --device DEVICE device type to use --verbose VERBOSE verbose + --ngpu NGPU if ngpu == 0, use cpu. ``` 1. `--speedyspeech-config`, `--speedyspeech-checkpoint`, `--speedyspeech-stat` are arguments for speedyspeech, which correspond to the 3 files in the speedyspeech pretrained model. 2. `--pwg-config`, `--pwg-checkpoint`, `--pwg-stat` are arguments for parallel wavegan, which correspond to the 3 files in the parallel wavegan pretrained model. 3. `--text` is the text file, which contains sentences to synthesize. 4. `--output-dir` is the directory to save synthesized audio files. 5. `--inference-dir` is the directory to save exported model, which can be used with paddle infernece. -6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 7. `--phones-dict` is the path of the phone vocabulary file. 8. `--tones-dict` is the path of the tone vocabulary file. @@ -211,6 +209,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ## Pretrained Model Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip) +Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip). SpeedySpeech checkpoint contains files listed below. 
```text @@ -237,7 +236,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --inference-dir=exp/default/inference \ - --device="gpu" \ --phones-dict=speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt \ --tones-dict=speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt ``` diff --git a/examples/csmsc/tts2/local/synthesize.sh b/examples/csmsc/tts2/local/synthesize.sh index 418ee02e..8be02dfb 100755 --- a/examples/csmsc/tts2/local/synthesize.sh +++ b/examples/csmsc/tts2/local/synthesize.sh @@ -16,5 +16,4 @@ python3 ${BIN_DIR}/synthesize.py \ --output-dir=${train_output_path}/test \ --inference-dir=${train_output_path}/inference \ --phones-dict=dump/phone_id_map.txt \ - --tones-dict=dump/tone_id_map.txt \ - --device="gpu" + --tones-dict=dump/tone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index c50fa776..3cbc7936 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -16,6 +16,5 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=${train_output_path}/test_e2e \ --inference-dir=${train_output_path}/inference \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt diff --git a/examples/csmsc/tts2/local/train.sh b/examples/csmsc/tts2/local/train.sh index e44c7da5..f0a5a683 100755 --- a/examples/csmsc/tts2/local/train.sh +++ b/examples/csmsc/tts2/local/train.sh @@ -9,7 +9,7 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ --use-relative-path=True diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 7eeb14fc..6e4701df 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind ### Get MFA result of CSMSC and Extract it We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -59,8 +59,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -73,8 +73,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. 
+ --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -84,12 +83,11 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. ### Synthesize -We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. ```bash unzip pwg_baker_ckpt_0.4.zip @@ -113,7 +111,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -140,7 +138,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. @@ -155,7 +153,8 @@ usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--inference-dir INFERENCE_DIR] [--ngpu NGPU] + [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -179,7 +178,9 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --inference-dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -188,7 +189,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Inference After Synthesize, we will get static models of fastspeech2 and pwgan in `${train_output_path}/inference`. 
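For reference, a minimal sketch of loading one of those exported static models with the Paddle Inference Python API is shown below; it is not part of this patch, and the file names under `${train_output_path}/inference` as well as the input shape and dtype are assumptions for illustration only.

```python
import numpy as np
from paddle.inference import Config, create_predictor

# Point Config at the exported program/parameter files; the exact file names
# under ${train_output_path}/inference are assumed here.
config = Config("inference/fastspeech2_csmsc.pdmodel",
                "inference/fastspeech2_csmsc.pdiparams")
config.disable_gpu()  # CPU inference; config.enable_use_gpu(1000, 0) would use GPU 0

predictor = create_predictor(config)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])

# Dummy phone-id input; the expected shape/dtype are assumptions, and real ids
# come from the recipe's phone_id_map.txt.
phone_ids = np.array([1, 2, 3, 4, 5], dtype="int64")
input_handle.copy_from_cpu(phone_ids)

predictor.run()
mel = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()
print("generated mel shape:", mel.shape)
```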
@@ -199,6 +200,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) +Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip) FastSpeech2 checkpoint contains files listed below. ```text @@ -224,6 +226,5 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --inference-dir=exp/default/inference \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt ``` diff --git a/examples/csmsc/tts3/local/synthesize.sh b/examples/csmsc/tts3/local/synthesize.sh index 724afb04..e525fc16 100755 --- a/examples/csmsc/tts3/local/synthesize.sh +++ b/examples/csmsc/tts3/local/synthesize.sh @@ -15,5 +15,4 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index b6542743..cc27ffb6 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -16,5 +16,4 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=${train_output_path}/test_e2e \ --inference-dir=${train_output_path}/inference \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts3/local/train.sh b/examples/csmsc/tts3/local/train.sh index fbbc9a9d..f90db915 100755 --- a/examples/csmsc/tts3/local/train.sh +++ b/examples/csmsc/tts3/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 4b6b6c42..f789cba0 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index ### Get MFA results for silence trim We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -53,9 +53,8 @@ Here's the complete help message. 
```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -69,8 +68,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. benchmark: @@ -90,8 +88,7 @@ benchmark: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. @@ -101,7 +98,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -114,7 +111,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -122,10 +119,11 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip). +Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip). +Static models can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip). Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml index 5628b7f7..1363b454 100644 --- a/examples/csmsc/voc1/conf/default.yaml +++ b/examples/csmsc/voc1/conf/default.yaml @@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. batch_size: 8 # Batch size. batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size. pin_memory: true # Whether to pin memory in Pytorch DataLoader. 
-num_workers: 4 # Number of workers in Pytorch DataLoader. +num_workers: 2 # Number of workers in Pytorch DataLoader. remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. diff --git a/examples/csmsc/voc1/local/train.sh b/examples/csmsc/voc1/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/csmsc/voc1/local/train.sh +++ b/examples/csmsc/voc1/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 780a8ccd..9cb9d34d 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -53,12 +53,9 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] [--verbose VERBOSE] -Train a ParallelWaveGAN model. +Train a Multi-Band MelGAN model. optional arguments: -h, --help show this help message and exit @@ -69,29 +66,14 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. @@ -101,27 +83,40 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] -Synthesize with parallel wavegan. +Synthesize with multi band melgan. optional arguments: -h, --help show this help message and exit - --config CONFIG parallel wavegan config file. + --config CONFIG multi band melgan config file. --checkpoint CHECKPOINT snapshot to load. --test-metadata TEST_METADATA dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. 
--verbose VERBOSE verbose. ``` -1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. +1. `--config` multi band melgan config file. You should use the same config with which the model is trained. 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models +Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip). +Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) + +Multi Band MelGAN checkpoint contains files listed below. + +```text +mb_melgan_baker_ckpt_0.5 +├── default.yaml # default config used to train multi band melgan +├── feats_stats.npy # statistics used to normalize spectrogram when training multi band melgan +└── snapshot_iter_1000000.pdz # generator parameters of multi band melgan +``` +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml new file mode 100644 index 00000000..e02f3e22 --- /dev/null +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -0,0 +1,139 @@ +# This is the hyperparameter configuration file for MelGAN. +# Please make sure this is adjusted for the CSMSC dataset. If you want to +# apply to the other dataset, you might need to carefully change some parameters. +# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V. + +# This configuration is based on full-band MelGAN but the hop size and sampling +# rate is different from the paper (16kHz vs 24kHz). The number of iteraions +# is not shown in the paper so currently we train 1M iterations (not sure enough +# to converge). The optimizer setting is based on @dathudeptrai advice. +# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size. (in samples) +n_shift: 300 # Hop size. (in samples) +win_length: 1200 # Window length. (in samples) + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 4 # Number of output channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + channels: 384 # Initial number of channels for conv layers. + upsample_scales: [5, 5, 3] # List of Upsampling scales. + stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack. 
+ stacks: 4 # Number of stacks in a single residual stack module. + use_weight_norm: True # Whether to use weight normalization. + use_causal_conv: False # Whether to use causal convolution. + use_final_nonlinear_activation: True + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + scales: 3 # Number of multi-scales. + downsample_pooling: "AvgPool1D" # Pooling type for the input downsampling. + downsample_pooling_params: # Parameters of the above pooling function. + kernel_size: 4 + stride: 2 + padding: 1 + exclusive: True + kernel_sizes: [5, 3] # List of kernel size. + channels: 16 # Number of channels of the initial conv layer. + max_downsample_channels: 512 # Maximum number of channels of downsampling layers. + downsample_scales: [4, 4, 4] # List of downsampling scales. + nonlinear_activation: "LeakyReLU" # Nonlinear activation function. + nonlinear_activation_params: # Parameters of nonlinear activation function. + negative_slope: 0.2 + use_weight_norm: True # Whether to use weight norm. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: true +stft_loss_params: + fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. + hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss + win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss +use_subband_stft_loss: true +subband_stft_loss_params: + fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. + hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss + win_lengths: [150, 300, 60] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +use_feat_match_loss: false # Whether to use feature matching loss. +lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 64 # Batch size. +batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 2 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-7 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. + +generator_grad_norm: -1 # Generator's gradient norm. +generator_scheduler_params: + learning_rate: 1.0e-3 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 100000 + - 200000 + - 300000 + - 400000 + - 500000 + - 600000 +discriminator_optimizer_params: + epsilon: 1.0e-7 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. + +discriminator_grad_norm: -1 # Discriminator's gradient norm. +discriminator_scheduler_params: + learning_rate: 1.0e-3 # Discriminator's learning rate. 
+ gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 100000 + - 200000 + - 300000 + - 400000 + - 500000 + - 600000 + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator. +train_max_steps: 1200000 # Number of training steps. +save_interval_steps: 1000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 100755 index 00000000..42e5a397 --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ + --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --dur-file=durations.txt \ + --output-dir=dump_finetune \ + --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 local/link_wav.py \ + --old-dump-dir=dump \ + --dump-dir=dump_finetune + +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + cp dump/train/feats_stats.npy dump_finetune/train/ +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/train/raw/metadata.jsonl \ + --dumpdir=dump_finetune/train/norm \ + --stats=dump_finetune/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/dev/raw/metadata.jsonl \ + --dumpdir=dump_finetune/dev/norm \ + --stats=dump_finetune/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/test/raw/metadata.jsonl \ + --dumpdir=dump_finetune/test/norm \ + --stats=dump_finetune/train/feats_stats.npy +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + CUDA_VISIBLE_DEVICES=${gpus} \ + FLAGS_cudnn_exhaustive_search=true \ + FLAGS_conv_workspace_size_limit=4000 \ + python ${BIN_DIR}/train.py \ + --train-metadata=dump_finetune/train/norm/metadata.jsonl \ + --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ + --config=conf/finetune.yaml \ + --output-dir=exp/finetune \ + --ngpu=1 +fi \ No newline at end of file diff --git a/examples/csmsc/voc3/local/link_wav.py b/examples/csmsc/voc3/local/link_wav.py new file mode 100644 index 00000000..c81e0d4b --- /dev/null +++ b/examples/csmsc/voc3/local/link_wav.py @@ -0,0 +1,85 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from operator import itemgetter +from pathlib import Path + +import jsonlines +import numpy as np + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features .") + + parser.add_argument( + "--old-dump-dir", + default=None, + type=str, + help="directory to dump feature files.") + parser.add_argument( + "--dump-dir", + type=str, + required=True, + help="directory to finetune dump feature files.") + args = parser.parse_args() + + old_dump_dir = Path(args.old_dump_dir).expanduser() + old_dump_dir = old_dump_dir.resolve() + dump_dir = Path(args.dump_dir).expanduser() + # use absolute path + dump_dir = dump_dir.resolve() + dump_dir.mkdir(parents=True, exist_ok=True) + + assert old_dump_dir.is_dir() + assert dump_dir.is_dir() + + for sub in ["train", "dev", "test"]: + # 把 old_dump_dir 里面的 *-wave.npy 软连接到 dump_dir 的对应位置 + output_dir = dump_dir / sub + output_dir.mkdir(parents=True, exist_ok=True) + results = [] + for name in os.listdir(output_dir / "raw"): + # 003918_feats.npy + utt_id = name.split("_")[0] + mel_path = output_dir / ("raw/" + name) + gen_mel = np.load(mel_path) + wave_name = utt_id + "_wave.npy" + wav = np.load(old_dump_dir / sub / ("raw/" + wave_name)) + os.symlink(old_dump_dir / sub / ("raw/" + wave_name), + output_dir / ("raw/" + wave_name)) + num_sample = wav.shape[0] + num_frames = gen_mel.shape[0] + wav_path = output_dir / ("raw/" + wave_name) + + record = { + "utt_id": utt_id, + "num_samples": num_sample, + "num_frames": num_frames, + "feats": str(mel_path), + "wave": str(wav_path), + } + results.append(record) + + results.sort(key=itemgetter("utt_id")) + + with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + + +if __name__ == "__main__": + main() diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/csmsc/voc3/local/train.sh +++ b/examples/csmsc/voc3/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/examples/librispeech/s0/local/download_lm_en.sh b/examples/librispeech/s0/local/download_lm_en.sh index dc1bdf66..390fffc9 100755 --- a/examples/librispeech/s0/local/download_lm_en.sh +++ b/examples/librispeech/s0/local/download_lm_en.sh @@ -9,11 +9,13 @@ URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm MD5="099a601759d467cd0a8523ff939819c5" TARGET=${DIR}/common_crawl_00.prune01111.trie.klm -echo "Download language model ..." -download $URL $MD5 $TARGET +echo "Start downloading the language model. The language model is large, please wait for a moment ..." +download $URL $MD5 $TARGET > /dev/null 2>&1 if [ $? -ne 0 ]; then echo "Fail to download the language model!" 
exit 1 +else + echo "Download the language model sucessfully" fi diff --git a/examples/librispeech/s0/local/test.sh b/examples/librispeech/s0/local/test.sh index 25dd0437..4d00f30b 100755 --- a/examples/librispeech/s0/local/test.sh +++ b/examples/librispeech/s0/local/test.sh @@ -13,7 +13,7 @@ ckpt_prefix=$2 model_type=$3 # download language model -bash local/download_lm_en.sh > /dev/null 2>&1 +bash local/download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/s0/run.sh index 385a24a4..6c4fc411 100755 --- a/examples/librispeech/s0/run.sh +++ b/examples/librispeech/s0/run.sh @@ -2,6 +2,7 @@ set -e source path.sh +gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml @@ -21,7 +22,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} ${model_type} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -31,7 +32,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -41,5 +42,5 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh index 9ea93430..74f7cbc1 100755 --- a/examples/librispeech/s1/run.sh +++ b/examples/librispeech/s1/run.sh @@ -4,6 +4,7 @@ set -e . ./path.sh || exit 1; . 
./cmd.sh || exit 1; +gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/transformer.yaml @@ -24,7 +25,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -42,12 +43,12 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml index b2babca7..de1ac347 100644 --- a/examples/librispeech/s2/conf/transformer.yaml +++ b/examples/librispeech/s2/conf/transformer.yaml @@ -1,32 +1,4 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - -collator: - vocab_filepath: data/lang_char/train_960_unigram5000_units.txt - unit_type: spm - spm_model_prefix: data/lang_char/train_960_unigram5000 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 30 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/augmentation.json - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - # network architecture model: cmvn_file: @@ -63,6 +35,33 @@ model: lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test-clean + +collator: + vocab_filepath: data/lang_char/train_960_unigram5000_units.txt + unit_type: spm + spm_model_prefix: data/lang_char/train_960_unigram5000 + feat_dim: 83 + stride_ms: 10.0 + window_ms: 25.0 + sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs + batch_size: 30 + maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced + maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced + minibatches: 0 # for debug + batch_count: auto + batch_bins: 0 + batch_frames_in: 0 + batch_frames_out: 0 + batch_frames_inout: 0 + augmentation_config: conf/augmentation.json + num_workers: 0 + subsampling_factor: 1 + num_encs: 1 + training: n_epoch: 120 diff 
--git a/examples/librispeech/s2/path.sh b/examples/librispeech/s2/path.sh index ad6b6913..840835c2 100644 --- a/examples/librispeech/s2/path.sh +++ b/examples/librispeech/s2/path.sh @@ -14,6 +14,10 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ MODEL=u2_kaldi export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin +LM_MODEL=transformer +export LM_BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/lm/${LM_MODEL}/bin + + # srilm export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs @@ -25,4 +29,4 @@ export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" -[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh \ No newline at end of file +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh index e014c2a9..facaafcb 100755 --- a/examples/librispeech/s2/run.sh +++ b/examples/librispeech/s2/run.sh @@ -5,6 +5,7 @@ set -e . ./path.sh || exit 1; . ./cmd.sh || exit 1; +gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/transformer.yaml @@ -24,7 +25,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -47,11 +48,11 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then +# # export ckpt avg_n +# ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - CUDA_VISIBLE_DEVICES= ./local/cacu_perplexity.sh || exit -1 + ./local/cacu_perplexity.sh || exit -1 fi diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index e8e3ebff..a3e947e8 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -30,8 +30,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} Here's the complete help message. ```text usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR] - [--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}] - [--nprocs NPROCS] [--opts ...] + [--checkpoint_path CHECKPOINT_PATH] [--ngpu NGPU] [--opts ...] optional arguments: -h, --help show this help message and exit @@ -41,16 +40,15 @@ optional arguments: --output OUTPUT_DIR path to save checkpoint and logs. --checkpoint_path CHECKPOINT_PATH path of the checkpoint to load - --device {cpu,gpu} device type to use, cpu and gpu are supported. - --nprocs NPROCS number of parallel processes to use. + --ngpu NGPU if ngpu == 0, use cpu. --opts ... 
options to overwrite --config file and the default config, passing in KEY VALUE pairs ``` -If you want to train on CPU, just set ``--device=cpu``. -If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU. -By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. -And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load. +If you want to train on CPU, just set `--ngpu=0`. +If you want to train on multiple GPUs, just set `--ngpu` to the number of GPUs you want to use. +By default, training will be resumed from the latest checkpoint in `--output`; if you want to start a new training, please use a new `${OUTPUTPATH}` with no checkpoint. +And if you want to resume from another existing model, you should set `checkpoint_path` to the checkpoint path you want to load. **Note: The checkpoint path cannot contain the file extension.** ### Synthesize @@ -60,7 +58,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_n ``` ```text usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH] - [--input INPUT] [--output OUTPUT] [--device DEVICE] + [--input INPUT] [--output OUTPUT] [--ngpu NGPU] [--opts ...] [-v] generate mel spectrogram with TransformerTTS. @@ -72,12 +70,12 @@ optional arguments: path of the checkpoint to load. --input INPUT path of the text sentences --output OUTPUT path to save outputs - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --opts ... options to overwrite --config file and the default config, passing in KEY VALUE pairs -v, --verbose print msg ``` -**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example) +**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example) ## Pretrained Models Pretrained Models can be downloaded from links below. We provide 2 models with different configurations.
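To make the new `--ngpu` switch concrete, here is a minimal sketch of calling `train.py` directly with the flags documented above; `dump` and `exp/default` are illustrative stand-ins for your own preprocess and output paths.
```bash
# CPU-only training: per the help text above, ngpu == 0 means "use cpu".
python3 ${BIN_DIR}/train.py --data=dump --output=exp/default --ngpu=0

# Multi-GPU training: expose the devices and request the same number of workers.
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 ${BIN_DIR}/train.py --data=dump --output=exp/default --ngpu=4
```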
diff --git a/examples/ljspeech/tts0/local/synthesize.sh b/examples/ljspeech/tts0/local/synthesize.sh index 91c89dd4..3f5f9c06 100755 --- a/examples/ljspeech/tts0/local/synthesize.sh +++ b/examples/ljspeech/tts0/local/synthesize.sh @@ -7,5 +7,5 @@ python3 ${BIN_DIR}/synthesize.py \ --config=${train_output_path}/config.yaml \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ --input=${BIN_DIR}/../sentences_en.txt \ - --output=${train_output_path}/test - --device=gpu \ No newline at end of file + --output=${train_output_path}/test \ + --ngpu=1 diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh index b8bcf5cb..a94f955a 100755 --- a/examples/ljspeech/tts0/local/train.sh +++ b/examples/ljspeech/tts0/local/train.sh @@ -6,4 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device=gpu \ \ No newline at end of file + --ngpu=1 \ \ No newline at end of file diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 0385fdce..fbb2c9d0 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -53,8 +53,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] Train a TransformerTTS model with LJSpeech TTS dataset. @@ -67,8 +66,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -76,12 +74,11 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. ## Synthesize -We use [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder. +We use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder. Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it. ```bash unzip waveflow_ljspeech_ckpt_0.3.zip @@ -104,7 +101,7 @@ usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG] [--waveflow-checkpoint WAVEFLOW_CHECKPOINT] [--phones-dict PHONES_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with transformer tts & waveflow. 
@@ -127,7 +124,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. @@ -142,7 +139,7 @@ usage: synthesize_e2e.py [-h] [--waveflow-config WAVEFLOW_CONFIG] [--waveflow-checkpoint WAVEFLOW_CHECKPOINT] [--phones-dict PHONES_DICT] [--text TEXT] - [--output-dir OUTPUT_DIR] [--device DEVICE] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] [--verbose VERBOSE] Synthesize with transformer tts & waveflow. @@ -165,7 +162,7 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` 1. `--transformer-tts-config`, `--transformer-tts-checkpoint`, `--transformer-tts-stat` and `--phones-dict` are arguments for transformer_tts, which correspond to the 4 files in the transformer_tts pretrained model. @@ -173,7 +170,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip) @@ -200,6 +197,5 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt ``` diff --git a/examples/ljspeech/tts1/local/synthesize.sh b/examples/ljspeech/tts1/local/synthesize.sh index 5d1c9534..9fe837a4 100755 --- a/examples/ljspeech/tts1/local/synthesize.sh +++ b/examples/ljspeech/tts1/local/synthesize.sh @@ -14,5 +14,4 @@ python3 ${BIN_DIR}/synthesize.py \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/synthesize_e2e.sh b/examples/ljspeech/tts1/local/synthesize_e2e.sh index 333a5cd6..046fdb70 100755 --- a/examples/ljspeech/tts1/local/synthesize_e2e.sh +++ b/examples/ljspeech/tts1/local/synthesize_e2e.sh @@ -14,5 +14,4 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/train.sh b/examples/ljspeech/tts1/local/train.sh index 8527f57f..5e255fb8 100755 --- a/examples/ljspeech/tts1/local/train.sh +++ b/examples/ljspeech/tts1/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=2 \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/README.md 
b/examples/ljspeech/tts3/README.md index dc711ce8..bc38aac6 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -7,7 +7,7 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech ### Get MFA result of LJSpeech-1.1 and Extract it We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. @@ -58,8 +58,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -72,8 +72,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -83,12 +82,11 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. ### Synthesize -We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_ljspeech_ckpt_0.5.zip @@ -112,7 +110,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -139,7 +137,7 @@ optional arguments: test metadata. 
--output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e_en.py`, which can synthesize waveform from text file. @@ -147,14 +145,15 @@ optional arguments: CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text -usage: synthesize_e2e_en.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] - [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] - [--fastspeech2-stat FASTSPEECH2_STAT] - [--pwg-config PWG_CONFIG] - [--pwg-checkpoint PWG_CHECKPOINT] - [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] - [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] +usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] + [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] + [--fastspeech2-stat FASTSPEECH2_STAT] + [--pwg-config PWG_CONFIG] + [--pwg-checkpoint PWG_CHECKPOINT] + [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] + [--text TEXT] [--output-dir OUTPUT_DIR] + [--inference-dir INFERENCE_DIR] [--ngpu NGPU] + [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -178,7 +177,9 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --inference-dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -187,7 +188,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) @@ -215,6 +216,5 @@ python3 ${BIN_DIR}/synthesize_e2e_en.py \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt ``` diff --git a/examples/ljspeech/tts3/local/synthesize.sh b/examples/ljspeech/tts3/local/synthesize.sh index 32dcde58..9b22abb3 100755 --- a/examples/ljspeech/tts3/local/synthesize.sh +++ b/examples/ljspeech/tts3/local/synthesize.sh @@ -15,5 +15,4 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh index 28ea3a8f..c723feef 100755 --- a/examples/ljspeech/tts3/local/synthesize_e2e.sh +++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh @@ -15,5 +15,4 @@ python3 ${BIN_DIR}/synthesize_e2e_en.py \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh index 847a44e3..d1302f99 100755 --- a/examples/ljspeech/tts3/local/train.sh +++ b/examples/ljspeech/tts3/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md index 6163ae42..ad2337ef 100644 --- a/examples/ljspeech/voc0/README.md +++ b/examples/ljspeech/voc0/README.md @@ -31,10 +31,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_ The training script requires 4 command line arguments. 1. `--data` is the path of the training dataset. 2. `--output` is the path of the output directory. -3. `--device` should be "cpu" or "gpu" -4. `--nprocs` is the number of processes to train the model in parallel. +3. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. -If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet. +If you want distributed training, set a larger `--ngpu` (e.g. 4). Note that distributed training with cpu is not supported yet. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels. @@ -46,7 +45,7 @@ Synthesize waveform. 1. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. 2. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does. 3. `--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extention name `.pdparmas` is not included here. -4. `--device` specifies to device to run synthesis on. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained Model with residual channel equals 128 can be downloaded here. 
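The WaveFlow synthesis flags listed above line up with a call like the following sketch; the mel directory and checkpoint prefix are placeholders, and the checkpoint is passed without its `.pdparams` extension, as the README notes.
```bash
# Vocode a folder of mel spectrograms (log-magnitude .npy files) on one GPU.
# mels/ and the checkpoint prefix are illustrative; set --ngpu=0 to run on CPU.
python3 ${BIN_DIR}/synthesize.py \
    --input=mels/ \
    --output=exp/default/wavs/ \
    --checkpoint_path=exp/default/checkpoints/step-2000000 \
    --ngpu=1 \
    --verbose
```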
[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip). diff --git a/examples/ljspeech/voc0/local/synthesize.sh b/examples/ljspeech/voc0/local/synthesize.sh index 055542cf..1d5e1183 100755 --- a/examples/ljspeech/voc0/local/synthesize.sh +++ b/examples/ljspeech/voc0/local/synthesize.sh @@ -8,5 +8,5 @@ python ${BIN_DIR}/synthesize.py \ --input=${input_mel_path} \ --output=${train_output_path}/wavs/ \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ - --device="gpu" \ + --ngpu=1 \ --verbose \ No newline at end of file diff --git a/examples/ljspeech/voc0/local/train.sh b/examples/ljspeech/voc0/local/train.sh index 5c4defd9..f062cbbf 100755 --- a/examples/ljspeech/voc0/local/train.sh +++ b/examples/ljspeech/voc0/local/train.sh @@ -6,5 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device="gpu" \ - --nprocs=1 \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index ba6eb002..fdeac632 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -5,7 +5,7 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). ### Get MFA results for silence trim We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. @@ -53,11 +53,10 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] - [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -70,8 +69,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. benchmark: @@ -91,8 +89,7 @@ benchmark: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. 
`--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -115,7 +112,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -123,7 +120,7 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/ljspeech/voc1/local/train.sh +++ b/examples/ljspeech/voc1/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/examples/other/1xt2x/aishell/local/download_lm_ch.sh b/examples/other/1xt2x/aishell/local/download_lm_ch.sh index ac27a907..47153f4b 100755 --- a/examples/other/1xt2x/aishell/local/download_lm_ch.sh +++ b/examples/other/1xt2x/aishell/local/download_lm_ch.sh @@ -10,11 +10,13 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3" TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm -echo "Download language model ..." -download $URL $MD5 $TARGET +echo "Start downloading the language model. The language model is large, please wait for a moment ..." +download $URL $MD5 $TARGET > /dev/null 2>&1 if [ $? -ne 0 ]; then echo "Fail to download the language model!" exit 1 +else + echo "Download the language model successfully" fi diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh index d539ac49..2ae0740b 100755 --- a/examples/other/1xt2x/aishell/local/test.sh +++ b/examples/other/1xt2x/aishell/local/test.sh @@ -13,7 +13,7 @@ ckpt_prefix=$2 model_type=$3 # download language model -bash local/download_lm_ch.sh > /dev/null 2>&1 +bash local/download_lm_ch.sh if [ $?
-ne 0 ]; then exit 1 fi diff --git a/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh b/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh index dc1bdf66..390fffc9 100755 --- a/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh +++ b/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh @@ -9,11 +9,13 @@ URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm MD5="099a601759d467cd0a8523ff939819c5" TARGET=${DIR}/common_crawl_00.prune01111.trie.klm -echo "Download language model ..." -download $URL $MD5 $TARGET +echo "Start downloading the language model. The language model is large, please wait for a moment ..." +download $URL $MD5 $TARGET > /dev/null 2>&1 if [ $? -ne 0 ]; then echo "Fail to download the language model!" exit 1 +else + echo "Download the language model successfully" fi diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh index 25dd0437..4d00f30b 100755 --- a/examples/other/1xt2x/baidu_en8k/local/test.sh +++ b/examples/other/1xt2x/baidu_en8k/local/test.sh @@ -13,7 +13,7 @@ ckpt_prefix=$2 model_type=$3 # download language model -bash local/download_lm_en.sh > /dev/null 2>&1 +bash local/download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi diff --git a/examples/other/1xt2x/librispeech/local/download_lm_en.sh b/examples/other/1xt2x/librispeech/local/download_lm_en.sh index dc1bdf66..390fffc9 100755 --- a/examples/other/1xt2x/librispeech/local/download_lm_en.sh +++ b/examples/other/1xt2x/librispeech/local/download_lm_en.sh @@ -9,11 +9,13 @@ URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm MD5="099a601759d467cd0a8523ff939819c5" TARGET=${DIR}/common_crawl_00.prune01111.trie.klm -echo "Download language model ..." -download $URL $MD5 $TARGET +echo "Start downloading the language model. The language model is large, please wait for a moment ..." +download $URL $MD5 $TARGET > /dev/null 2>&1 if [ $? -ne 0 ]; then echo "Fail to download the language model!" exit 1 +else + echo "Download the language model successfully" fi diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh index 25dd0437..4d00f30b 100755 --- a/examples/other/1xt2x/librispeech/local/test.sh +++ b/examples/other/1xt2x/librispeech/local/test.sh @@ -13,7 +13,7 @@ ckpt_prefix=$2 model_type=$3 # download language model -bash local/download_lm_en.sh > /dev/null 2>&1 +bash local/download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md new file mode 100644 index 00000000..14bd0d9d --- /dev/null +++ b/examples/other/g2p/README.md @@ -0,0 +1,20 @@ +# G2P +For g2p, we use BZNSYP's phone label as the ground truth and we delete silence tokens in labels and predicted phones. + +You should download BZNSYP from its [Official Website](https://test.data-baker.com/data/index/source) and extract it. Assume the path to the dataset is `~/datasets/BZNSYP`. + +We use `WER` as evaluation criterion. + +# Start +Run the command below to get the results of test. +```bash +./run.sh +``` +The `avg WER` of g2p is: 0.027495061517943988 +```text + ,--------------------------------------------------------------------.
+ | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | + |--------+-----------------+-----------------------------------------| + | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.5 | + `--------------------------------------------------------------------' +``` diff --git a/examples/other/text_frontend/get_g2p_data.py b/examples/other/g2p/get_g2p_data.py similarity index 100% rename from examples/other/text_frontend/get_g2p_data.py rename to examples/other/g2p/get_g2p_data.py diff --git a/examples/other/g2p/run.sh b/examples/other/g2p/run.sh new file mode 100755 index 00000000..214b8b3d --- /dev/null +++ b/examples/other/g2p/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +source path.sh +USE_SCLITE=true + +# test g2p +echo "Start get g2p test data ..." +python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p +echo "Start test g2p ..." +python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p + +# whether use sclite to get more detail information of WER +if [ "$USE_SCLITE" = true ];then + echo "Start sclite g2p ..." + ${MAIN_ROOT}/tools/sctk/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all +fi diff --git a/examples/other/text_frontend/test_g2p.py b/examples/other/g2p/test_g2p.py similarity index 100% rename from examples/other/text_frontend/test_g2p.py rename to examples/other/g2p/test_g2p.py diff --git a/examples/other/ge2e/README.md b/examples/other/ge2e/README.md index 1fa9677a..d86c8c13 100644 --- a/examples/other/ge2e/README.md +++ b/examples/other/ge2e/README.md @@ -1,5 +1,5 @@ # Speaker Encoder -This experiment trains a speaker encoder with speaker verification as its task. It is done as a part of the experiment of transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [examples/aishell3/vc0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/vc0). The trained speaker encoder is used to extract utterance embeddings from utterances. +This experiment trains a speaker encoder with speaker verification as its task. It is done as a part of the experiment of transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [examples/aishell3/vc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0). The trained speaker encoder is used to extract utterance embeddings from utterances. ## Model The model used in this experiment is the speaker encoder with text independent speaker verification task in [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf). GE2E-softmax loss is used. @@ -70,8 +70,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_ In `${BIN_DIR}/train.py`: 1. `--data` is the path to the preprocessed dataset. 2. `--output` is the directory to save results,usually a subdirectory of `runs`.It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training. -3. `--device` is the device type to run the training, 'cpu' and 'gpu' are supported. -4. `--nprocs` is the number of replicas to run in multiprocessing based parallel training。Currently multiprocessing based parallel training is only enabled when using 'gpu' as the devicde. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 5. 
`CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda. Other options are described below. @@ -91,7 +90,7 @@ In `${BIN_DIR}/inference.py`: 2. `--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corrsponding utterance embedding file in `*.npy` format. 3. `--checkpoint_path` is the path of the checkpoint to use, extension not included. 4. `--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`. -5. `--device` and `--opts` have the same meaning as in the training script. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps. diff --git a/examples/other/ge2e/local/inference.sh b/examples/other/ge2e/local/inference.sh index 1beebdfa..431c5309 100755 --- a/examples/other/ge2e/local/inference.sh +++ b/examples/other/ge2e/local/inference.sh @@ -10,5 +10,5 @@ python3 ${BIN_DIR}/inference.py \ --input=${infer_input} \ --output=${infer_output} \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ - --device="gpu" + --ngpu=1 diff --git a/examples/other/ge2e/local/train.sh b/examples/other/ge2e/local/train.sh index 5c4defd9..f062cbbf 100755 --- a/examples/other/ge2e/local/train.sh +++ b/examples/other/ge2e/local/train.sh @@ -6,5 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device="gpu" \ - --nprocs=1 \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/other/ge2e/path.sh b/examples/other/ge2e/path.sh index b4f77985..24305ef7 100755 --- a/examples/other/ge2e/path.sh +++ b/examples/other/ge2e/path.sh @@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} MODEL=ge2e -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} +export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL} diff --git a/examples/other/ngram_lm/READEME.md b/examples/other/ngram_lm/README.md similarity index 100% rename from examples/other/ngram_lm/READEME.md rename to examples/other/ngram_lm/README.md diff --git a/examples/other/text_frontend/README.md b/examples/other/text_frontend/README.md deleted file mode 100644 index 0bf6e72d..00000000 --- a/examples/other/text_frontend/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Chinese Text Frontend Example -Here's an example for Chinese text frontend, including g2p and text normalization. -## G2P -For g2p, we use BZNSYP's phone label as the ground truth and we delete silence tokens in labels and predicted phones. - -You should Download BZNSYP from it's [Official Website](https://test.data-baker.com/data/index/source) and extract it. Assume the path to the dataset is `~/datasets/BZNSYP`. - -We use `WER` as evaluation criterion. -## Text Normalization -For text normalization, the test data is `data/textnorm_test_cases.txt`, we use `|` as the separator of raw_data and normed_data. - -We use `CER` as evaluation criterion. -## Start -If you want to use sclite to get more detail information of WER, you should run the command below to make sclite first. -```bash -./make_sclite.sh -``` -Run the command below to get the results of test. -```bash -./run.sh -``` -The `avg WER` of g2p is: 0.027495061517943988 -```text - ,--------------------------------------------------------------------. 
- | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | - |--------+-----------------+-----------------------------------------| - | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.5 | - `--------------------------------------------------------------------' -``` - -The `avg CER` of text normalization is: 0.006388318503308237 -```text - ,-----------------------------------------------------------------. - | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | - |--------+--------------+-----------------------------------------| - | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.1 0.7 3.2 | - `-----------------------------------------------------------------' -``` diff --git a/examples/other/text_frontend/make_sclite.sh b/examples/other/text_frontend/make_sclite.sh deleted file mode 100755 index db8c921c..00000000 --- a/examples/other/text_frontend/make_sclite.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -if [ ! -d "./SCTK" ];then - echo "Clone SCTK ..." - git clone https://github.com/usnistgov/SCTK - echo "Clone SCTK done!" -fi - -if [ ! -d "./SCTK/bin" ];then - echo "Start make SCTK ..." - pushd SCTK && make config && make all && make check && make install && make doc && popd - echo "SCTK make done!" -fi diff --git a/examples/other/text_frontend/run.sh b/examples/other/text_frontend/run.sh deleted file mode 100755 index 9882b057..00000000 --- a/examples/other/text_frontend/run.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -USE_SCLITE=true - -# test g2p -echo "Start get g2p test data ..." -python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p -echo "Start test g2p ..." -python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p - -# test text normalization -echo "Start get text normalization test data ..." -python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm -echo "Start test text normalization ..." -python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm - -# whether use sclite to get more detail information of WER -if [ "$USE_SCLITE" = true ];then - echo "Start sclite g2p ..." - ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all - echo - - echo "Start sclite textnorm ..." - ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all -fi \ No newline at end of file diff --git a/examples/other/tn/README.md b/examples/other/tn/README.md new file mode 100644 index 00000000..dfefccde --- /dev/null +++ b/examples/other/tn/README.md @@ -0,0 +1,17 @@ +# Text Normalization +For text normalization, the test data is `data/textnorm_test_cases.txt`, we use `|` as the separator of raw_data and normed_data. + +We use `CER` as evaluation criterion. +## Start +Run the command below to get the results of test. +```bash +./run.sh +``` +The `avg CER` of text normalization is: 0.006388318503308237 +```text + ,-----------------------------------------------------------------. 
+ | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | + |--------+--------------+-----------------------------------------| + | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.1 0.7 3.2 | + `-----------------------------------------------------------------' +``` diff --git a/examples/other/text_frontend/data/textnorm_test_cases.txt b/examples/other/tn/data/textnorm_test_cases.txt similarity index 100% rename from examples/other/text_frontend/data/textnorm_test_cases.txt rename to examples/other/tn/data/textnorm_test_cases.txt diff --git a/examples/other/text_frontend/get_textnorm_data.py b/examples/other/tn/get_textnorm_data.py similarity index 100% rename from examples/other/text_frontend/get_textnorm_data.py rename to examples/other/tn/get_textnorm_data.py diff --git a/examples/other/tn/run.sh b/examples/other/tn/run.sh new file mode 100755 index 00000000..e8298a84 --- /dev/null +++ b/examples/other/tn/run.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +source path.sh + +USE_SCLITE=true + +# test text normalization +echo "Start get text normalization test data ..." +python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm +echo "Start test text normalization ..." +python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm + +# whether use sclite to get more detail information of WER +if [ "$USE_SCLITE" = true ];then + echo "Start sclite textnorm ..." + ${MAIN_ROOT}/tools/sctk/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all +fi \ No newline at end of file diff --git a/examples/other/text_frontend/test_textnorm.py b/examples/other/tn/test_textnorm.py similarity index 100% rename from examples/other/text_frontend/test_textnorm.py rename to examples/other/tn/test_textnorm.py diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml index f8dc383f..9c1ac91a 100644 --- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml +++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml @@ -99,6 +99,7 @@ decoding: alpha: 2.5 beta: 0.3 beam_size: 10 + word_reward: 0.7 cutoff_prob: 1.0 cutoff_top_n: 0 num_proc_bsearch: 8 diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/t0/run.sh index 654d4dce..fb4bc338 100755 --- a/examples/ted_en_zh/t0/run.sh +++ b/examples/ted_en_zh/t0/run.sh @@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/timit/asr1/run.sh b/examples/timit/asr1/run.sh index 4c0a5cdc..a95b5f3a 100755 --- a/examples/timit/asr1/run.sh +++ b/examples/timit/asr1/run.sh @@ -3,6 +3,7 @@ set -e . 
path.sh || exit 1; +gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml @@ -23,7 +24,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -33,7 +34,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -41,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/tiny/s0/local/download_lm_en.sh b/examples/tiny/s0/local/download_lm_en.sh index dc1bdf66..390fffc9 100755 --- a/examples/tiny/s0/local/download_lm_en.sh +++ b/examples/tiny/s0/local/download_lm_en.sh @@ -9,11 +9,13 @@ URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm MD5="099a601759d467cd0a8523ff939819c5" TARGET=${DIR}/common_crawl_00.prune01111.trie.klm -echo "Download language model ..." -download $URL $MD5 $TARGET +echo "Start downloading the language model. The language model is large, please wait for a moment ..." +download $URL $MD5 $TARGET > /dev/null 2>&1 if [ $? -ne 0 ]; then echo "Fail to download the language model!" exit 1 +else + echo "Download the language model successfully" fi diff --git a/examples/tiny/s0/local/test.sh b/examples/tiny/s0/local/test.sh index 25dd0437..4d00f30b 100755 --- a/examples/tiny/s0/local/test.sh +++ b/examples/tiny/s0/local/test.sh @@ -13,7 +13,7 @@ ckpt_prefix=$2 model_type=$3 # download language model -bash local/download_lm_en.sh > /dev/null 2>&1 +bash local/download_lm_en.sh if [ $?
-ne 0 ]; then exit 1 fi diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh index 6580afed..155eca17 100755 --- a/examples/tiny/s1/run.sh +++ b/examples/tiny/s1/run.sh @@ -2,6 +2,7 @@ set -e source path.sh +gpus=0 stage=0 stop_stage=100 conf_path=conf/transformer.yaml @@ -20,7 +21,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -30,16 +31,16 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES= ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 717ee7ac..78bfb966 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -7,8 +7,8 @@ Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle ### Get MFA result of VCTK and Extract it We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo. -ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. @@ -61,8 +61,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -75,8 +75,7 @@ optional arguments: dev data. 
--output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -86,12 +85,10 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--phones-dict` is the path of the phone vocabulary file. ### Synthesize -We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)and unzip it. ```bash @@ -116,7 +113,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -143,7 +140,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e_en.py`, which can synthesize waveform from text file. @@ -161,7 +158,7 @@ usage: multi_spk_synthesize_e2e_en.py [-h] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -187,7 +184,7 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -196,7 +193,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. 
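One practical note on this vctk/tts3 change: `local/train.sh` below now passes `--ngpu=1` where it previously used `--nprocs=2`. If you want to keep two-card training, a sketch like the following should work; the metadata, config, and output paths follow the `dump`/`exp/default` layout used throughout this example, and `--ngpu` is assumed to match the number of visible devices.
```bash
# Two-GPU multi-speaker FastSpeech2 training (sketch; adjust paths to your run).
CUDA_VISIBLE_DEVICES=0,1 python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
    --output-dir=exp/default \
    --ngpu=2 \
    --phones-dict=dump/phone_id_map.txt \
    --speaker-dict=dump/speaker_id_map.txt
```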
[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip) @@ -218,14 +215,13 @@ FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ --fastspeech2-config=fastspeech2_nosil_vctk_ckpt_0.5/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_96400.pdz \ + --fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_66200.pdz \ --fastspeech2-stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \ --pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ --pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ - --text=${BIN_DIR}/../sentences.txt \ + --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \ --speaker-dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt ``` diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh index ca112969..8165c858 100755 --- a/examples/vctk/tts3/local/synthesize.sh +++ b/examples/vctk/tts3/local/synthesize.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index d919bb08..e0b2a041 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh index be6051c9..3a507650 100755 --- a/examples/vctk/tts3/local/train.sh +++ b/examples/vctk/tts3/local/train.sh @@ -8,6 +8,6 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index cbfff32d..c9ecae0d 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -7,8 +7,8 @@ Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handl ### Get MFA results for silence trim We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo. 
-ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. @@ -58,9 +58,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -74,8 +73,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. benchmark: @@ -95,8 +93,8 @@ benchmark: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash @@ -105,7 +103,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -118,7 +116,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -126,7 +124,7 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 
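Under the hood, the single `--ngpu` flag replaces the old `--device`/`--nprocs` pair in every entry point touched by this change. A minimal sketch of the pattern the updated `train.py`/`synthesize.py` scripts follow (`train_sp` and `config` stand in for the real training function and parsed yaml config):

```python
import argparse

import paddle
import paddle.distributed as dist


def train_sp(args, config):
    # stand-in for the real training entry point
    print(f"training on {paddle.get_device()}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    args = parser.parse_args()
    config = None  # stands in for the parsed yaml CfgNode

    # device selection: ngpu == 0 -> cpu, ngpu >= 1 -> gpu
    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should >= 0 !")

    # dispatch: spawn one trainer process per GPU when ngpu > 1
    if args.ngpu > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()
```

So `--ngpu=0` forces CPU, `--ngpu=1` runs on a single GPU, and `--ngpu>1` launches one process per GPU, which is why the separate `--nprocs` flag is no longer needed.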
## Pretrained Models Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip). diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/vctk/voc1/local/train.sh +++ b/examples/vctk/voc1/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/paddlespeech/cls/__init__.py b/paddlespeech/cls/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/paddlespeech/cls/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py b/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py index c089f96c..d06125b7 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py +++ b/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py @@ -126,8 +126,12 @@ decoders_module = [ ] setup( - name='swig_decoders', - version='1.1', - description="""CTC decoders""", + name='paddlespeech_ctcdecoders', + version='0.0.1a', + description="CTC decoders in paddlespeech", + author="PaddlePaddle Speech and Language Team", + author_email="paddlesl@baidu.com", + url="https://github.com/PaddlePaddle/PaddleSpeech", + license='Apache 2.0', ext_modules=decoders_module, - py_modules=['swig_decoders'], ) + py_modules=['swig_decoders']) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index b6b34d08..e073ebbf 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -39,6 +39,8 @@ if __name__ == "__main__": "--export_path", type=str, help="path of the jit model to save") parser.add_argument( "--model_type", type=str, default='offline', help='offline/online') + parser.add_argument( + "--enable-auto-log", action="store_true", help="use auto log") args = parser.parse_args() print_arguments(args, globals()) print("model_type:{}".format(args.model_type)) diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 683fab14..177d710b 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -42,7 +42,6 @@ from paddlespeech.s2t.training.trainer import Trainer from paddlespeech.s2t.utils import error_rate from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils import mp_tools -from paddlespeech.s2t.utils.log import Autolog from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.utility import UpdateConfig @@ -339,8 +338,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): error_rate_type=cfg.error_rate_type) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): - self.autolog.times.start() - 
self.autolog.times.stamp() result_transcripts = self.model.decode( audio, audio_len, @@ -354,19 +351,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): cutoff_top_n=cfg.cutoff_top_n, num_processes=cfg.num_proc_bsearch) - self.autolog.times.stamp() - self.autolog.times.stamp() - self.autolog.times.end() return result_transcripts @mp_tools.rank_zero_only @paddle.no_grad() def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - self.autolog = Autolog( - batch_size=self.config.decoding.batch_size, - model_name="deepspeech2", - model_precision="fp32").getlog() self.model.eval() cfg = self.config error_rate_type = None @@ -390,7 +380,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): msg += "Final error rate [%s] (%d/%d) = %f" % ( error_rate_type, num_ins, num_ins, errors_sum / len_refs) logger.info(msg) - self.autolog.report() @paddle.no_grad() def export(self): @@ -414,6 +403,43 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): def __init__(self, config, args): super().__init__(config, args) self.apply_static = True + self.args = args + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + if self.args.enable_auto_log == True: + from paddlespeech.s2t.utils.log import Autolog + self.autolog = Autolog( + batch_size=self.config.decoding.batch_size, + model_name="deepspeech2", + model_precision="fp32").getlog() + self.model.eval() + cfg = self.config + error_rate_type = None + errors_sum, len_refs, num_ins = 0.0, 0, 0 + with jsonlines.open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + utts, audio, audio_len, texts, texts_len = batch + metrics = self.compute_metrics(utts, audio, audio_len, texts, + texts_len, fout) + errors_sum += metrics['errors_sum'] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + error_rate_type = metrics['error_rate_type'] + logger.info("Error rate [%s] (%d/?) 
= %f" % + (error_rate_type, num_ins, errors_sum / len_refs)) + + # logging + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "Final error rate [%s] (%d/%d) = %f" % ( + error_rate_type, num_ins, num_ins, errors_sum / len_refs) + logger.info(msg) + if self.args.enable_auto_log == True: + self.autolog.report() def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): if self.args.model_type == "online": @@ -486,8 +512,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): x_len_list = np.split(x_len_batch, batch_size, axis=0) for x, x_len in zip(x_list, x_len_list): - self.autolog.times.start() - self.autolog.times.stamp() + if self.args.enable_auto_log == True: + self.autolog.times.start() x_len = x_len[0] assert (chunk_size <= x_len) @@ -521,6 +547,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): probs_chunk_list = [] probs_chunk_lens_list = [] + if self.args.enable_auto_log == True: + # record the model preprocessing time + self.autolog.times.stamp() + for i in range(0, num_chunk): start = i * chunk_stride end = start + chunk_size @@ -576,9 +606,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): [output_probs, output_probs_padding], axis=1) output_probs_list.append(output_probs) output_lens_list.append(output_lens) - self.autolog.times.stamp() - self.autolog.times.stamp() - self.autolog.times.end() + if self.args.enable_auto_log == True: + # record the model inference time + self.autolog.times.stamp() + # record the post processing time + self.autolog.times.stamp() + self.autolog.times.end() output_probs = np.concatenate(output_probs_list, axis=0) output_lens = np.concatenate(output_lens_list, axis=0) return output_probs, output_lens @@ -608,12 +641,17 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): audio_len_handle.reshape(x_len.shape) audio_len_handle.copy_from_cpu(x_len) - self.autolog.times.start() - self.autolog.times.stamp() + if self.args.enable_auto_log == True: + self.autolog.times.start() + # record the prefix processing time + self.autolog.times.stamp() self.predictor.run() - self.autolog.times.stamp() - self.autolog.times.stamp() - self.autolog.times.end() + if self.args.enable_auto_log == True: + # record the model inference time + self.autolog.times.stamp() + # record the post processing time + self.autolog.times.stamp() + self.autolog.times.end() output_names = self.predictor.get_output_names() output_handle = self.predictor.get_output_handle(output_names[0]) @@ -624,11 +662,11 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): def setup_model(self): super().setup_model() - speedyspeech_config = inference.Config( + deepspeech_config = inference.Config( self.args.export_path + ".pdmodel", self.args.export_path + ".pdiparams") if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): - speedyspeech_config.enable_use_gpu(100, 0) - speedyspeech_config.enable_memory_optim() - speedyspeech_predictor = inference.create_predictor(speedyspeech_config) - self.predictor = speedyspeech_predictor + deepspeech_config.enable_use_gpu(100, 0) + deepspeech_config.enable_memory_optim() + deepspeech_predictor = inference.create_predictor(deepspeech_config) + self.predictor = deepspeech_predictor diff --git a/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py b/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py index ab0ec8f0..e628f323 100644 --- a/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py +++ b/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py @@ -19,8 +19,8 @@ 
import paddle from paddle.io import DataLoader from yacs.config import CfgNode -from paddlespeech.s2t.io.collator import TextCollatorSpm -from paddlespeech.s2t.io.dataset import TextDataset +from paddlespeech.s2t.models.lm.dataset import TextCollatorSpm +from paddlespeech.s2t.models.lm.dataset import TextDataset from paddlespeech.s2t.models.lm_interface import dynamic_import_lm from paddlespeech.s2t.utils.log import Log diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 7eed9391..9f5448cc 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -208,12 +208,14 @@ class U2Trainer(Trainer): observation['batch_cost'] = observation[ 'reader_cost'] + observation['step_cost'] observation['samples'] = observation['batch_size'] - observation['ips[sent./sec]'] = observation[ + observation['ips,sent./sec'] = observation[ 'batch_size'] / observation['batch_cost'] for k, v in observation.items(): - msg += f" {k}: " + msg += f" {k.split(',')[0]}: " msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}" + msg += f" {k.split(',')[1]}" if len( + k.split(',')) == 2 else f"" msg += "," msg = msg[:-1] # remove the last "," if (batch_index + 1 diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 91390afe..52d3c3b7 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -441,10 +441,7 @@ class U2STTester(U2STTrainer): "".join(chr(t) for t in text[:text_len]) for text, text_len in zip(texts, texts_len) ] - # from IPython import embed - # import os - # embed() - # os._exit(0) + hyps = self.model.decode( audio, audio_len, @@ -458,6 +455,7 @@ class U2STTester(U2STTrainer): cutoff_top_n=cfg.cutoff_top_n, num_processes=cfg.num_proc_bsearch, ctc_weight=cfg.ctc_weight, + word_reward=cfg.word_reward, decoding_chunk_size=cfg.decoding_chunk_size, num_decoding_left_chunks=cfg.num_decoding_left_chunks, simulate_streaming=cfg.simulate_streaming) diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index 35b86871..5f233549 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -19,7 +19,6 @@ from yacs.config import CfgNode from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline from paddlespeech.s2t.frontend.featurizer.speech_featurizer import SpeechFeaturizer -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer from paddlespeech.s2t.frontend.speech import SpeechSegment from paddlespeech.s2t.frontend.utility import IGNORE_ID @@ -46,43 +45,6 @@ def _tokenids(text, keep_transcription_text): return tokens -class TextCollatorSpm(): - def __init__(self, unit_type, vocab_filepath, spm_model_prefix): - assert (vocab_filepath is not None) - self.text_featurizer = TextFeaturizer( - unit_type=unit_type, - vocab_filepath=vocab_filepath, - spm_model_prefix=spm_model_prefix) - self.eos_id = self.text_featurizer.eos_id - self.blank_id = self.text_featurizer.blank_id - - def __call__(self, batch): - """ - return type [List, np.array [B, T], np.array [B, T], np.array[B]] - """ - keys = [] - texts = [] - texts_input = [] - texts_output = [] - text_lens = [] - - for idx, item in enumerate(batch): - key = item.split(" ")[0].strip() - text = " ".join(item.split(" ")[1:]) - keys.append(key) - token_ids = self.text_featurizer.featurize(text) - texts_input.append( - np.array([self.eos_id] + token_ids).astype(np.int64)) - 
texts_output.append( - np.array(token_ids + [self.eos_id]).astype(np.int64)) - text_lens.append(len(token_ids) + 1) - - ys_input_pad = pad_list(texts_input, self.blank_id).astype(np.int64) - ys_output_pad = pad_list(texts_output, self.blank_id).astype(np.int64) - y_lens = np.array(text_lens).astype(np.int64) - return keys, ys_input_pad, ys_output_pad, y_lens - - class SpeechCollatorBase(): def __init__( self, diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index c5df2d6b..61eeb00f 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -26,25 +26,6 @@ __all__ = ["ManifestDataset", "TransformDataset"] logger = Log(__name__).getlog() -class TextDataset(Dataset): - @classmethod - def from_file(cls, file_path): - dataset = cls(file_path) - return dataset - - def __init__(self, file_path): - self._manifest = [] - with open(file_path) as f: - for line in f: - self._manifest.append(line.strip()) - - def __len__(self): - return len(self._manifest) - - def __getitem__(self, idx): - return self._manifest[idx] - - class ManifestDataset(Dataset): @classmethod def params(cls, config: Optional[CfgNode]=None) -> CfgNode: diff --git a/paddlespeech/s2t/models/lm/dataset.py b/paddlespeech/s2t/models/lm/dataset.py new file mode 100644 index 00000000..4059dfe2 --- /dev/null +++ b/paddlespeech/s2t/models/lm/dataset.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
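The `TextDataset`/`TextCollatorSpm` pair removed from `s2t/io` above is re-homed in this new `paddlespeech/s2t/models/lm/dataset.py`, and `lm_cacu_perplexity.py` now imports it from there. A rough usage sketch of the relocated classes; the paths and the `spm` unit type are placeholders, and the `DataLoader` wiring is an assumption about how a perplexity script would consume them:

```python
from paddle.io import DataLoader

from paddlespeech.s2t.models.lm.dataset import TextCollatorSpm
from paddlespeech.s2t.models.lm.dataset import TextDataset

# placeholder paths; point these at your own text list, vocab and spm model
text_file = "data/test_clean/text"          # lines of "utt_id sentence"
vocab_file = "data/lang_char/vocab.txt"
spm_prefix = "data/lang_char/bpe_unigram_5000"

dataset = TextDataset.from_file(text_file)
collator = TextCollatorSpm(
    unit_type="spm", vocab_filepath=vocab_file, spm_model_prefix=spm_prefix)
loader = DataLoader(
    dataset, batch_size=32, shuffle=False, drop_last=False, collate_fn=collator)

for keys, ys_input_pad, ys_output_pad, y_lens in loader:
    # ys_input_pad: [B, T] token ids prefixed with <eos>;
    # ys_output_pad: the same ids suffixed with <eos>, padded with blank_id
    pass
```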
+import numpy as np +from paddle.io import Dataset + +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.io.utility import pad_list + + +class TextDataset(Dataset): + @classmethod + def from_file(cls, file_path): + dataset = cls(file_path) + return dataset + + def __init__(self, file_path): + self._manifest = [] + with open(file_path) as f: + for line in f: + self._manifest.append(line.strip()) + + def __len__(self): + return len(self._manifest) + + def __getitem__(self, idx): + return self._manifest[idx] + + +class TextCollatorSpm(): + def __init__(self, unit_type, vocab_filepath, spm_model_prefix): + assert (vocab_filepath is not None) + self.text_featurizer = TextFeaturizer( + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix) + self.eos_id = self.text_featurizer.eos_id + self.blank_id = self.text_featurizer.blank_id + + def __call__(self, batch): + """ + return type [List, np.array [B, T], np.array [B, T], np.array[B]] + """ + keys = [] + texts = [] + texts_input = [] + texts_output = [] + text_lens = [] + + for idx, item in enumerate(batch): + key = item.split(" ")[0].strip() + text = " ".join(item.split(" ")[1:]) + keys.append(key) + token_ids = self.text_featurizer.featurize(text) + texts_input.append( + np.array([self.eos_id] + token_ids).astype(np.int64)) + texts_output.append( + np.array(token_ids + [self.eos_id]).astype(np.int64)) + text_lens.append(len(token_ids) + 1) + + ys_input_pad = pad_list(texts_input, self.blank_id).astype(np.int64) + ys_output_pad = pad_list(texts_output, self.blank_id).astype(np.int64) + y_lens = np.array(text_lens).astype(np.int64) + return keys, ys_input_pad, ys_output_pad, y_lens diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 76c50150..a83e6707 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -315,6 +315,7 @@ class U2STBaseModel(nn.Layer): speech: paddle.Tensor, speech_lengths: paddle.Tensor, beam_size: int=10, + word_reward: float=0.0, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, simulate_streaming: bool=False, ) -> paddle.Tensor: @@ -378,6 +379,7 @@ class U2STBaseModel(nn.Layer): # 2.2 First beam prune: select topk best prob at current time top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) + top_k_logp += word_reward top_k_logp = mask_finished_scores(top_k_logp, end_flag) top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) @@ -528,6 +530,7 @@ class U2STBaseModel(nn.Layer): cutoff_top_n: int, num_processes: int, ctc_weight: float=0.0, + word_reward: float=0.0, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, simulate_streaming: bool=False): @@ -569,6 +572,7 @@ class U2STBaseModel(nn.Layer): feats, feats_lengths, beam_size=beam_size, + word_reward=word_reward, decoding_chunk_size=decoding_chunk_size, num_decoding_left_chunks=num_decoding_left_chunks, simulate_streaming=simulate_streaming) diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 5ed9aa7a..9470f923 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples): def fastspeech2_multi_spk_batch_fn(examples): - # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"] + # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", 
"pitch", "energy", "spk_id"/"spk_emb"] text = [np.array(item["text"], dtype=np.int64) for item in examples] speech = [np.array(item["speech"], dtype=np.float32) for item in examples] pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] @@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples): speech_lengths = [ np.array(item["speech_lengths"], dtype=np.int64) for item in examples ] - spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] text = batch_sequences(text) pitch = batch_sequences(pitch) @@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples): energy = paddle.to_tensor(energy) text_lengths = paddle.to_tensor(text_lengths) speech_lengths = paddle.to_tensor(speech_lengths) - spk_id = paddle.to_tensor(spk_id) batch = { "text": text, @@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples): "speech": speech, "speech_lengths": speech_lengths, "pitch": pitch, - "energy": energy, - "spk_id": spk_id + "energy": energy } + # spk_emb has a higher priority than spk_id + if "spk_emb" in examples[0]: + spk_emb = [ + np.array(item["spk_emb"], dtype=np.float32) for item in examples + ] + spk_emb = batch_sequences(spk_emb) + spk_emb = paddle.to_tensor(spk_emb) + batch["spk_emb"] = spk_emb + elif "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id return batch diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py index 2de4fb12..2e4f740f 100644 --- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py +++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py @@ -110,10 +110,10 @@ class Clip(object): if len(x) < c.shape[0] * self.hop_size: x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge") elif len(x) > c.shape[0] * self.hop_size: - print( - f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })" - ) - x = x[:c.shape[1] * self.hop_size] + # print( + # f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })" + # ) + x = x[:c.shape[0] * self.hop_size] # check the legnth is valid assert len(x) == c.shape[ diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py new file mode 100644 index 00000000..8a9ef370 --- /dev/null +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -0,0 +1,167 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# generate mels using durations.txt +# for mb melgan finetune +# 长度和原本的 mel 不一致怎么办? 
+import argparse +from pathlib import Path + +import numpy as np +import paddle +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference +from paddlespeech.t2s.modules.normalizer import ZScore + + +def evaluate(args, fastspeech2_config): + + # construct dataset for evaluation + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + phone_dict = {} + for phn, id in phn_id: + phone_dict[phn] = int(id) + + odim = fastspeech2_config.n_mels + model = FastSpeech2( + idim=vocab_size, odim=odim, **fastspeech2_config["model"]) + + model.set_state_dict( + paddle.load(args.fastspeech2_checkpoint)["main_params"]) + model.eval() + + stat = np.load(args.fastspeech2_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + fastspeech2_normalizer = ZScore(mu, std) + + fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer, + model) + fastspeech2_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences, speaker_set = get_phn_dur(args.dur_file) + merge_silence(sentences) + + for i, utt_id in enumerate(sentences): + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + # 裁剪掉开头和结尾的 sil + if args.cut_sil: + if phones[0] == "sil" and len(durations) > 1: + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + durations = durations[:-1] + phones = phones[:-1] + # sentences[utt_id][0] = phones + # sentences[utt_id][1] = durations + + phone_ids = [phone_dict[phn] for phn in phones] + phone_ids = paddle.to_tensor(np.array(phone_ids)) + durations = paddle.to_tensor(np.array(durations)) + # 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复 + # split data into 3 sections + if args.dataset == "baker": + num_train = 9800 + num_dev = 100 + if i in range(0, num_train): + sub_output_dir = output_dir / ("train/raw") + elif i in range(num_train, num_train + num_dev): + sub_output_dir = output_dir / ("dev/raw") + else: + sub_output_dir = output_dir / ("test/raw") + sub_output_dir.mkdir(parents=True, exist_ok=True) + with paddle.no_grad(): + mel = fastspeech2_inference(phone_ids, durations=durations) + np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with fastspeech2 & parallel wavegan.") + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker, ljspeech, vctk} now") + parser.add_argument( + "--fastspeech2-config", type=str, help="fastspeech2 config file.") + parser.add_argument( + "--fastspeech2-checkpoint", + type=str, + help="fastspeech2 checkpoint to load.") + parser.add_argument( + "--fastspeech2-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training fastspeech2." 
+ ) + + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.fastspeech2_config) as f: + fastspeech2_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(fastspeech2_config) + + evaluate(args, fastspeech2_config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index 98cf9f8f..1839415e 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config): print("vocab_size:", vocab_size) with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) - print("num_speakers:", num_speakers) + spk_num = len(spk_id) + print("spk_num:", spk_num) odim = fastspeech2_config.n_mels model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( @@ -87,26 +87,27 @@ def evaluate(args, fastspeech2_config, pwg_config): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) # only test the number 0 speaker - spk_id = 0 - for utt_id, sentence in sentences: - input_ids = frontend.get_input_ids(sentence, merge_sentences=True) - phone_ids = input_ids["phone_ids"] - flags = 0 - for part_phone_ids in phone_ids: - with paddle.no_grad(): - mel = fastspeech2_inference( - part_phone_ids, spk_id=paddle.to_tensor(spk_id)) - temp_wav = pwg_inference(mel) - if flags == 0: - wav = temp_wav - flags = 1 - else: - wav = paddle.concat([wav, temp_wav]) - sf.write( - str(output_dir / (str(spk_id) + "_" + utt_id + ".wav")), - wav.numpy(), - samplerate=fastspeech2_config.fs) - print(f"{spk_id}_{utt_id} done!") + spk_ids = list(range(20)) + for spk_id in spk_ids: + for utt_id, sentence in sentences[:2]: + input_ids = frontend.get_input_ids(sentence, merge_sentences=True) + phone_ids = input_ids["phone_ids"] + flags = 0 + for part_phone_ids in phone_ids: + with paddle.no_grad(): + mel = fastspeech2_inference( + part_phone_ids, spk_id=paddle.to_tensor(spk_id)) + temp_wav = pwg_inference(mel) + if flags == 0: + wav = temp_wav + flags = 1 + else: + wav = paddle.concat([wav, temp_wav]) + sf.write( + str(output_dir / (str(spk_id) + "_" + utt_id + ".wav")), + wav.numpy(), + samplerate=fastspeech2_config.fs) + print(f"{spk_id}_{utt_id} done!") def main(): @@ -145,12 +146,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - 
"--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py index 9e29eea1..095d2082 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py @@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config): print("vocab_size:", vocab_size) with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) - print("num_speakers:", num_speakers) + spk_num = len(spk_id) + print("spk_num:", spk_num) odim = fastspeech2_config.n_mels model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( @@ -154,12 +154,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/normalize.py b/paddlespeech/t2s/exps/fastspeech2/normalize.py index 7283f6b4..8ec20ebf 100644 --- a/paddlespeech/t2s/exps/fastspeech2/normalize.py +++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py @@ -167,6 +167,10 @@ def main(): "pitch": str(pitch_path), "energy": str(energy_path) } + # add spk_emb for voice cloning + if "spk_emb" in item: + record["spk_emb"] = str(item["spk_emb"]) + output_metadata.append(record) output_metadata.sort(key=itemgetter('utt_id')) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index 3702ecd3..b874b3a7 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any], mel_extractor=None, pitch_extractor=None, energy_extractor=None, - cut_sil: bool=True): + cut_sil: bool=True, + spk_emb_dir: Path=None): utt_id = fp.stem # for vctk if utt_id.endswith("_mic2"): @@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any], "energy": str(energy_path), "speaker": speaker } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None return record @@ -127,13 +136,14 @@ def process_sentences(config, pitch_extractor=None, energy_extractor=None, nprocs: int=1, - cut_sil: 
bool=True): + cut_sil: bool=True, + spk_emb_dir: Path=None): if nprocs == 1: results = [] for fp in fps: record = process_sentence(config, fp, sentences, output_dir, mel_extractor, pitch_extractor, - energy_extractor, cut_sil) + energy_extractor, cut_sil, spk_emb_dir) if record: results.append(record) else: @@ -144,7 +154,7 @@ def process_sentences(config, future = pool.submit(process_sentence, config, fp, sentences, output_dir, mel_extractor, pitch_extractor, energy_extractor, - cut_sil) + cut_sil, spk_emb_dir) future.add_done_callback(lambda p: progress.update()) futures.append(future) @@ -202,6 +212,11 @@ def main(): default=True, help="whether cut sil in the edge of audio") + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding files.") args = parser.parse_args() rootdir = Path(args.rootdir).expanduser() @@ -211,6 +226,11 @@ def main(): dumpdir.mkdir(parents=True, exist_ok=True) dur_file = Path(args.dur_file).expanduser() + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + assert rootdir.is_dir() assert dur_file.is_file() @@ -251,6 +271,7 @@ def main(): test_wav_files += wav_files[-sub_num_dev:] else: train_wav_files += wav_files + elif args.dataset == "ljspeech": wav_files = sorted(list((rootdir / "wavs").rglob("*.wav"))) # split data into 3 sections @@ -317,7 +338,8 @@ def main(): pitch_extractor, energy_extractor, nprocs=args.num_cpu, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if dev_wav_files: process_sentences( config, @@ -327,7 +349,8 @@ def main(): mel_extractor, pitch_extractor, energy_extractor, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if test_wav_files: process_sentences( config, @@ -338,7 +361,8 @@ def main(): pitch_extractor, energy_extractor, nprocs=args.num_cpu, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if __name__ == "__main__": diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize.py b/paddlespeech/t2s/exps/fastspeech2/synthesize.py index 1beac5ce..249845e4 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py @@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config): fields = ["utt_id", "text"] + spk_num = None if args.speaker_dict is not None: print("multiple speaker fastspeech2!") with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) + spk_num = len(spk_id) fields += ["spk_id"] + elif args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] else: print("single speaker fastspeech2!") - num_speakers = None - print("num_speakers:", num_speakers) + print("spk_num:", spk_num) test_dataset = DataTable(data=test_metadata, fields=fields) @@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config): model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( @@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config): for datum in test_dataset: utt_id = datum["utt_id"] text = paddle.to_tensor(datum["text"]) - if "spk_id" in datum: + spk_emb = None + spk_id = None + if args.voice_cloning and "spk_emb" in datum: + spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) + elif "spk_id" in datum: spk_id = paddle.to_tensor(datum["spk_id"]) - else: - spk_id = None with paddle.no_grad(): - wav = 
pwg_inference(fastspeech2_inference(text, spk_id=spk_id)) + wav = pwg_inference( + fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb)) sf.write( str(output_dir / (utt_id + ".wav")), wav.numpy(), @@ -142,15 +148,28 @@ def main(): type=str, default=None, help="speaker id map file for multiple speaker model.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py index b6a8fc58..ff9a41ea 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py @@ -155,12 +155,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py index 7a55fbb1..6e3434a7 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py @@ -145,12 +145,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py index 92a43d5c..f0ff5655 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py @@ -160,12 +160,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, 
help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 5662d15d..fafded6f 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -43,7 +43,7 @@ from paddlespeech.t2s.training.trainer import Trainer def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -61,18 +61,24 @@ def train_sp(args, config): "text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy" ] + converters = {"speech": np.load, "pitch": np.load, "energy": np.load} + spk_num = None if args.speaker_dict is not None: print("multiple speaker fastspeech2!") collate_fn = fastspeech2_multi_spk_batch_fn with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) + spk_num = len(spk_id) fields += ["spk_id"] + elif args.voice_cloning: + print("Training voice cloning!") + collate_fn = fastspeech2_multi_spk_batch_fn + fields += ["spk_emb"] + converters["spk_emb"] = np.load else: print("single speaker fastspeech2!") collate_fn = fastspeech2_single_spk_batch_fn - num_speakers = None - print("num_speakers:", num_speakers) + print("spk_num:", spk_num) # dataloader has been too verbose logging.getLogger("DataLoader").disabled = True @@ -83,17 +89,13 @@ def train_sp(args, config): train_dataset = DataTable( data=train_metadata, fields=fields, - converters={"speech": np.load, - "pitch": np.load, - "energy": np.load}, ) + converters=converters, ) with jsonlines.open(args.dev_metadata, 'r') as reader: dev_metadata = list(reader) dev_dataset = DataTable( data=dev_metadata, fields=fields, - converters={"speech": np.load, - "pitch": np.load, - "energy": np.load}, ) + converters=converters, ) # collate function and dataloader @@ -127,10 +129,7 @@ def train_sp(args, config): odim = config.n_mels model = FastSpeech2( - idim=vocab_size, - odim=odim, - num_speakers=num_speakers, - **config["model"]) + idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"]) if world_size > 1: model = DataParallel(model) print("model done!") @@ -174,9 +173,7 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") @@ -186,9 +183,16 @@ def main(): default=None, help="speaker id map file for multiple speaker model.") + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice 
cloning model.") + args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) @@ -202,8 +206,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py b/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py new file mode 100644 index 00000000..9fbd4964 --- /dev/null +++ b/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py @@ -0,0 +1,208 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference +from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator +from paddlespeech.t2s.models.parallel_wavegan import PWGInference +from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder + + +def voice_cloning(args, fastspeech2_config, pwg_config): + # speaker encoder + p = SpeakerVerificationPreprocessor( + sampling_rate=16000, + audio_norm_target_dBFS=-30, + vad_window_length=30, + vad_moving_average_width=8, + vad_max_silence_length=6, + mel_window_length=25, + mel_window_step=10, + n_mels=40, + partial_n_frames=160, + min_pad_coverage=0.75, + partial_overlap_ratio=0.5) + print("Audio Processor Done!") + + speaker_encoder = LSTMSpeakerEncoder( + n_mels=40, num_layers=3, hidden_size=256, output_size=256) + speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path)) + speaker_encoder.eval() + print("GE2E Done!") + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + odim = fastspeech2_config.n_mels + model = FastSpeech2( + idim=vocab_size, odim=odim, **fastspeech2_config["model"]) + + model.set_state_dict( + paddle.load(args.fastspeech2_checkpoint)["main_params"]) + model.eval() + + vocoder = PWGGenerator(**pwg_config["generator_params"]) + vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"]) + vocoder.remove_weight_norm() + vocoder.eval() + print("model done!") + + frontend = Frontend(phone_vocab_path=args.phones_dict) + print("frontend done!") + + stat = np.load(args.fastspeech2_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + fastspeech2_normalizer = ZScore(mu, std) + + stat = 
np.load(args.pwg_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + pwg_normalizer = ZScore(mu, std) + + fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model) + fastspeech2_inference.eval() + pwg_inference = PWGInference(pwg_normalizer, vocoder) + pwg_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + input_dir = Path(args.input_dir) + + sentence = args.text + + input_ids = frontend.get_input_ids(sentence, merge_sentences=True) + phone_ids = input_ids["phone_ids"][0] + + for name in os.listdir(input_dir): + utt_id = name.split(".")[0] + ref_audio_path = input_dir / name + mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path)) + # print("mel_sequences: ", mel_sequences.shape) + with paddle.no_grad(): + spk_emb = speaker_encoder.embed_utterance( + paddle.to_tensor(mel_sequences)) + # print("spk_emb shape: ", spk_emb.shape) + + with paddle.no_grad(): + wav = pwg_inference( + fastspeech2_inference(phone_ids, spk_emb=spk_emb)) + + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=fastspeech2_config.fs) + print(f"{utt_id} done!") + # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb + random_spk_emb = np.random.rand(256) * 0.2 + random_spk_emb = paddle.to_tensor(random_spk_emb) + utt_id = "random_spk_emb" + with paddle.no_grad(): + wav = pwg_inference(fastspeech2_inference(phone_ids, spk_emb=spk_emb)) + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=fastspeech2_config.fs) + print(f"{utt_id} done!") + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--fastspeech2-config", type=str, help="fastspeech2 config file.") + parser.add_argument( + "--fastspeech2-checkpoint", + type=str, + help="fastspeech2 checkpoint to load.") + parser.add_argument( + "--fastspeech2-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training fastspeech2." + ) + parser.add_argument( + "--pwg-config", type=str, help="parallel wavegan config file.") + parser.add_argument( + "--pwg-checkpoint", + type=str, + help="parallel wavegan generator parameters to load.") + parser.add_argument( + "--pwg-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training parallel wavegan." 
+ ) + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + parser.add_argument( + "--text", + type=str, + default="每当你觉得,想要批评什么人的时候,你切要记着,这个世界上的人,并非都具备你禀有的条件。", + help="text to synthesize, a line") + + parser.add_argument( + "--ge2e_params_path", type=str, help="ge2e params path.") + + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + + parser.add_argument( + "--input-dir", + type=str, + help="input dir of *.wav, the sample rate will be resample to 16k.") + parser.add_argument("--output-dir", type=str, help="output dir.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.fastspeech2_config) as f: + fastspeech2_config = CfgNode(yaml.safe_load(f)) + with open(args.pwg_config) as f: + pwg_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(fastspeech2_config) + print(pwg_config) + + voice_cloning(args, fastspeech2_config, pwg_config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py index 720b08ce..988d4590 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py @@ -30,14 +30,14 @@ from paddlespeech.t2s.models.melgan import MelGANGenerator def main(): parser = argparse.ArgumentParser( - description="Synthesize with parallel wavegan.") + description="Synthesize with multi band melgan.") parser.add_argument( - "--config", type=str, help="parallel wavegan config file.") + "--config", type=str, help="multi band melgan config file.") parser.add_argument("--checkpoint", type=str, help="snapshot to load.") parser.add_argument("--test-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device to run.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() @@ -53,7 +53,12 @@ def main(): f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" ) - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") generator = MelGANGenerator(**config["generator_params"]) state_dict = paddle.load(args.checkpoint) generator.set_state_dict(state_dict["generator_params"]) diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 45704607..ca3c0a1f 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -50,7 +50,7 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -238,14 +238,10 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") 
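The new `voice_cloning.py` above chains three components: a GE2E `LSTMSpeakerEncoder` turns a reference wav into a fixed 256-dim utterance embedding, FastSpeech2 consumes that embedding via `spk_emb` instead of a `spk_id`, and Parallel WaveGAN vocodes the resulting mel. A condensed sketch of the per-utterance loop, assuming the preprocessor, speaker encoder and the two inference wrappers have been constructed exactly as in that script:

```python
import paddle


def clone_one_utterance(ref_wav_path, phone_ids, p, speaker_encoder,
                        fastspeech2_inference, pwg_inference):
    """Synthesize `phone_ids` in the voice of the speaker in `ref_wav_path`.

    p, speaker_encoder, fastspeech2_inference and pwg_inference are assumed to
    be the SpeakerVerificationPreprocessor, LSTMSpeakerEncoder,
    FastSpeech2Inference and PWGInference objects built in voice_cloning.py.
    """
    # 1. reference audio -> partial mel sequences -> utterance-level embedding
    mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_wav_path))
    with paddle.no_grad():
        spk_emb = speaker_encoder.embed_utterance(paddle.to_tensor(mel_sequences))

    # 2. acoustic model conditioned on spk_emb, then the vocoder
    with paddle.no_grad():
        mel = fastspeech2_inference(phone_ids, spk_emb=spk_emb)
        wav = pwg_inference(mel)
    return wav.numpy()
```

One thing worth flagging in that script: the final `random_spk_emb` branch builds a random embedding but appears to still pass the `spk_emb` from the last loop iteration to `fastspeech2_inference`, so the `random_spk_emb.wav` output is not actually driven by the random vector.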
parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) @@ -259,8 +255,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py index ce90aaf4..f275ed44 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py @@ -37,7 +37,7 @@ def main(): parser.add_argument("--test-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device to run.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() @@ -53,7 +53,12 @@ def main(): f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" ) - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") generator = PWGGenerator(**config["generator_params"]) state_dict = paddle.load(args.checkpoint) generator.set_state_dict(state_dict["generator_params"]) diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index a04a547e..ca2e3f55 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -92,12 +92,17 @@ def main(): parser.add_argument("--input-dir", type=str, help="input dir of wavs.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device to run.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 99801267..42ef8830 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -47,7 +47,7 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) 
or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -215,9 +215,7 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") benchmark_group = parser.add_argument_group( @@ -241,8 +239,6 @@ def main(): ) args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) @@ -261,8 +257,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 852b0c91..782fbdf2 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -226,8 +226,22 @@ def main(): test_wav_files += wav_files[-sub_num_dev:] else: train_wav_files += wav_files + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files else: - print("dataset should in {baker, ljspeech, vctk} now!") + print("dataset should in {baker, ljspeech, vctk, aishell3} now!") train_dump_dir = dumpdir / "train" / "raw" train_dump_dir.mkdir(parents=True, exist_ok=True) diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py index 49ce37f2..617848c5 100644 --- a/paddlespeech/t2s/exps/speedyspeech/inference.py +++ b/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -96,10 +96,10 @@ def main(): input_ids = frontend.get_input_ids( sentence, merge_sentences=True, get_tone_ids=True) - phone_ids = input_ids["phone_ids"].numpy() - tone_ids = input_ids["tone_ids"].numpy() - phones = phone_ids[0] - tones = tone_ids[0] + phone_ids = input_ids["phone_ids"] + tone_ids = input_ids["tone_ids"] + phones = phone_ids[0].numpy() + tones = tone_ids[0].numpy() if args.enable_auto_log: logger.times.stamp() diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize.py b/paddlespeech/t2s/exps/speedyspeech/synthesize.py index 4482c179..67d56ea5 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize.py @@ -155,12 +155,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose") args, _ = parser.parse_known_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif 
args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.speedyspeech_config) as f: speedyspeech_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 0870d466..0e64088d 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -170,13 +170,18 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir") parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") - parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") parser.add_argument("--verbose", type=int, default=1, help="verbose") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") args, _ = parser.parse_known_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.speedyspeech_config) as f: speedyspeech_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 772a39d7..d9a2fbf4 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -43,7 +43,7 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -167,9 +167,7 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") def str2bool(str): @@ -189,8 +187,7 @@ def main(): # 这里可以多传入 max_epoch 等 args, rest = parser.parse_known_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") + with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) @@ -212,8 +209,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/tacotron2/synthesize.py b/paddlespeech/t2s/exps/tacotron2/synthesize.py index 613fec02..c73c32d2 100644 --- a/paddlespeech/t2s/exps/tacotron2/synthesize.py +++ b/paddlespeech/t2s/exps/tacotron2/synthesize.py @@ -25,7 +25,12 @@ from paddlespeech.t2s.utils import display def main(config, args): - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") # model frontend = EnglishCharacter() @@ -77,7 +82,7 @@ if __name__ == "__main__": parser.add_argument("--input", type=str, help="path of the text sentences") parser.add_argument("--output", type=str, help="path to save outputs") parser.add_argument( - "--device", type=str, default="cpu", help="device 
type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument( "--opts", nargs=argparse.REMAINDER, diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py index a5f08360..8198348f 100644 --- a/paddlespeech/t2s/exps/tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -199,8 +199,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize.py b/paddlespeech/t2s/exps/transformer_tts/synthesize.py index 82fd8f15..666c3b72 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize.py @@ -117,12 +117,17 @@ def main(): parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.transformer_tts_config) as f: transformer_tts_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py index 993749f0..ba197f43 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py @@ -136,12 +136,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.transformer_tts_config) as f: transformer_tts_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 7d9020a3..163339f4 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -42,7 +42,7 @@ from paddlespeech.t2s.training.trainer import Trainer def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -164,16 +164,12 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, 
use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) @@ -187,8 +183,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py index ceae1360..8af1d45e 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py +++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py @@ -241,8 +241,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py index c76ce007..4e6b8d36 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py @@ -20,14 +20,14 @@ import paddle import soundfile as sf from matplotlib import pyplot as plt -from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence -from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder from paddlespeech.t2s.models.tacotron2 import Tacotron2 from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow from paddlespeech.t2s.utils import display +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder def voice_cloning(args): @@ -140,8 +140,9 @@ def main(): "--tacotron2_params_path", type=str, help="tacotron2 params path.") parser.add_argument( "--waveflow_params_path", type=str, help="waveflow params path.") + parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") parser.add_argument( "--input-dir", @@ -151,7 +152,12 @@ def main(): args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") voice_cloning(args) diff --git a/paddlespeech/t2s/exps/waveflow/synthesize.py b/paddlespeech/t2s/exps/waveflow/synthesize.py index 4f07aa4e..53715b01 100644 --- a/paddlespeech/t2s/exps/waveflow/synthesize.py +++ b/paddlespeech/t2s/exps/waveflow/synthesize.py @@ -25,7 +25,13 @@ from paddlespeech.t2s.utils import layer_tools def main(config, args): - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + model = 
ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path) layer_tools.recursively_remove_weight_norm(model) model.eval() @@ -60,7 +66,7 @@ if __name__ == "__main__": help="path of directory containing mel spectrogram (in .npy format)") parser.add_argument("--output", type=str, help="path to save outputs") parser.add_argument( - "--device", type=str, default="cpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") parser.add_argument( "--opts", nargs=argparse.REMAINDER, diff --git a/paddlespeech/t2s/exps/waveflow/train.py b/paddlespeech/t2s/exps/waveflow/train.py index 9d1df13c..d500336a 100644 --- a/paddlespeech/t2s/exps/waveflow/train.py +++ b/paddlespeech/t2s/exps/waveflow/train.py @@ -139,8 +139,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/frontend/pinyin.py b/paddlespeech/t2s/frontend/pinyin.py deleted file mode 100644 index f99129ce..00000000 --- a/paddlespeech/t2s/frontend/pinyin.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A Simple Chinese Phonology using pinyin symbols. -The G2P conversion converts pinyin string to symbols. Also it can handle string -in Chinese chracters, but due to the complexity of chinese G2P, we can leave -text -> pinyin to other part of a TTS system. Other NLP techniques may be used -(e.g. tokenization, tagging, NER...) 
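Editorial note on the hunks above: the `--device`/`--nprocs` pair is replaced by a single `--ngpu` flag in every entry point, and each one repeats the same three-way branch to pick a device. A minimal sketch of that mapping as one helper, under the assumption that a negative value should fail loudly rather than only print a warning (the helper name is illustrative, not part of this patch):

import paddle

def set_device_from_ngpu(ngpu: int) -> None:
    # --ngpu semantics introduced by this patch: 0 selects CPU, a positive
    # value selects GPU, anything else is invalid.
    if ngpu == 0:
        paddle.set_device("cpu")
    elif ngpu > 0:
        paddle.set_device("gpu")
    else:
        raise ValueError("ngpu should be >= 0")
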
-""" -import re -from itertools import product - -from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin -from pypinyin.core import DefaultConverter -from pypinyin.core import Pinyin -from pypinyin.core import Style - -from paddlespeech.t2s.frontend.phonectic import Phonetics -from paddlespeech.t2s.frontend.vocab import Vocab - -_punctuations = [',', '。', '?', '!'] -_initials = [ - 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x', 'zh', - 'ch', 'sh', 'r', 'z', 'c', 's' -] -_finals = [ - 'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', - 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', - 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng', - 'v', 've', 'van', 'ven', 'veng' -] -_ernized_symbol = ['&r'] -_phones = _initials + _finals + _ernized_symbol + _punctuations -_tones = ['0', '1', '2', '3', '4', '5'] - -_toned_finals = [final + tone for final, tone in product(_finals, _tones[1:])] -_toned_phonems = _initials + _toned_finals + _ernized_symbol + _punctuations - - -class ParakeetConverter(NeutralToneWith5Mixin, DefaultConverter): - pass - - -class ParakeetPinyin(Phonetics): - def __init__(self): - self.vocab_phonemes = Vocab(_phones) - self.vocab_tones = Vocab(_tones) - self.pinyin_backend = Pinyin(ParakeetConverter()) - - def convert_pypinyin_tone3(self, syllables, add_start_end=False): - phonemes, tones = _convert_to_parakeet_style_pinyin(syllables) - - if add_start_end: - start = self.vocab_phonemes.start_symbol - end = self.vocab_phonemes.end_symbol - phonemes = [start] + phonemes + [end] - - start = self.vocab_tones.start_symbol - end = self.vocab_tones.end_symbol - phonemes = [start] + tones + [end] - - phonemes = [ - item for item in phonemes if item in self.vocab_phonemes.stoi - ] - tones = [item for item in tones if item in self.vocab_tones.stoi] - return phonemes, tones - - def phoneticize(self, sentence, add_start_end=False): - """ Normalize the input text sequence and convert it into pronunciation sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. - """ - syllables = self.pinyin_backend.lazy_pinyin( - sentence, style=Style.TONE3, strict=True) - phonemes, tones = self.convert_pypinyin_tone3( - syllables, add_start_end=add_start_end) - return phonemes, tones - - def numericalize(self, phonemes, tones): - """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - - Returns - ---------- - List[int] - The list of pronunciation id sequence. - """ - phoneme_ids = [self.vocab_phonemes.lookup(item) for item in phonemes] - tone_ids = [self.vocab_tones.lookup(item) for item in tones] - return phoneme_ids, tone_ids - - def __call__(self, sentence, add_start_end=False): - """ Convert the input text sequence into pronunciation id sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. - """ - phonemes, tones = self.phoneticize( - sentence, add_start_end=add_start_end) - phoneme_ids, tone_ids = self.numericalize(phonemes, tones) - return phoneme_ids, tone_ids - - @property - def vocab_size(self): - """ Vocab size. 
- """ - # 70 = 62 phones + 4 punctuations + 4 special tokens - return len(self.vocab_phonemes) - - @property - def tone_vocab_size(self): - # 10 = 1 non tone + 5 tone + 4 special tokens - return len(self.vocab_tones) - - -class ParakeetPinyinWithTone(Phonetics): - def __init__(self): - self.vocab = Vocab(_toned_phonems) - self.pinyin_backend = Pinyin(ParakeetConverter()) - - def convert_pypinyin_tone3(self, syllables, add_start_end=False): - phonemes = _convert_to_parakeet_style_pinyin_with_tone(syllables) - - if add_start_end: - start = self.vocab_phonemes.start_symbol - end = self.vocab_phonemes.end_symbol - phonemes = [start] + phonemes + [end] - - phonemes = [item for item in phonemes if item in self.vocab.stoi] - return phonemes - - def phoneticize(self, sentence, add_start_end=False): - """ Normalize the input text sequence and convert it into pronunciation sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. - """ - syllables = self.pinyin_backend.lazy_pinyin( - sentence, style=Style.TONE3, strict=True) - phonemes = self.convert_pypinyin_tone3( - syllables, add_start_end=add_start_end) - return phonemes - - def numericalize(self, phonemes): - """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - - Returns - ---------- - List[int] - The list of pronunciation id sequence. - """ - phoneme_ids = [self.vocab.lookup(item) for item in phonemes] - return phoneme_ids - - def __call__(self, sentence, add_start_end=False): - """ Convert the input text sequence into pronunciation id sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. - """ - phonemes = self.phoneticize(sentence, add_start_end=add_start_end) - phoneme_ids = self.numericalize(phonemes) - return phoneme_ids - - @property - def vocab_size(self): - """ Vocab size. 
- """ - # 230 = 222 phones + 4 punctuations + 4 special tokens - return len(self.vocab) - - -def _convert_to_parakeet_convension(syllable): - # from pypinyin.Style.TONE3 to parakeet convension - tone = syllable[-1] - syllable = syllable[:-1] - - # expansion of o -> uo - syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable) - - # expansion for iong, ong - syllable = syllable.replace("iong", "veng").replace("ong", "ueng") - - # expansion for ing, in - syllable = syllable.replace("ing", "ieng").replace("in", "ien") - - # expansion for un, ui, iu - syllable = syllable.replace("un", "uen") \ - .replace("ui", "uei") \ - .replace("iu", "iou") - - # rule for variants of i - syllable = syllable.replace("zi", "zii") \ - .replace("ci", "cii") \ - .replace("si", "sii") \ - .replace("zhi", "zhiii") \ - .replace("chi", "chiii") \ - .replace("shi", "shiii") \ - .replace("ri", "riii") - - # rule for y preceding i, u - syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i") - - # rule for w - syllable = syllable.replace("wu", "u").replace("w", "u") - - # rule for v following j, q, x - syllable = syllable.replace("ju", "jv") \ - .replace("qu", "qv") \ - .replace("xu", "xv") - - return syllable + tone - - -def _split_syllable(syllable: str): - global _punctuations - - if syllable in _punctuations: - # syllables, tones - return [syllable], ['0'] - - syllable = _convert_to_parakeet_convension(syllable) - - tone = syllable[-1] - syllable = syllable[:-1] - - phones = [] - tones = [] - - global _initials - if syllable[:2] in _initials: - phones.append(syllable[:2]) - tones.append('0') - phones.append(syllable[2:]) - tones.append(tone) - elif syllable[0] in _initials: - phones.append(syllable[0]) - tones.append('0') - phones.append(syllable[1:]) - tones.append(tone) - else: - phones.append(syllable) - tones.append(tone) - return phones, tones - - -def _convert_to_parakeet_style_pinyin(syllables): - phones, tones = [], [] - for syllable in syllables: - p, t = _split_syllable(syllable) - phones.extend(p) - tones.extend(t) - return phones, tones - - -def _split_syllable_with_tone(syllable: str): - global _punctuations - - if syllable in _punctuations: - # syllables - return [syllable] - - syllable = _convert_to_parakeet_convension(syllable) - - phones = [] - - global _initials - if syllable[:2] in _initials: - phones.append(syllable[:2]) - phones.append(syllable[2:]) - elif syllable[0] in _initials: - phones.append(syllable[0]) - phones.append(syllable[1:]) - else: - phones.append(syllable) - return phones - - -def _convert_to_parakeet_style_pinyin_with_tone(syllables): - phones = [] - for syllable in syllables: - p = _split_syllable_with_tone(syllable) - phones.extend(p) - return phones diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 33bf5ab2..d49c0937 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -219,18 +219,45 @@ class Frontend(): def get_phonemes(self, sentence: str, merge_sentences: bool=True, - with_erhua: bool=True) -> List[List[str]]: + with_erhua: bool=True, + robot: bool=False, + print_info: bool=False) -> List[List[str]]: sentences = self.text_normalizer.normalize(sentence) phonemes = self._g2p( sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) + # change all tones to `1` + if robot: + new_phonemes = [] + for sentence in phonemes: + new_sentence = [] + for item in sentence: + # `er` only have tone `2` + if item[-1] in "12345" and item != "er2": + item = 
item[:-1] + "1" + new_sentence.append(item) + new_phonemes.append(new_sentence) + phonemes = new_phonemes + if print_info: + print("----------------------------") + print("text norm results:") + print(sentences) + print("----------------------------") + print("g2p results:") + print(phonemes) + print("----------------------------") return phonemes - def get_input_ids( - self, - sentence: str, - merge_sentences: bool=True, - get_tone_ids: bool=False) -> Dict[str, List[paddle.Tensor]]: - phonemes = self.get_phonemes(sentence, merge_sentences=merge_sentences) + def get_input_ids(self, + sentence: str, + merge_sentences: bool=True, + get_tone_ids: bool=False, + robot: bool=False, + print_info: bool=False) -> Dict[str, List[paddle.Tensor]]: + phonemes = self.get_phonemes( + sentence, + merge_sentences=merge_sentences, + print_info=print_info, + robot=robot) result = {} phones = [] tones = [] diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 5942533a..cf957978 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -16,7 +16,9 @@ from typing import Dict from typing import Sequence from typing import Tuple +from typing import Union +import numpy as np import paddle import paddle.nn.functional as F from paddle import nn @@ -96,7 +98,7 @@ class FastSpeech2(nn.Layer): pitch_embed_dropout: float=0.5, stop_gradient_from_pitch_predictor: bool=False, # spk emb - num_speakers: int=None, + spk_num: int=None, spk_embed_dim: int=None, spk_embed_integration_type: str="add", # tone emb @@ -146,9 +148,9 @@ class FastSpeech2(nn.Layer): # initialize parameters initialize(self, init_type) - if self.spk_embed_dim is not None: + if spk_num and self.spk_embed_dim: self.spk_embedding_table = nn.Embedding( - num_embeddings=num_speakers, + num_embeddings=spk_num, embedding_dim=self.spk_embed_dim, padding_idx=self.padding_idx) @@ -297,7 +299,7 @@ class FastSpeech2(nn.Layer): pitch: paddle.Tensor, energy: paddle.Tensor, tone_id: paddle.Tensor=None, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, spk_id: paddle.Tensor=None ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. @@ -320,7 +322,7 @@ class FastSpeech2(nn.Layer): Batch of padded token-averaged energy (B, Tmax, 1). tone_id : Tensor, optional(int64) Batch of padded tone ids (B, Tmax). - spembs : Tensor, optional + spk_emb : Tensor, optional Batch of speaker embeddings (B, spk_embed_dim). 
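The `robot` switch added to `Frontend.get_phonemes` above flattens every tone to `1`, leaving `er2` untouched because erhua only carries tone 2. The same transform as a standalone sketch, assuming the frontend's usual `initial/final+tone` phoneme strings (the function name is illustrative):

from typing import List

def flatten_tones(phonemes: List[List[str]]) -> List[List[str]]:
    # Force a flat, robot-like prosody by rewriting e.g. "ia3" -> "ia1";
    # "er2" is the one toned phoneme left unchanged.
    flattened = []
    for sentence in phonemes:
        new_sentence = []
        for item in sentence:
            if item[-1] in "12345" and item != "er2":
                item = item[:-1] + "1"
            new_sentence.append(item)
        flattened.append(new_sentence)
    return flattened
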
spk_id : Tnesor, optional(int64) Batch of speaker ids (B,) @@ -364,7 +366,7 @@ class FastSpeech2(nn.Layer): ps, es, is_inference=False, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) # modify mod part of groundtruth @@ -385,7 +387,7 @@ class FastSpeech2(nn.Layer): es: paddle.Tensor=None, is_inference: bool=False, alpha: float=1.0, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None) -> Sequence[paddle.Tensor]: # forward encoder @@ -395,11 +397,12 @@ class FastSpeech2(nn.Layer): # integrate speaker embedding if self.spk_embed_dim is not None: - if spembs is not None: - hs = self._integrate_with_spk_embed(hs, spembs) + # spk_emb has a higher priority than spk_id + if spk_emb is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) elif spk_id is not None: - spembs = self.spk_embedding_table(spk_id) - hs = self._integrate_with_spk_embed(hs, spembs) + spk_emb = self.spk_embedding_table(spk_id) + hs = self._integrate_with_spk_embed(hs, spk_emb) # integrate tone embedding if self.tone_embed_dim is not None: @@ -487,7 +490,7 @@ class FastSpeech2(nn.Layer): energy: paddle.Tensor=None, alpha: float=1.0, use_teacher_forcing: bool=False, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: @@ -510,12 +513,12 @@ class FastSpeech2(nn.Layer): use_teacher_forcing : bool, optional Whether to use teacher forcing. If true, groundtruth of duration, pitch and energy will be used. - spembs : Tensor, optional + spk_emb : Tensor, optional peaker embedding vector (spk_embed_dim,). spk_id : Tensor, optional(int64) - Speaker embedding vector (spk_embed_dim). + Batch of padded spk ids (1,). tone_id : Tensor, optional(int64) - Batch of padded tone ids (B, Tmax). + Batch of padded tone ids (T,). Returns ---------- @@ -525,10 +528,7 @@ class FastSpeech2(nn.Layer): # input of embedding must be int64 x = paddle.cast(text, 'int64') y = speech - spemb = spembs - if durations is not None: - d = paddle.cast(durations, 'int64') - p, e = pitch, energy + d, p, e = durations, pitch, energy # setup batch axis ilens = paddle.shape(x)[0] @@ -537,51 +537,51 @@ class FastSpeech2(nn.Layer): if y is not None: ys = y.unsqueeze(0) - if spemb is not None: - spembs = spemb.unsqueeze(0) - else: - spembs = None + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + if tone_id is not None: + tone_id = tone_id.unsqueeze(0) if use_teacher_forcing: # use groundtruth of duration, pitch, and energy ds = d.unsqueeze(0) if d is not None else None ps = p.unsqueeze(0) if p is not None else None es = e.unsqueeze(0) if e is not None else None - # ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0) + # (1, L, odim) - _, outs, d_outs, *_ = self._forward( + _, outs, d_outs, p_outs, e_outs = self._forward( xs, ilens, ys, ds=ds, ps=ps, es=es, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id, is_inference=True) else: # (1, L, odim) - _, outs, d_outs, *_ = self._forward( + _, outs, d_outs, p_outs, e_outs = self._forward( xs, ilens, ys, is_inference=True, alpha=alpha, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) + return outs[0], d_outs[0], p_outs[0], e_outs[0] - return outs[0] - - def _integrate_with_spk_embed(self, hs, spembs): + def _integrate_with_spk_embed(self, hs, spk_emb): """Integrate speaker embedding with hidden states. Parameters ---------- hs : Tensor Batch of hidden state sequences (B, Tmax, adim). - spembs : Tensor + spk_emb : Tensor Batch of speaker embeddings (B, spk_embed_dim). 
Returns @@ -591,13 +591,13 @@ class FastSpeech2(nn.Layer): """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states - spembs = self.spk_projection(F.normalize(spembs)) - hs = hs + spembs.unsqueeze(1) + spk_emb = self.spk_projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection - spembs = F.normalize(spembs).unsqueeze(1).expand( + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( shape=[-1, hs.shape[1], -1]) - hs = self.spk_projection(paddle.concat([hs, spembs], axis=-1)) + hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") @@ -682,8 +682,132 @@ class FastSpeech2Inference(nn.Layer): self.normalizer = normalizer self.acoustic_model = model - def forward(self, text, spk_id=None): - normalized_mel = self.acoustic_model.inference(text, spk_id=spk_id) + def forward(self, text, spk_id=None, spk_emb=None): + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, spk_id=spk_id, spk_emb=spk_emb) + logmel = self.normalizer.inverse(normalized_mel) + return logmel + + +class StyleFastSpeech2Inference(FastSpeech2Inference): + def __init__(self, + normalizer, + model, + pitch_stats_path=None, + energy_stats_path=None): + super().__init__(normalizer, model) + if pitch_stats_path: + pitch_mean, pitch_std = np.load(pitch_stats_path) + self.pitch_mean = paddle.to_tensor(pitch_mean) + self.pitch_std = paddle.to_tensor(pitch_std) + if energy_stats_path: + energy_mean, energy_std = np.load(energy_stats_path) + self.energy_mean = paddle.to_tensor(energy_mean) + self.energy_std = paddle.to_tensor(energy_std) + + def denorm(self, data, mean, std): + return data * std + mean + + def norm(self, data, mean, std): + return (data - mean) / std + + def forward(self, + text: paddle.Tensor, + durations: Union[paddle.Tensor, np.ndarray]=None, + durations_scale: Union[int, float]=None, + durations_bias: Union[int, float]=None, + pitch: Union[paddle.Tensor, np.ndarray]=None, + pitch_scale: Union[int, float]=None, + pitch_bias: Union[int, float]=None, + energy: Union[paddle.Tensor, np.ndarray]=None, + energy_scale: Union[int, float]=None, + energy_bias: Union[int, float]=None, + robot: bool=False): + """ + Parameters + ---------- + text : Tensor(int64) + Input sequence of characters (T,). + speech : Tensor, optional + Feature sequence to extract style (N, idim). + durations : paddle.Tensor/np.ndarray, optional (int64) + Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias + durations_scale: int/float, optional + durations_bias: int/float, optional + pitch : paddle.Tensor/np.ndarray, optional + Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias + pitch_scale: int/float, optional + In denormed HZ domain. + pitch_bias: int/float, optional + In denormed HZ domain. + energy : paddle.Tensor/np.ndarray, optional + Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias + energy_scale: int/float, optional + In denormed domain. + energy_bias: int/float, optional + In denormed domain. + robot : bool, optional + Weather output robot style + Returns + ---------- + Tensor + Output sequence of features (L, odim). 
+ """ + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, durations=None, pitch=None, energy=None) + # priority: groundtruth > scale/bias > previous output + # set durations + if isinstance(durations, np.ndarray): + durations = paddle.to_tensor(durations) + elif isinstance(durations, paddle.Tensor): + durations = durations + elif durations_scale or durations_bias: + durations_scale = durations_scale if durations_scale is not None else 1 + durations_bias = durations_bias if durations_bias is not None else 0 + durations = durations_scale * d_outs + durations_bias + else: + durations = d_outs + + if robot: + # set normed pitch to zeros have the same effect with set denormd ones to mean + pitch = paddle.zeros(p_outs.shape) + + # set pitch, can overwrite robot set + if isinstance(pitch, np.ndarray): + pitch = paddle.to_tensor(pitch) + elif isinstance(pitch, paddle.Tensor): + pitch = pitch + elif pitch_scale or pitch_bias: + pitch_scale = pitch_scale if pitch_scale is not None else 1 + pitch_bias = pitch_bias if pitch_bias is not None else 0 + p_Hz = paddle.exp( + self.denorm(p_outs, self.pitch_mean, self.pitch_std)) + p_HZ = pitch_scale * p_Hz + pitch_bias + pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std) + else: + pitch = p_outs + + # set energy + if isinstance(energy, np.ndarray): + energy = paddle.to_tensor(energy) + elif isinstance(energy, paddle.Tensor): + energy = energy + elif energy_scale or energy_bias: + energy_scale = energy_scale if energy_scale is not None else 1 + energy_bias = energy_bias if energy_bias is not None else 0 + e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std) + e_dnorm = energy_scale * e_dnorm + energy_bias + energy = self.norm(e_dnorm, self.energy_mean, self.energy_std) + else: + energy = e_outs + + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, + durations=durations, + pitch=pitch, + energy=energy, + use_teacher_forcing=True) + logmel = self.normalizer.inverse(normalized_mel) return logmel diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index 4297c8b6..0dabf934 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + # No explicit speaker identifier labels are used during voice cloning training. 
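`StyleFastSpeech2Inference.forward` above resolves each control in the order ground truth > scale/bias > the model's own prediction, with `pitch_scale`/`pitch_bias` applied in the denormalized Hz domain (so pitch and energy stats must have been passed at construction). A usage sketch; `inference` and `phone_ids` are placeholders for an already-built wrapper and an input phone-id tensor:

def synthesize_with_style(inference, phone_ids):
    # 1.2x duration (slower speech) and pitch raised by 40 Hz; energy falls
    # back to the model's own prediction because no energy control is given.
    slower_higher = inference(phone_ids, durations_scale=1.2, pitch_bias=40)
    # Monotone "robot" style: predicted durations/energy, pitch flattened to its mean.
    robot = inference(phone_ids, robot=True)
    return slower_higher, robot
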
+ if spk_emb is not None: + spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater): durations=batch["durations"], pitch=batch["pitch"], energy=batch["energy"], - spk_id=spk_id) + spk_id=spk_id, + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, @@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator): durations=batch["durations"], pitch=batch["pitch"], energy=batch["energy"], - spk_id=spk_id) + spk_id=spk_id, + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 97233c76..5958a166 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -391,7 +391,7 @@ class TransformerTTS(nn.Layer): text_lengths: paddle.Tensor, speech: paddle.Tensor, speech_lengths: paddle.Tensor, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. @@ -405,7 +405,7 @@ class TransformerTTS(nn.Layer): Batch of padded target features (B, Lmax, odim). speech_lengths : Tensor(int64) Batch of the lengths of each target (B,). - spembs : Tensor, optional + spk_emb : Tensor, optional Batch of speaker embeddings (B, spk_embed_dim). Returns @@ -439,7 +439,7 @@ class TransformerTTS(nn.Layer): # calculate transformer outputs after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, - spembs) + spk_emb) # modifiy mod part of groundtruth @@ -467,7 +467,7 @@ class TransformerTTS(nn.Layer): ilens: paddle.Tensor, ys: paddle.Tensor, olens: paddle.Tensor, - spembs: paddle.Tensor, + spk_emb: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: # forward encoder x_masks = self._source_mask(ilens) @@ -480,7 +480,7 @@ class TransformerTTS(nn.Layer): # integrate speaker embedding if self.spk_embed_dim is not None: - hs = self._integrate_with_spk_embed(hs, spembs) + hs = self._integrate_with_spk_embed(hs, spk_emb) # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim) if self.reduction_factor > 1: @@ -514,7 +514,7 @@ class TransformerTTS(nn.Layer): self, text: paddle.Tensor, speech: paddle.Tensor=None, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, threshold: float=0.5, minlenratio: float=0.0, maxlenratio: float=10.0, @@ -528,7 +528,7 @@ class TransformerTTS(nn.Layer): Input sequence of characters (T,). speech : Tensor, optional Feature sequence to extract style (N, idim). - spembs : Tensor, optional + spk_emb : Tensor, optional Speaker embedding vector (spk_embed_dim,). threshold : float, optional Threshold in inference. 
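With the updater changes above, a batch that carries a `spk_emb` (d-vector) drops its `spk_id`, so the two speaker-conditioning paths never reach the model together; this mirrors the `spk_emb`-over-`spk_id` priority inside `FastSpeech2._forward`. A minimal sketch of that convention (the helper name is illustrative):

def resolve_speaker_condition(batch: dict):
    # A speaker embedding, when present, takes precedence over a speaker-id
    # label; at most one of the two is forwarded to the acoustic model.
    spk_id = batch.get("spk_id")
    spk_emb = batch.get("spk_emb")
    if spk_emb is not None:
        spk_id = None
    return spk_id, spk_emb
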
@@ -551,7 +551,6 @@ class TransformerTTS(nn.Layer): """ # input of embedding must be int64 y = speech - spemb = spembs # add eos at the last of sequence text = numpy.pad( @@ -564,12 +563,12 @@ class TransformerTTS(nn.Layer): # get teacher forcing outputs xs, ys = x.unsqueeze(0), y.unsqueeze(0) - spembs = None if spemb is None else spemb.unsqueeze(0) + spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0) ilens = paddle.to_tensor( [xs.shape[1]], dtype=paddle.int64, place=xs.place) olens = paddle.to_tensor( [ys.shape[1]], dtype=paddle.int64, place=ys.place) - outs, *_ = self._forward(xs, ilens, ys, olens, spembs) + outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb) # get attention weights att_ws = [] @@ -590,9 +589,9 @@ class TransformerTTS(nn.Layer): hs = hs + style_embs.unsqueeze(1) # integrate speaker embedding - if self.spk_embed_dim is not None: - spembs = spemb.unsqueeze(0) - hs = self._integrate_with_spk_embed(hs, spembs) + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + hs = self._integrate_with_spk_embed(hs, spk_emb) # set limits of length maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor) @@ -726,14 +725,14 @@ class TransformerTTS(nn.Layer): def _integrate_with_spk_embed(self, hs: paddle.Tensor, - spembs: paddle.Tensor) -> paddle.Tensor: + spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. Parameters ---------- hs : Tensor Batch of hidden state sequences (B, Tmax, adim). - spembs : Tensor + spk_emb : Tensor Batch of speaker embeddings (B, spk_embed_dim). Returns @@ -744,13 +743,13 @@ class TransformerTTS(nn.Layer): """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states - spembs = self.projection(F.normalize(spembs)) - hs = hs + spembs.unsqueeze(1) + spk_emb = self.projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection - spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1], - -1) - hs = self.projection(paddle.concat([hs, spembs], axis=-1)) + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1], + -1) + hs = self.projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py b/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py index 62d7a96f..f91c76b7 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py +++ b/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# Modified from espnet(https://github.com/espnet/espnet) -import logging - from paddle import nn from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention @@ -122,7 +120,6 @@ class Encoder(nn.Layer): "rel_selfattn", "legacy_rel_selfattn", ]: - logging.info("encoder self-attention layer type = self-attention") encoder_selfattn_layer = MultiHeadedAttention encoder_selfattn_layer_args = [ (attention_heads, attention_dim, attention_dropout_rate, ) diff --git a/paddlespeech/t2s/training/cli.py b/paddlespeech/t2s/training/cli.py index 3b9fd42e..a0710fd7 100644 --- a/paddlespeech/t2s/training/cli.py +++ b/paddlespeech/t2s/training/cli.py @@ -53,8 +53,7 @@ def default_argument_parser(): parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load") # running - parser.add_argument("--device", type=str, choices=["cpu", "gpu"], help="device type to use, cpu and gpu are supported.") - parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.") + parser.add_argument("--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") # overwrite extra config and default config parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py index 7a6a7e99..c9e7f4cc 100644 --- a/paddlespeech/t2s/training/experiment.py +++ b/paddlespeech/t2s/training/experiment.py @@ -107,7 +107,12 @@ class ExperimentBase(object): def setup(self): """Setup the experiment. """ - paddle.set_device(self.args.device) + if self.args.ngpu == 0: + paddle.set_device("cpu") + elif self.args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") if self.parallel: self.init_parallel() @@ -128,7 +133,7 @@ class ExperimentBase(object): """A flag indicating whether the experiment should run with multiprocessing. """ - return self.args.device == "gpu" and self.args.nprocs > 1 + return self.args.ngpu > 1 def init_parallel(self): """Init environment for multiprocess training. diff --git a/paddlespeech/vector/__init__.py b/paddlespeech/vector/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/paddlespeech/vector/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/exps/ge2e/__init__.py b/paddlespeech/vector/exps/__init__.py similarity index 100% rename from paddlespeech/t2s/exps/ge2e/__init__.py rename to paddlespeech/vector/exps/__init__.py diff --git a/paddlespeech/vector/exps/ge2e/__init__.py b/paddlespeech/vector/exps/ge2e/__init__.py new file mode 100644 index 00000000..abf198b9 --- /dev/null +++ b/paddlespeech/vector/exps/ge2e/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
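After the `training/cli.py` and `experiment.py` changes above, a trainer derives both its device and its parallelism from the single `--ngpu` value: CPU when it is 0, one in-process run when it is 1, and `dist.spawn` over `ngpu` processes otherwise. A minimal sketch of that dispatch, assuming a `train_sp(args, config)` entry point like the ones in this patch:

import paddle.distributed as dist

def dispatch(args, config, train_sp):
    # ngpu > 1: launch one process per GPU; otherwise run in-process
    # (train_sp itself falls back to CPU when ngpu == 0 or CUDA is absent).
    if args.ngpu > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
    else:
        train_sp(args, config)
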
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/exps/ge2e/audio_processor.py b/paddlespeech/vector/exps/ge2e/audio_processor.py similarity index 100% rename from paddlespeech/t2s/exps/ge2e/audio_processor.py rename to paddlespeech/vector/exps/ge2e/audio_processor.py diff --git a/paddlespeech/t2s/exps/ge2e/config.py b/paddlespeech/vector/exps/ge2e/config.py similarity index 100% rename from paddlespeech/t2s/exps/ge2e/config.py rename to paddlespeech/vector/exps/ge2e/config.py diff --git a/paddlespeech/t2s/exps/ge2e/dataset_processors.py b/paddlespeech/vector/exps/ge2e/dataset_processors.py similarity index 98% rename from paddlespeech/t2s/exps/ge2e/dataset_processors.py rename to paddlespeech/vector/exps/ge2e/dataset_processors.py index a9320d98..908c852b 100644 --- a/paddlespeech/t2s/exps/ge2e/dataset_processors.py +++ b/paddlespeech/vector/exps/ge2e/dataset_processors.py @@ -19,7 +19,7 @@ from typing import List import numpy as np from tqdm import tqdm -from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor): diff --git a/paddlespeech/t2s/exps/ge2e/inference.py b/paddlespeech/vector/exps/ge2e/inference.py similarity index 90% rename from paddlespeech/t2s/exps/ge2e/inference.py rename to paddlespeech/vector/exps/ge2e/inference.py index a5733941..7660de5e 100644 --- a/paddlespeech/t2s/exps/ge2e/inference.py +++ b/paddlespeech/vector/exps/ge2e/inference.py @@ -18,9 +18,9 @@ import numpy as np import paddle import tqdm -from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor -from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults -from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder def embed_utterance(processor, model, fpath_or_wav): @@ -51,7 +51,13 @@ def _process_utterance(ifpath: Path, def main(config, args): - paddle.set_device(args.device) + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") # load model model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers, @@ -112,13 +118,6 @@ if __name__ == "__main__": parser.add_argument( "--checkpoint_path", type=str, help="path of the checkpoint to load") - # running - parser.add_argument( - "--device", - type=str, - choices=["cpu", "gpu"], - help="device type to use, cpu and gpu are supported.") - # overwrite extra config and default config parser.add_argument( "--opts", @@ -126,6 +125,9 @@ if __name__ == "__main__": help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) + 
parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + args = parser.parse_args() if args.config: config.merge_from_file(args.config) diff --git a/paddlespeech/t2s/exps/ge2e/preprocess.py b/paddlespeech/vector/exps/ge2e/preprocess.py similarity index 87% rename from paddlespeech/t2s/exps/ge2e/preprocess.py rename to paddlespeech/vector/exps/ge2e/preprocess.py index 604ff0c6..dabe0ce7 100644 --- a/paddlespeech/t2s/exps/ge2e/preprocess.py +++ b/paddlespeech/vector/exps/ge2e/preprocess.py @@ -14,14 +14,13 @@ import argparse from pathlib import Path -from audio_processor import SpeakerVerificationPreprocessor - -from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_aidatatang_200zh -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_librispeech -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_magicdata -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_voxceleb1 -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_voxceleb2 +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults +from paddlespeech.vector.exps.ge2e.dataset_processors import process_aidatatang_200zh +from paddlespeech.vector.exps.ge2e.dataset_processors import process_librispeech +from paddlespeech.vector.exps.ge2e.dataset_processors import process_magicdata +from paddlespeech.vector.exps.ge2e.dataset_processors import process_voxceleb1 +from paddlespeech.vector.exps.ge2e.dataset_processors import process_voxceleb2 if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/paddlespeech/t2s/exps/ge2e/random_cycle.py b/paddlespeech/vector/exps/ge2e/random_cycle.py similarity index 100% rename from paddlespeech/t2s/exps/ge2e/random_cycle.py rename to paddlespeech/vector/exps/ge2e/random_cycle.py diff --git a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py similarity index 98% rename from paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py rename to paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py index a1321996..194eb7f2 100644 --- a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py +++ b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py @@ -18,7 +18,7 @@ import numpy as np from paddle.io import BatchSampler from paddle.io import Dataset -from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle +from paddlespeech.vector.exps.ge2e.random_cycle import random_cycle class MultiSpeakerMelDataset(Dataset): diff --git a/paddlespeech/t2s/exps/ge2e/train.py b/paddlespeech/vector/exps/ge2e/train.py similarity index 88% rename from paddlespeech/t2s/exps/ge2e/train.py rename to paddlespeech/vector/exps/ge2e/train.py index d3a57c93..bf1cf107 100644 --- a/paddlespeech/t2s/exps/ge2e/train.py +++ b/paddlespeech/vector/exps/ge2e/train.py @@ -19,13 +19,13 @@ from paddle.io import DataLoader from paddle.nn.clip import ClipGradByGlobalNorm from paddle.optimizer import Adam -from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults -from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate -from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset -from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler -from paddlespeech.t2s.models.lstm_speaker_encoder import 
LSTMSpeakerEncoder from paddlespeech.t2s.training import default_argument_parser from paddlespeech.t2s.training import ExperimentBase +from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults +from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import Collate +from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset +from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder class Ge2eExperiment(ExperimentBase): @@ -102,8 +102,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/vector/models/__init__.py b/paddlespeech/vector/models/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/paddlespeech/vector/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/models/lstm_speaker_encoder.py b/paddlespeech/vector/models/lstm_speaker_encoder.py similarity index 100% rename from paddlespeech/t2s/models/lstm_speaker_encoder.py rename to paddlespeech/vector/models/lstm_speaker_encoder.py diff --git a/requirements.txt b/requirements.txt index 08f68449..99e485f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,11 @@ ConfigArgParse coverage +distro editdistance g2p_en g2pM gpustat +GPUtil h5py inflect jieba @@ -16,30 +18,35 @@ matplotlib nara_wpe nltk numba -numpy==1.20.0 +paddlespeech_ctcdecoders +paddlespeech_feat pandas phkit Pillow praatio~=4.1 pre-commit +psutil pybind11 +pynvml +pypi-kenlm pypinyin python-dateutil pyworld resampy==0.2.2 sacrebleu -scipy==1.2.1 -sentencepiece +scipy +sentencepiece~=0.1.96 snakeviz soundfile~=0.10 sox +soxbindings tensorboardX textgrid timer tqdm typeguard unidecode -visualdl==2.2.0 +visualdl webrtcvad yacs yq diff --git a/setup.py b/setup.py index 29583e8c..310eed1e 100644 --- a/setup.py +++ b/setup.py @@ -78,7 +78,6 @@ def _post_install(install_lib_dir): print(os.getcwd()) check_call("./install_autolog.sh") print("autolog install.") - # ctcdecoder ctcdecoder_dir = HERE / 'paddlespeech/s2t/decoders/ctcdecoder/swig' with pushd(ctcdecoder_dir): @@ -102,11 +101,9 @@ class DevelopCommand(develop): class InstallCommand(install): def run(self): install.run(self) - # must after install.run, or pkg install by shell will not see - self.execute(_post_install, (self.install_lib, ), msg="Post Install...") -# cmd: python setup.py upload + # cmd: python setup.py upload class UploadCommand(Command): description = "Build and publish the package." 
user_options = [] @@ -133,11 +130,11 @@ class UploadCommand(Command): setup_info = dict( # Metadata name='paddlespeech', - version='2.1.2', - author='PaddleSL Speech Team', - author_email='', - url='https://github.com/PaddlePaddle/DeepSpeech', - license='Apache 2', + version='0.0.1a', + author='PaddlePaddle Speech and Language Team', + author_email='paddlesl@baidu.com', + url='https://github.com/PaddlePaddle/PaddleSpeech', + license='Apache 2.0', description='Speech tools and models based on Paddlepaddle', long_description=read("README.md"), long_description_content_type="text/markdown", @@ -175,11 +172,11 @@ setup_info = dict( }, # Package info - packages=find_packages(exclude=('tests', 'tests.*', 'examples*', + packages=find_packages(exclude=('utils', 'tests', 'tests.*', 'examples*', 'paddleaudio*', 'third_party*', 'tools*')), zip_safe=True, classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Artificial Intelligence', @@ -189,6 +186,7 @@ setup_info = dict( 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', ], ) setup(**setup_info) diff --git a/setup.sh b/setup.sh new file mode 100644 index 00000000..0bfacb54 --- /dev/null +++ b/setup.sh @@ -0,0 +1,20 @@ +# Install conda dependencies +conda install -c conda-forge sox libsndfile swig bzip2 bottleneck gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 --yes + +# Install the python lib +pip install -r requirements.txt + +# Install the auto_log +pushd tools/extras +bash install_autolog.sh +popd + +# Install the ctcdecoder +pushd paddlespeech/s2t/decoders/ctcdecoder/swig +bash -e setup.sh +popd + +# Install the python_speech_features +pushd third_party +bash -e install.sh +popd diff --git a/speechnn/.gitignore b/speechnn/.gitignore deleted file mode 100644 index 378eac25..00000000 --- a/speechnn/.gitignore +++ /dev/null @@ -1 +0,0 @@ -build diff --git a/speechnn/CMakeLists.txt b/speechnn/CMakeLists.txt deleted file mode 100644 index 88182eb4..00000000 --- a/speechnn/CMakeLists.txt +++ /dev/null @@ -1,56 +0,0 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - -project(speechnn VERSION 0.1) - -if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - set(CMAKE_INSTALL_PREFIX ${CMAKE_CURRENT_SOURCE_DIR}/src CACHE PATH "Install path prefix." 
FORCE) -endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}") - -# include file -include(cmake/third_party.cmake) - - -set(CMAKE_VERBOSE_MAKEFILE on) -# set std-14 -set(CMAKE_CXX_STANDARD 14) - - -# # fc_patch dir -# set(FETCHCONTENT_QUIET off) -# get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") -# set(FETCHCONTENT_BASE_DIR ${fc_patch}) -# -# -# ############################################################################### -# # Option Configurations -# ############################################################################### -# # option configurations -# option(TEST_DEBUG "option for debug" OFF) -# -# -# ############################################################################### -# # Add local library -# ############################################################################### -# # system lib -# find_package() -# # if dir have CmakeLists.txt -# add_subdirectory() -# # if dir do not have CmakeLists.txt -# add_library(lib_name STATIC file.cc) -# target_link_libraries(lib_name item0 item1) -# add_dependencies(lib_name depend-target) -# -# -# ############################################################################### -# # Library installation -# ############################################################################### -# install() -# -# -# ############################################################################### -# # Build binary file -# ############################################################################### -# add_executable() -# target_link_libraries() -# diff --git a/speechnn/cmake/third_party.cmake b/speechnn/cmake/third_party.cmake deleted file mode 100644 index fdd7b53c..00000000 --- a/speechnn/cmake/third_party.cmake +++ /dev/null @@ -1,197 +0,0 @@ -include(ExternalProject) -# Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac) - -set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING - "A path setting third party libraries download & build directories.") -set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING - "A path cache third party source code to avoid repeated download.") - -set(THIRD_PARTY_BUILD_TYPE Release) -set(third_party_deps) - - -# cache funciton to avoid repeat download code of third_party. -# This function has 4 parameters, URL / REPOSITOR / TAG / DIR: -# 1. URL: specify download url of 3rd party -# 2. REPOSITORY: specify git REPOSITORY of 3rd party -# 3. TAG: specify git tag/branch/commitID of 3rd party -# 4. DIR: overwrite the original SOURCE_DIR when cache directory -# -# The function Return 1 PARENT_SCOPE variables: -# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, -# and you no longer need to set any donwnload steps in ExternalProject_Add. 
-# For example: -# Cache_third_party(${TARGET} -# REPOSITORY ${TARGET_REPOSITORY} -# TAG ${TARGET_TAG} -# DIR ${TARGET_SOURCE_DIR}) - -FUNCTION(cache_third_party TARGET) - SET(options "") - SET(oneValueArgs URL REPOSITORY TAG DIR) - SET(multiValueArgs "") - cmake_parse_arguments(cache_third_party "${optionps}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - STRING(REPLACE "extern_" "" TARGET_NAME ${TARGET}) - STRING(REGEX REPLACE "[0-9]+" "" TARGET_NAME ${TARGET_NAME}) - STRING(TOUPPER ${TARGET_NAME} TARGET_NAME) - IF(cache_third_party_REPOSITORY) - SET(${TARGET_NAME}_DOWNLOAD_CMD - GIT_REPOSITORY ${cache_third_party_REPOSITORY}) - IF(cache_third_party_TAG) - LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD - GIT_TAG ${cache_third_party_TAG}) - ENDIF() - ELSEIF(cache_third_party_URL) - SET(${TARGET_NAME}_DOWNLOAD_CMD - URL ${cache_third_party_URL}) - ELSE() - MESSAGE(FATAL_ERROR "Download link (Git repo or URL) must be specified for cache!") - ENDIF() - IF(WITH_TP_CACHE) - IF(NOT cache_third_party_DIR) - MESSAGE(FATAL_ERROR "Please input the ${TARGET_NAME}_SOURCE_DIR for overwriting when -DWITH_TP_CACHE=ON") - ENDIF() - # Generate and verify cache dir for third_party source code - SET(cache_third_party_REPOSITORY ${cache_third_party_REPOSITORY} ${cache_third_party_URL}) - IF(cache_third_party_REPOSITORY AND cache_third_party_TAG) - STRING(MD5 HASH_REPO ${cache_third_party_REPOSITORY}) - STRING(MD5 HASH_GIT ${cache_third_party_TAG}) - STRING(SUBSTRING ${HASH_REPO} 0 8 HASH_REPO) - STRING(SUBSTRING ${HASH_GIT} 0 8 HASH_GIT) - STRING(CONCAT HASH ${HASH_REPO} ${HASH_GIT}) - # overwrite the original SOURCE_DIR when cache directory - SET(${cache_third_party_DIR} ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH}) - ELSEIF(cache_third_party_REPOSITORY) - STRING(MD5 HASH_REPO ${cache_third_party_REPOSITORY}) - STRING(SUBSTRING ${HASH_REPO} 0 16 HASH) - # overwrite the original SOURCE_DIR when cache directory - SET(${cache_third_party_DIR} ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH}) - ENDIF() - - IF(EXISTS ${${cache_third_party_DIR}}) - # judge whether the cache dir is empty - FILE(GLOB files ${${cache_third_party_DIR}}/*) - LIST(LENGTH files files_len) - IF(files_len GREATER 0) - list(APPEND ${TARGET_NAME}_DOWNLOAD_CMD DOWNLOAD_COMMAND "") - ENDIF() - ENDIF() - SET(${cache_third_party_DIR} ${${cache_third_party_DIR}} PARENT_SCOPE) - ENDIF() - - # Pass ${TARGET_NAME}_DOWNLOAD_CMD to parent scope, the double quotation marks can't be removed - SET(${TARGET_NAME}_DOWNLOAD_CMD "${${TARGET_NAME}_DOWNLOAD_CMD}" PARENT_SCOPE) -ENDFUNCTION() - -MACRO(UNSET_VAR VAR_NAME) - UNSET(${VAR_NAME} CACHE) - UNSET(${VAR_NAME}) -ENDMACRO() - -# Funciton to Download the dependencies during compilation -# This function has 2 parameters, URL / DIRNAME: -# 1. URL: The download url of 3rd dependencies -# 2. 
NAME: The name of file, that determin the dirname -# -FUNCTION(file_download_and_uncompress URL NAME) - set(options "") - set(oneValueArgs MD5) - set(multiValueArgs "") - cmake_parse_arguments(URL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}, MD5: ${URL_MD5}") - SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE) - ExternalProject_Add( - download_${NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${THIRD_PARTY_PATH}/${NAME} - URL ${URL} - URL_MD5 ${URL_MD5} - TIMEOUT 120 - DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ - SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) - set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE) -ENDFUNCTION() - - -# Correction of flags on different Platform(WIN/MAC) and Print Warning Message -if (APPLE) - if(WITH_MKL) - MESSAGE(WARNING - "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") - set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) - endif() -endif() - -if(WIN32 OR APPLE) - MESSAGE(STATUS "Disable XBYAK in Windows and MacOS") - SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) - - if(WITH_LIBXSMM) - MESSAGE(WARNING - "Windows, Mac are not supported with libxsmm in Paddle yet." - "Force WITH_LIBXSMM=OFF") - SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) - endif() - - if(WITH_BOX_PS) - MESSAGE(WARNING - "Windows or Mac is not supported with BOX_PS in Paddle yet." - "Force WITH_BOX_PS=OFF") - SET(WITH_BOX_PS OFF CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE) - endif() - - if(WITH_PSLIB) - MESSAGE(WARNING - "Windows or Mac is not supported with PSLIB in Paddle yet." - "Force WITH_PSLIB=OFF") - SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE) - endif() - - if(WITH_LIBMCT) - MESSAGE(WARNING - "Windows or Mac is not supported with LIBMCT in Paddle yet." - "Force WITH_LIBMCT=OFF") - SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE) - endif() - - if(WITH_PSLIB_BRPC) - MESSAGE(WARNING - "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet." 
- "Force WITH_PSLIB_BRPC=OFF") - SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE) - endif() -endif() - -set(WITH_MKLML ${WITH_MKL}) -if(NOT DEFINED WITH_MKLDNN) - if(WITH_MKL AND AVX2_FOUND) - set(WITH_MKLDNN ON) - else() - message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") - set(WITH_MKLDNN OFF) - endif() -endif() - -if(WIN32 OR APPLE OR NOT WITH_GPU OR ON_INFER) - set(WITH_DGC OFF) -endif() - -if(${CMAKE_VERSION} VERSION_GREATER "3.5.2") - set(SHALLOW_CLONE "GIT_SHALLOW TRUE") # adds --depth=1 arg to git clone of External_Projects -endif() - - -########################### include third_party according to flags ############################### -include(third_party/libsndfile) # download, build, install libsndfile -include(third_party/boost) # download boost -include(third_party/eigen) # download eigen3 -include(third_party/threadpool) # download threadpool - - diff --git a/speechnn/cmake/third_party/absl.cmake b/speechnn/cmake/third_party/absl.cmake deleted file mode 100644 index c2a8eceb..00000000 --- a/speechnn/cmake/third_party/absl.cmake +++ /dev/null @@ -1,13 +0,0 @@ -cmake_minimum_required(VERSION 3.14) -include(ExternalProject) -include(FetchContent) - -FetchContent_Declare( - absl - GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git" - GIT_TAG "20210324.1" -) - -FetchContent_MakeAvailable(absl) - - diff --git a/speechnn/cmake/third_party/boost.cmake b/speechnn/cmake/third_party/boost.cmake deleted file mode 100644 index eb0b2c15..00000000 --- a/speechnn/cmake/third_party/boost.cmake +++ /dev/null @@ -1,49 +0,0 @@ -include(ExternalProject) - -set(BOOST_PROJECT "extern_boost") -# To release PaddlePaddle as a pip package, we have to follow the -# manylinux1 standard, which features as old Linux kernels and -# compilers as possible and recommends CentOS 5. Indeed, the earliest -# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new -# version of boost, say, 1.66.0, doesn't build on CentOS 6. We -# checked that the devtools package of CentOS 6 installs boost 1.41.0. -# So we use 1.41.0 here. -set(BOOST_VER "1.41.0") -set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) -set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) - -MESSAGE(STATUS "BOOST_VERSION: ${BOOST_VER}, BOOST_URL: ${BOOST_URL}") - -set(BOOST_PREFIX_DIR ${THIRD_PARTY_PATH}/boost) -set(BOOST_SOURCE_DIR ${THIRD_PARTY_PATH}/boost/src/extern_boost) -cache_third_party(${BOOST_PROJECT} - URL ${BOOST_URL} - DIR BOOST_SOURCE_DIR) - -set(BOOST_INCLUDE_DIR "${BOOST_SOURCE_DIR}" CACHE PATH "boost include directory." 
FORCE) -set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) -include_directories(${BOOST_INCLUDE_DIR}) - -if(WIN32 AND MSVC_VERSION GREATER_EQUAL 1600) - add_definitions(-DBOOST_HAS_STATIC_ASSERT) -endif() - -ExternalProject_Add( - ${BOOST_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - "${BOOST_DOWNLOAD_CMD}" - URL_MD5 f891e8c2c9424f0565f0129ad9ab4aff - PREFIX ${BOOST_PREFIX_DIR} - DOWNLOAD_DIR ${BOOST_SOURCE_DIR} - SOURCE_DIR ${BOOST_SOURCE_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" - ) - -add_library(boost INTERFACE) - -add_dependencies(boost ${BOOST_PROJECT}) -set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) diff --git a/speechnn/cmake/third_party/eigen.cmake b/speechnn/cmake/third_party/eigen.cmake deleted file mode 100644 index 6a032307..00000000 --- a/speechnn/cmake/third_party/eigen.cmake +++ /dev/null @@ -1,53 +0,0 @@ -include(ExternalProject) - -# update eigen to the commit id f612df27 on 03/16/2021 -set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3) -set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3) -set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git) -set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) - -cache_third_party(extern_eigen3 - REPOSITORY ${EIGEN_REPOSITORY} - TAG ${EIGEN_TAG} - DIR EIGEN_SOURCE_DIR) - -if(WIN32) - add_definitions(-DEIGEN_STRONG_INLINE=inline) -elseif(LINUX) - if(WITH_ROCM) - # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1) - set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1}) - endif() -endif() - -set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}) -INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) - -ExternalProject_Add( - extern_eigen3 - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${EIGEN_DOWNLOAD_CMD}" - PREFIX ${EIGEN_PREFIX_DIR} - SOURCE_DIR ${EIGEN_SOURCE_DIR} - UPDATE_COMMAND "" - PATCH_COMMAND ${EIGEN_PATCH_COMMAND} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -add_library(eigen3 INTERFACE) - -add_dependencies(eigen3 extern_eigen3) - -# sw not support thread_local semantic -if(WITH_SW) - add_definitions(-DEIGEN_AVOID_THREAD_LOCAL) -endif() diff --git a/speechnn/cmake/third_party/libsndfile.cmake b/speechnn/cmake/third_party/libsndfile.cmake deleted file mode 100644 index 05d5c6ed..00000000 --- a/speechnn/cmake/third_party/libsndfile.cmake +++ /dev/null @@ -1,11 +0,0 @@ -cmake_minimum_required(VERSION 3.14) -include(ExternalProject) -include(FetchContent) - -FetchContent_Declare( - libsndfile - GIT_REPOSITORY https://github.com/libsndfile/libsndfile.git - GIT_TAG v1.0.30 # tag v1.0.30 -) - -FetchContent_GetProperties(libsndfile) diff --git a/speechnn/cmake/third_party/openfst.cmake b/speechnn/cmake/third_party/openfst.cmake deleted file mode 100644 index 39f335a1..00000000 --- a/speechnn/cmake/third_party/openfst.cmake +++ /dev/null @@ -1,26 +0,0 @@ -cmake_minimum_required(VERSION 3.14) -include(ExternalProject) -include(FetchContent) - -FetchContent_Declare( - openfst - GIT_REPOSITORY https://github.com/kkm000/openfst - 
GIT_TAG 338225416178ac36b8002d70387f5556e44c8d05 # tag win/1.7.2.1 -) - -FetchContent_GetProperties(openfst) -if(NOT openfst_POPULATED) - FetchContent_Populate(openfst) - include_directories(${openfst_SOURCE_DIR}/src/include) - - add_subdirectory(${openfst_SOURCE_DIR} ${openfst_BINARY_DIR}) - - install(DIRECTORY ${openfst_SOURCE_DIR}/src/include/ DESTINATION include/ - FILES_MATCHING PATTERN "*.h") - - install(TARGETS fst - EXPORT kaldi-targets - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) -endif() diff --git a/speechnn/cmake/third_party/openfst_lib_target.cmake b/speechnn/cmake/third_party/openfst_lib_target.cmake deleted file mode 100644 index dde5efc4..00000000 --- a/speechnn/cmake/third_party/openfst_lib_target.cmake +++ /dev/null @@ -1,31 +0,0 @@ -if(NOT OPENFST_ROOT_DIR) - message(FATAL_ERROR) -endif() - -set(fst_source_dir ${OPENFST_ROOT_DIR}/src/lib) -set(fst_include_dir ${OPENFST_ROOT_DIR}/src/include) - -include_directories(${fst_include_dir}) -file(GLOB fst_sources "${fst_source_dir}/*.cc") - -add_library(fst ${fst_sources}) -target_include_directories(fst PUBLIC - $ - $ -) - -install(TARGETS fst - EXPORT kaldi-targets - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -) - -install(DIRECTORY ${fst_include_dir}/fst - DESTINATION include/openfst - PATTERN "test/*.h" EXCLUDE -) - -unset(fst_source_dir) -unset(fst_include_dir) -unset(fst_sources) diff --git a/speechnn/cmake/third_party/threadpool.cmake b/speechnn/cmake/third_party/threadpool.cmake deleted file mode 100644 index d2c249e9..00000000 --- a/speechnn/cmake/third_party/threadpool.cmake +++ /dev/null @@ -1,36 +0,0 @@ -INCLUDE(ExternalProject) - -SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) -SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -if(WITH_ASCEND OR WITH_ASCEND_CL) - SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) -else() - SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) -endif() -SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) - -cache_third_party(extern_threadpool - REPOSITORY ${THREADPOOL_REPOSITORY} - TAG ${THREADPOOL_TAG} - DIR THREADPOOL_SOURCE_DIR) - -SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}) -INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR}) - -ExternalProject_Add( - extern_threadpool - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${THREADPOOL_DOWNLOAD_CMD}" - PREFIX ${THREADPOOL_PREFIX_DIR} - SOURCE_DIR ${THREADPOOL_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -add_library(simple_threadpool INTERFACE) - -add_dependencies(simple_threadpool extern_threadpool) diff --git a/speechnn/cmake/third_party/version.cmake b/speechnn/cmake/third_party/version.cmake deleted file mode 100644 index c3780ee6..00000000 --- a/speechnn/cmake/third_party/version.cmake +++ /dev/null @@ -1,15 +0,0 @@ -function(get_version) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/src/.version version) - string(STRIP ${version} version) - execute_process(COMMAND git log -n1 --format=%H src/.version - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE version_commit - OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND git rev-list --count "${version_commit}..HEAD" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE patch_number) - string(STRIP ${patch_number} 
patch_number) - - set(KALDI_VERSION ${version} PARENT_SCOPE) - set(KALDI_PATCH_NUMBER ${patch_number} PARENT_SCOPE) -endfunction() diff --git a/speechnn/core/transformers/.gitkeep b/speechnn/core/transformers/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/core/transformers/README.md b/speechnn/core/transformers/README.md deleted file mode 100644 index edbcb9cc..00000000 --- a/speechnn/core/transformers/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# Fast Transformers for Speech - -- Conformer -- Transformer - -## Reference - -* https://github.com/NVIDIA/FasterTransformer.git -* https://github.com/idiap/fast-transformers diff --git a/speechnn/env.sh b/speechnn/env.sh deleted file mode 100644 index c7754d65..00000000 --- a/speechnn/env.sh +++ /dev/null @@ -1,10 +0,0 @@ -export MAIN_ROOT=${PWD} - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}:/usr/local/bin/ -export LC_ALL=C - -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ diff --git a/speechnn/examples/.gitkeep b/speechnn/examples/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/examples/CMakeLists.txt b/speechnn/examples/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/speechnn/CMakeLists.txt b/speechnn/speechnn/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/speechnn/decoder/CMakeLists.txt b/speechnn/speechnn/decoder/CMakeLists.txt deleted file mode 100644 index 259261bd..00000000 --- a/speechnn/speechnn/decoder/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -aux_source_directory(. DIR_LIB_SRCS) -add_library(decoder STATIC ${DIR_LIB_SRCS}) diff --git a/speechnn/speechnn/frontend/CMakeLists.txt b/speechnn/speechnn/frontend/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/speechnn/frontend/audio/CMakeLists.txt b/speechnn/speechnn/frontend/audio/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/speechnn/frontend/text/CMakeLists.txt b/speechnn/speechnn/frontend/text/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/speechnn/model/CMakeLists.txt b/speechnn/speechnn/model/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/speechnn/nn/CMakeLists.txt b/speechnn/speechnn/nn/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/speechnn/protocol/CMakeLists.txt b/speechnn/speechnn/protocol/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechnn/speechnn/utils/CMakeLists.txt b/speechnn/speechnn/utils/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/benchmark/conformer/README.md b/tests/benchmark/conformer/README.md index 71d5f91b..22e0009d 100644 --- a/tests/benchmark/conformer/README.md +++ b/tests/benchmark/conformer/README.md @@ -43,16 +43,6 @@ bash prepare.sh bash run.sh ``` -### Analyse the sp -``` -bash run_analysis_sp.sh -``` - -### Analyse the mp -``` -bash run_analysis_mp.sh -``` - ### The log ``` {"log_file": "recoder_sp_bs16_fp32_ngpu1.txt", diff --git a/tests/benchmark/conformer/analysis.py b/tests/benchmark/conformer/analysis.py deleted file mode 100644 index 610791c8..00000000 --- a/tests/benchmark/conformer/analysis.py +++ /dev/null @@ -1,345 +0,0 @@ -# copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import print_function - -import argparse -import json -import re -import traceback - - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--filename", type=str, help="The name of log which need to analysis.") - parser.add_argument( - "--log_with_profiler", - type=str, - help="The path of train log with profiler") - parser.add_argument( - "--profiler_path", type=str, help="The path of profiler timeline log.") - parser.add_argument( - "--keyword", type=str, help="Keyword to specify analysis data") - parser.add_argument( - "--separator", - type=str, - default=None, - help="Separator of different field in log") - parser.add_argument( - '--position', type=int, default=None, help='The position of data field') - parser.add_argument( - '--range', - type=str, - default="", - help='The range of data field to intercept') - parser.add_argument( - '--base_batch_size', type=int, help='base_batch size on gpu') - parser.add_argument( - '--skip_steps', - type=int, - default=0, - help='The number of steps to be skipped') - parser.add_argument( - '--model_mode', - type=int, - default=-1, - help='Analysis mode, default value is -1') - parser.add_argument('--ips_unit', type=str, default=None, help='IPS unit') - parser.add_argument( - '--model_name', - type=str, - default=0, - help='training model_name, transformer_base') - parser.add_argument( - '--mission_name', type=str, default=0, help='training mission name') - parser.add_argument( - '--direction_id', type=int, default=0, help='training direction_id') - parser.add_argument( - '--run_mode', - type=str, - default="sp", - help='multi process or single process') - parser.add_argument( - '--index', - type=int, - default=1, - help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}') - parser.add_argument( - '--gpu_num', type=int, default=1, help='nums of training gpus') - parser.add_argument( - '--use_num', type=int, default=1, help='nums of used recoders') - args = parser.parse_args() - args.separator = None if args.separator == "None" else args.separator - return args - - -def _is_number(num): - pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') - result = pattern.match(num) - if result: - return True - else: - return False - - -class TimeAnalyzer(object): - def __init__(self, - filename, - keyword=None, - separator=None, - position=None, - range="-1"): - if filename is None: - raise Exception("Please specify the filename!") - - if keyword is None: - raise Exception("Please specify the keyword!") - - self.filename = filename - self.keyword = keyword - self.separator = separator - self.position = position - self.range = range - self.records = None - self._distil() - - def _distil(self): - self.records = [] - with open(self.filename, "r") as f_object: - lines = f_object.readlines() - for line in lines: - if self.keyword not in line: - continue - try: - result = None - - # Distil the string from a line. 
- line = line.strip() - line_words = line.split( - self.separator) if self.separator else line.split() - print("line_words", line_words) - if args.position: - result = line_words[self.position] - else: - # Distil the string following the keyword. - for i in range(len(line_words) - 1): - if line_words[i] == self.keyword: - result = line_words[i + 1] - break - - # Distil the result from the picked string. - if not self.range: - result = result[0:] - elif _is_number(self.range): - result = result[0:int(self.range)] - else: - result = result[int(self.range.split(":")[0]):int( - self.range.split(":")[1])] - self.records.append(float(result)) - except Exception as exc: - pass - #print("line is: {}; separator={}; position={}".format(line, self.separator, self.position)) - self.records.sort() - self.records = self.records[:args.use_num] - print("records", self.records) - print("Extract {} records: separator={}; position={}".format( - len(self.records), self.separator, self.position)) - - def _get_fps(self, - mode, - batch_size, - gpu_num, - avg_of_records, - run_mode, - unit=None): - if mode == -1 and run_mode == 'sp': - assert unit, "Please set the unit when mode is -1." - fps = gpu_num * avg_of_records - elif mode == -1 and run_mode == 'mp': - assert unit, "Please set the unit when mode is -1." - fps = gpu_num * avg_of_records #temporarily, not used now - print("------------this is mp") - elif mode == 0: - # s/step -> samples/s - fps = (batch_size * gpu_num) / avg_of_records - unit = "samples/s" - elif mode == 1: - # steps/s -> steps/s - fps = avg_of_records - unit = "steps/s" - elif mode == 2: - # s/step -> steps/s - fps = 1 / avg_of_records - unit = "steps/s" - elif mode == 3: - # steps/s -> samples/s - fps = batch_size * gpu_num * avg_of_records - unit = "samples/s" - elif mode == 4: - # s/epoch -> s/epoch - fps = avg_of_records - unit = "s/epoch" - else: - ValueError("Unsupported analysis mode.") - - return fps, unit - - def analysis(self, - batch_size, - gpu_num=1, - skip_steps=0, - mode=-1, - run_mode='sp', - unit=None): - if batch_size <= 0: - print("base_batch_size should larger than 0.") - return 0, '' - - if len( - self.records - ) <= skip_steps: # to address the condition which item of log equals to skip_steps - print("no records") - return 0, '' - - sum_of_records = 0 - sum_of_records_skipped = 0 - skip_min = self.records[skip_steps] - skip_max = self.records[skip_steps] - - count = len(self.records) - for i in range(count): - sum_of_records += self.records[i] - if i >= skip_steps: - sum_of_records_skipped += self.records[i] - if self.records[i] < skip_min: - skip_min = self.records[i] - if self.records[i] > skip_max: - skip_max = self.records[i] - - avg_of_records = sum_of_records / float(count) - avg_of_records_skipped = sum_of_records_skipped / float(count - - skip_steps) - - fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records, - run_mode, unit) - fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num, - avg_of_records_skipped, run_mode, unit) - if mode == -1: - print("average ips of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f %s" % (avg_of_records, fps_unit)) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average ips of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit)) - print("\tMin: %.3f %s" % (skip_min, fps_unit)) - print("\tMax: %.3f %s" % (skip_max, fps_unit)) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - elif mode == 1 or mode == 3: - 
print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f steps/s" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f steps/s" % avg_of_records_skipped) - print("\tMin: %.3f steps/s" % skip_min) - print("\tMax: %.3f steps/s" % skip_max) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - elif mode == 0 or mode == 2: - print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f s/step" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f s/step" % avg_of_records_skipped) - print("\tMin: %.3f s/step" % skip_min) - print("\tMax: %.3f s/step" % skip_max) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - - return round(fps_skipped, 3), fps_unit - - -if __name__ == "__main__": - args = parse_args() - run_info = dict() - run_info["log_file"] = args.filename - run_info["model_name"] = args.model_name - run_info["mission_name"] = args.mission_name - run_info["direction_id"] = args.direction_id - run_info["run_mode"] = args.run_mode - run_info["index"] = args.index - run_info["gpu_num"] = args.gpu_num - run_info["FINAL_RESULT"] = 0 - run_info["JOB_FAIL_FLAG"] = 0 - - try: - if args.index == 1: - if args.gpu_num == 1: - run_info["log_with_profiler"] = args.log_with_profiler - run_info["profiler_path"] = args.profiler_path - analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, - args.position, args.range) - run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis( - batch_size=args.base_batch_size, - gpu_num=args.gpu_num, - skip_steps=args.skip_steps, - mode=args.model_mode, - run_mode=args.run_mode, - unit=args.ips_unit) - # if int(os.getenv('job_fail_flag')) == 1 or int(run_info["FINAL_RESULT"]) == 0: - # run_info["JOB_FAIL_FLAG"] = 1 - elif args.index == 3: - run_info["FINAL_RESULT"] = {} - records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead', - None, 3, '').records - records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead', - None, 5).records - records_ct_total = TimeAnalyzer(args.filename, 'Computation time', - None, 3, '').records - records_gm_total = TimeAnalyzer(args.filename, - 'GpuMemcpy Calls', - None, 4, '').records - records_gm_ratio = TimeAnalyzer(args.filename, - 'GpuMemcpy Calls', - None, 6).records - records_gmas_total = TimeAnalyzer(args.filename, - 'GpuMemcpyAsync Calls', - None, 4, '').records - records_gms_total = TimeAnalyzer(args.filename, - 'GpuMemcpySync Calls', - None, 4, '').records - run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[ - 0] if records_fo_total else 0 - run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[ - 0] if records_fo_ratio else 0 - run_info["FINAL_RESULT"][ - "ComputationTime_Total"] = records_ct_total[ - 0] if records_ct_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[ - 0] if records_gm_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[ - 0] if records_gm_ratio else 0 - run_info["FINAL_RESULT"][ - "GpuMemcpyAsync_Total"] = records_gmas_total[ - 0] if records_gmas_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[ - 0] if records_gms_total else 0 - else: - print("Not support!") - except Exception: - traceback.print_exc() - print("{}".format(json.dumps(run_info)) - ) # it's required, for 
the log file path insert to the database diff --git a/tests/benchmark/conformer/prepare.sh b/tests/benchmark/conformer/prepare.sh index 8f03fd1b..c5fae06a 100644 --- a/tests/benchmark/conformer/prepare.sh +++ b/tests/benchmark/conformer/prepare.sh @@ -1,5 +1,6 @@ -source ../../../tools/venv/bin/activate - +cd ../../../ +pip install -e . # 安装pdspeech +cd - #Enter the example dir pushd ../../../examples/aishell/s1 diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh index f9244f5d..79beb4e9 100644 --- a/tests/benchmark/conformer/run.sh +++ b/tests/benchmark/conformer/run.sh @@ -1,8 +1,12 @@ # 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37 # 执行目录:需说明 -CUR_DIR=${PWD} -source ../../../tools/venv/bin/activate +CUR_DIR=${PWD} # PaddleSpeech/tests/benchmark/conformer +cd ../../../ +log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # benchmark系统指定该参数,不需要跑profile时,log_path指向存speed的目录 +cd ${CUR_DIR} +sed -i '/set\ -xe/d' run_benchmark.sh + #cd ** pushd ../../../examples/aishell/s1 # 1 安装该模型需要的依赖 (如需开启优化策略请注明) @@ -10,31 +14,37 @@ pushd ../../../examples/aishell/s1 source path.sh +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; +mkdir -p conf/benchmark +#yq e ".training.accum_grad=1" conf/conformer.yaml > conf/benchmark/conformer.yaml +cp conf/conformer.yaml conf/benchmark/conformer.yaml +sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml fp_item_list=(fp32) -bs_item=(16) -config_path=conf/conformer.yaml +bs_item=(16 30) +config_path=conf/benchmark/conformer.yaml seed=0 output=exp/conformer profiler_options=None +model_item=conformer for fp_item in ${fp_item_list[@]}; do - for batch_size in ${bs_item[@]} + for bs_item in ${bs_item[@]} do rm exp -rf + log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer" run_mode=mp ngpu=8 - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR} - rm exp -rf - echo "index is speed, 1gpus, begin, conformer" + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 + sleep 60 + log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 + echo "index is speed, 1gpus, begin, ${log_name}" run_mode=sp ngpu=1 - CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR} + CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) + sleep 60 done done popd -mkdir -p log -bash run_analysis_sp.sh > log/log_sp.out -bash run_analysis_mp.sh > log/log_mp.out diff --git a/tests/benchmark/conformer/run_analysis_mp.sh b/tests/benchmark/conformer/run_analysis_mp.sh deleted file mode 100644 index f68dd1e4..00000000 --- a/tests/benchmark/conformer/run_analysis_mp.sh +++ /dev/null @@ -1,12 +0,0 @@ -python analysis.py \ - --filename "recoder_mp_bs16_fp32_ngpu8.txt" \ - --keyword "ips[sent./sec]:" \ - --base_batch_size 16 \ - --model_name "Conformer" \ - --mission_name "eight gpu" \ - --run_mode "mp" \ - 
--ips_unit "sent./sec" \ - --gpu_num 8 \ - --use_num 480 \ - --separator " " \ - --direction_id "1" diff --git a/tests/benchmark/conformer/run_analysis_sp.sh b/tests/benchmark/conformer/run_analysis_sp.sh deleted file mode 100644 index e521db88..00000000 --- a/tests/benchmark/conformer/run_analysis_sp.sh +++ /dev/null @@ -1,12 +0,0 @@ -python analysis.py \ - --filename "recoder_sp_bs16_fp32_ngpu1.txt" \ - --keyword "ips[sent./sec]:" \ - --base_batch_size 16 \ - --model_name "Conformer" \ - --mission_name "one gpu" \ - --run_mode "sp" \ - --ips_unit "sent./sec" \ - --gpu_num 1 \ - --use_num 60 \ - --separator " " \ - --direction_id "1" diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh index c03a08f3..56b63e76 100644 --- a/tests/benchmark/conformer/run_benchmark.sh +++ b/tests/benchmark/conformer/run_benchmark.sh @@ -12,17 +12,24 @@ function _set_params(){ profiler_options=${6:-"None"} batch_size=${7:-"32"} fp_item=${8:-"fp32"} - TRAIN_LOG_DIR=${9:-$(pwd)} - + model_item=${9:-"conformer"} benchmark_max_step=0 - run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 +# 添加日志解析需要的参数 + base_batch_size=${batch_size} + mission_name="语音识别" + direction_id="1" + ips_unit="sent./sec" + skip_steps=10 # 解析日志,有些模型前几个step耗时长,需要跳过 (必填) + keyword="ips:" # 解析日志,筛选出数据所在行的关键字 (必填) + index="1" + model_name=${model_item}_bs${batch_size}_${fp_item} # 以下不用修改 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} - log_file=${run_log_path}/recoder_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}.txt + log_file=${run_log_path}/recoder_${model_item}_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu} } function _train(){ @@ -36,11 +43,9 @@ function _train(){ --benchmark-batch-size ${batch_size} --benchmark-max-step ${benchmark_max_step} " - echo "run_mode "${run_mode} - case ${run_mode} in - sp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;; - mp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;; + sp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;; + mp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;; *) echo "choose run_mode(sp or mp)"; exit 1; esac echo ${train_cmd} @@ -61,5 +66,8 @@ function _train(){ fi } +source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ -_train +# _train # 如果只想产出训练log,不解析,可取消注释 +_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开 + diff --git a/tests/benchmark/pwgan/run_all.sh b/tests/benchmark/pwgan/run_all.sh index 5b66d5f5..51deaf9f 100755 --- a/tests/benchmark/pwgan/run_all.sh +++ b/tests/benchmark/pwgan/run_all.sh @@ -1,8 +1,10 @@ #!/usr/bin/env bash +log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # benchmark系统指定该参数,不需要跑profile时,log_path指向存speed的目录 + stage=0 stop_stage=100 - +sed -i '/set\ -xe/d' run_benchmark.sh # 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37 # 执行目录:需说明 cd ../../../ @@ -33,20 +35,22 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then fi # 3 批量运行(如不方便批量,1,2需放到单个模型中) if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - model_mode_list=(pwg) + model_mode_list=(pwgan) fp_item_list=(fp32) # 满 bs 是 26 bs_item_list=(6 26) for model_mode in ${model_mode_list[@]}; do for fp_item in ${fp_item_list[@]}; do for bs_item in ${bs_item_list[@]}; do - echo "index is speed, 1gpus, begin, ${model_name}" + 
log_name=speech_${model_mode}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 + echo "index is speed, 1gpus, begin, ${log_name}" run_mode=sp - CUDA_VISIBLE_DEVICES=0 bash tests/benchmark/pwgan/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} # (5min) + CUDA_VISIBLE_DEVICES=0 bash tests/benchmark/pwgan/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) sleep 60 - echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" + log_name=speech_${model_mode}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 + echo "index is speed, 8gpus, run_mode is multi_process, begin, ${log_name}" run_mode=mp - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash tests/benchmark/pwgan/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash tests/benchmark/pwgan/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 # sleep 60 done done diff --git a/tests/benchmark/pwgan/run_benchmark.sh b/tests/benchmark/pwgan/run_benchmark.sh index f0fcaf44..d6a52d35 100755 --- a/tests/benchmark/pwgan/run_benchmark.sh +++ b/tests/benchmark/pwgan/run_benchmark.sh @@ -7,14 +7,22 @@ function _set_params(){ batch_size=${2:-"8"} fp_item=${3:-"fp32"} # fp32|fp16 max_iter=${4:-"500"} # 可选,如果需要修改代码提前中断 - model_name=${5:-"model_name"} + model_item=${5:-"model_item"} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 - +# 添加日志解析需要的参数 + base_batch_size=${batch_size} + mission_name="语音合成" + direction_id="1" + ips_unit="sequences/sec" + skip_steps=10 # 解析日志,有些模型前几个step耗时长,需要跳过 (必填) + keyword="avg_ips:" # 解析日志,筛选出数据所在行的关键字 (必填) + index="1" + model_name=${model_item}_bs${batch_size}_${fp_item} # 以下不用修改 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} - log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices} + log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices} } function _train(){ echo "Train on ${num_gpu_devices} GPUs" @@ -29,8 +37,8 @@ function _train(){ --run-benchmark=true" case ${run_mode} in - sp) train_cmd="python paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;; - mp) train_cmd="python paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}" + sp) train_cmd="python paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --ngpu=1 ${train_cmd}" ;; + mp) train_cmd="python paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --ngpu=8 ${train_cmd}" log_parse_file="mylog/workerlog.0" ;; *) echo "choose run_mode(sp or mp)"; exit 1; esac @@ -52,5 +60,8 @@ function _train(){ fi } +source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ -_train \ No newline at end of file +# _train # 如果只想产出训练log,不解析,可取消注释 +_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开 + diff --git a/tests/chains/speedyspeech/test.sh b/tests/chains/speedyspeech/test.sh index f4441335..ccabc319 100755 --- a/tests/chains/speedyspeech/test.sh +++ b/tests/chains/speedyspeech/test.sh @@ -324,7 +324,7 @@ else gsu=${gpu//,/ } nump=`echo $gsu | wc -w` CUDA_VISIBLE_DEVICES=${gpu} - cmd="${python} ${run_train} --nprocs=$nump" + 
cmd="${python} ${run_train} --ngpu=$nump" else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" fi diff --git a/third_party/python_kaldi_features/setup.py b/third_party/python_kaldi_features/setup.py index 47c77718..c76f23b5 100644 --- a/third_party/python_kaldi_features/setup.py +++ b/third_party/python_kaldi_features/setup.py @@ -3,12 +3,16 @@ try: except ImportError: from distutils.core import setup -setup(name='python_speech_features', - version='0.6', - description='Python Speech Feature extraction', - author='James Lyons', - author_email='james.lyons0@gmail.com', +with open("requirements.txt", encoding="utf-8-sig") as f: + requirements = f.readlines() + +setup(name='paddlespeech_feat', + version='0.0.1a', + description='python speech feature extraction in paddlespeech', + install_requires=requirements, + author="PaddlePaddle Speech and Language Team", + author_email="paddlesl@baidu.com", license='MIT', - url='https://github.com/jameslyons/python_speech_features', + url='https://github.com/PaddlePaddle/PaddleSpeech', packages=['python_speech_features'], ) diff --git a/tools/Makefile b/tools/Makefile index e2aba8fe..2a2c1463 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -51,7 +51,7 @@ soxbindings.done: touch soxbindings.done mfa.done: - test -d montreal-forced-aligner || $(WGET) https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz + test -d montreal-forced-aligner || $(WGET) https://paddlespeech.bj.bcebos.com/Parakeet/montreal-forced-aligner_linux.tar.gz tar xvf montreal-forced-aligner_linux.tar.gz touch mfa.done diff --git a/tools/extras/install_mfa_v1.sh b/tools/extras/install_mfa_v1.sh new file mode 100755 index 00000000..fa5b4801 --- /dev/null +++ b/tools/extras/install_mfa_v1.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +test -d montreal-forced-aligner || wget --no-check-certificate https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz +tar xvf montreal-forced-aligner_linux.tar.gz \ No newline at end of file diff --git a/tools/extras/install_mfa.sh b/tools/extras/install_mfa_v2.sh similarity index 100% rename from tools/extras/install_mfa.sh rename to tools/extras/install_mfa_v2.sh diff --git a/tools/extras/install_miniconda.sh b/tools/extras/install_miniconda.sh index 3d1909af..c6ee4b36 100755 --- a/tools/extras/install_miniconda.sh +++ b/tools/extras/install_miniconda.sh @@ -13,6 +13,8 @@ else fi bash Miniconda3-latest-Linux-x86_64.sh -b +$HOME/miniconda3/bin/conda init + $HOME/miniconda3/bin/python -m pip install --user tqdm $HOME/miniconda3/bin/python -m pip install --user scikit-learn $HOME/miniconda3/bin/python -m pip install --user librosa diff --git a/tools/extras/install_sclite.sh b/tools/extras/install_sclite.sh new file mode 100755 index 00000000..da89e60d --- /dev/null +++ b/tools/extras/install_sclite.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +WGET="wget --no-check-certificate" + +# SCTK official repo does not have version tags. Here's the mapping: +# # 2.4.9 = 659bc36; 2.4.10 = d914e1b; 2.4.11 = 20159b5. 
+SCTK_GITHASH=20159b5 +SCTK_CXFLAGS="-w -march=native" +CFLAGS="CFLAGS=${SCTK_CXFLAGS}" +CXXFLAGS="CXXFLAGS=-std=c++11 ${SCTK_CXFLAGS}" + +MAKE=make + + +${WGET} -nv -T 10 -t 3 -O sctk-${SCTK_GITHASH}.tar.gz https://github.com/usnistgov/SCTK/archive/${SCTK_GITHASH}.tar.gz; +tar zxvf sctk-${SCTK_GITHASH}.tar.gz +rm -rf sctk-${SCTK_GITHASH} sctk +mv SCTK-${SCTK_GITHASH}* sctk-${SCTK_GITHASH} +ln -s sctk-${SCTK_GITHASH} sctk +touch sctk-${SCTK_GITHASH}.tar.gz + +rm -f sctk/.compiled +CFLAGS="${SCTK_CXFLAGS}" CXXFLAGS="-std=c++11 ${SCTK_CXFLAGS}" ${MAKE} -C sctk config +CFLAGS="${SCTK_CXFLAGS}" CXXFLAGS="-std=c++11 ${SCTK_CXFLAGS}" ${MAKE} -C sctk all doc +${MAKE} -C sctk install +touch sctk/.compiled \ No newline at end of file diff --git a/tools/extras/install_sox.sh b/tools/extras/install_sox.sh new file mode 100755 index 00000000..ddb1151c --- /dev/null +++ b/tools/extras/install_sox.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +apt install -y libvorbis-dev libmp3lame-dev libmad-ocaml-dev +test -d sox-14.4.2 || wget --no-check-certificate https://nchc.dl.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.gz +tar -xvzf sox-14.4.2.tar.gz -C . +cd sox-14.4.2 && ./configure --prefix=/usr/ && make -j4 && make install \ No newline at end of file diff --git a/tools/extras/install_venv.sh b/tools/extras/install_venv.sh new file mode 100755 index 00000000..16679b79 --- /dev/null +++ b/tools/extras/install_venv.sh @@ -0,0 +1,36 @@ + +#!/bin/bash +# copy from Espnet + +set -euo pipefail + +if [ $# -eq 0 ] || [ $# -gt 2 ]; then + echo "Usage: $0 [venv-path]" + echo "e.g." + echo "$0 \$(which python3)" + exit 1; +elif [ $# -eq 2 ]; then + PYTHON="$1" + VENV="$2" +elif [ $# -eq 1 ]; then + PYTHON="$1" + VENV="venv" +fi + +if ! "${PYTHON}" -m venv --help > /dev/null 2>&1; then + echo "Error: ${PYTHON} is not Python3?" + exit 1 +fi +if [ -e activate_python.sh ]; then + echo "Warning: activate_python.sh already exists. It will be overwritten" +fi + +"${PYTHON}" -m venv ${VENV} +cat << EOF > activate_python.sh +#!/usr/bin/env bash +# THIS FILE IS GENERATED BY tools/setup_venv.sh +. $(cd ${VENV}; pwd)/bin/activate +EOF + +. ./activate_python.sh +python3 -m pip install -U pip wheel \ No newline at end of file
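Taken together, this patch replaces the old venv-based bootstrap with a top-level setup.sh plus per-tool scripts under tools/extras, moves the GE2E recipe from paddlespeech/t2s to paddlespeech/vector, and standardizes training entry points on a single --ngpu flag (0 means CPU, values greater than 1 spawn that many GPU workers via paddle.distributed.spawn). A minimal usage sketch under those assumptions follows; the config path and any flags beyond --config/--ngpu are illustrative placeholders, not taken from this patch.

```bash
# Hypothetical end-to-end bootstrap and training launch on a fresh Linux box.
# Only setup.sh, tools/extras/install_miniconda.sh, and the --config/--ngpu
# flags come from the patch above; everything else is an assumed example.

# 1. Install conda and the project dependencies, mirroring the new top-level
#    setup.sh (conda deps, requirements.txt, auto_log, ctcdecoder, third_party).
bash tools/extras/install_miniconda.sh   # now also runs `conda init`
bash setup.sh

# 2. Launch GE2E training from its new location under paddlespeech/vector.
#    --ngpu replaces the old --device/--nprocs pair.
python -u paddlespeech/vector/exps/ge2e/train.py \
    --config conf/ge2e.yaml --ngpu 1     # single GPU (conf path is illustrative)

python -u paddlespeech/vector/exps/ge2e/train.py \
    --config conf/ge2e.yaml --ngpu 4     # spawns 4 GPU workers via dist.spawn
```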