diff --git a/.pre-commit-hooks/copyright-check.hook b/.pre-commit-hooks/copyright-check.hook index 761edbc01..5a409e062 100644 --- a/.pre-commit-hooks/copyright-check.hook +++ b/.pre-commit-hooks/copyright-check.hook @@ -19,7 +19,7 @@ import subprocess import platform COPYRIGHT = ''' -Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -128,4 +128,4 @@ def main(argv=None): if __name__ == '__main__': - exit(main()) \ No newline at end of file + exit(main()) diff --git a/README.md b/README.md index 0cb99d1c6..fbbb1480f 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,10 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update -- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo](./demos/TTSArmLinux). +- 🔥 2023.03.14: Add SVS(Singing Voice Synthesis) examples with Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5), the effect is continuously optimized. +- 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3). +- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo (with C++ Chinese Text Frontend)](./demos/TTSArmLinux). +- 🔥 2023.03.03 Add Voice Conversion [StarGANv2-VC synthesize pipeline](./examples/vctk/vc3). - 🎉 2023.02.16: Add [Cantonese TTS](./examples/canton/tts3). - 🔥 2023.01.10: Add [code-switch asr CLI and Demos](./demos/speech_recognition). - 👑 2023.01.06: Add [code-switch asr tal_cs recipe](./examples/tal_cs/asr1/). @@ -575,14 +578,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Text Frontend -   - - tn / g2p - + Text Frontend +   + + tn / g2p + - Acoustic Model + Acoustic Model Tacotron2 LJSpeech / CSMSC @@ -617,6 +620,13 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en + + DiffSinger + Opencpop + + DiffSinger-opencpop + + Vocoder WaveFlow @@ -627,9 +637,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r Parallel WaveGAN - LJSpeech / VCTK / CSMSC / AISHELL-3 + LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop - PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 + PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop @@ -648,9 +658,9 @@ PaddleSpeech supports a series of most popular models. 
They are summarized in [r HiFiGAN - LJSpeech / VCTK / CSMSC / AISHELL-3 + LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop - HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 + HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop diff --git a/README_cn.md b/README_cn.md index 0f2adf811..4d991f3e8 100644 --- a/README_cn.md +++ b/README_cn.md @@ -183,7 +183,10 @@ - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 ### 近期更新 -- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例](./demos/TTSArmLinux)。 +- 🔥 2023.03.14: 新增基于 Opencpop 数据集的 SVS (歌唱合成) 示例,包含 [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) 和 [HiFiGAN](./examples/opencpop/voc5),效果持续优化中。 +- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。 +- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例 (包含 C++ 中文文本前端模块)](./demos/TTSArmLinux)。 +- 🔥 2023.03.03: 新增声音转换模型 [StarGANv2-VC 合成流程](./examples/vctk/vc3)。 - 🎉 2023.02.16: 新增[粤语语音合成](./examples/canton/tts3)。 - 🔥 2023.01.10: 新增[中英混合 ASR CLI 和 Demos](./demos/speech_recognition)。 - 👑 2023.01.06: 新增 [ASR 中英混合 tal_cs 训练推理流程](./examples/tal_cs/asr1/)。 @@ -574,43 +577,50 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 tn / g2p - - - 声学模型 + + + 声学模型 Tacotron2 LJSpeech / CSMSC tacotron2-ljspeech / tacotron2-csmsc - - + + Transformer TTS LJSpeech transformer-ljspeech - - + + SpeedySpeech CSMSC speedyspeech-csmsc - - + + FastSpeech2 LJSpeech / VCTK / CSMSC / AISHELL-3 / ZH_EN / finetune fastspeech2-ljspeech / fastspeech2-vctk / fastspeech2-csmsc / fastspeech2-aishell3 / fastspeech2-zh_en / fastspeech2-finetune - - + + ERNIE-SAT VCTK / AISHELL-3 / ZH_EN ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en - + + + DiffSinger + Opencpop + + DiffSinger-opencpop + + 声码器 WaveFlow @@ -621,9 +631,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 Parallel WaveGAN - LJSpeech / VCTK / CSMSC / AISHELL-3 + LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop - PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 + PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop @@ -642,9 +652,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 HiFiGAN - LJSpeech / VCTK / CSMSC / AISHELL-3 + LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop - HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 + HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop @@ -701,6 +711,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 + **声音分类** diff --git a/demos/TTSArmLinux/.gitignore b/demos/TTSArmLinux/.gitignore index 13135e376..f18480d7a 100644 --- a/demos/TTSArmLinux/.gitignore +++ b/demos/TTSArmLinux/.gitignore @@ -1,4 +1,8 @@ +# 目录 build/ output/ libs/ models/ + +# 符号连接 +dict diff --git a/demos/TTSArmLinux/README.md b/demos/TTSArmLinux/README.md index 32b85e0a4..a4ccba6c8 100644 --- a/demos/TTSArmLinux/README.md +++ b/demos/TTSArmLinux/README.md @@ -10,9 +10,9 @@ ### 安装依赖 -``` +```bash # Ubuntu -sudo apt install build-essential cmake wget tar unzip +sudo apt install build-essential cmake pkg-config wget tar unzip # CentOS sudo yum groupinstall "Development Tools" @@ -25,15 +25,13 @@ sudo yum install cmake wget tar unzip 可用以下命令下载: -``` -git clone https://github.com/PaddlePaddle/PaddleSpeech.git -cd PaddleSpeech/demos/TTSArmLinux +```bash ./download.sh ``` ### 编译 Demo -``` +```bash ./build.sh ``` @@ -43,12 +41,18 @@ cd PaddleSpeech/demos/TTSArmLinux ### 运行 -``` +你可以修改 `./front.conf` 中 `--phone2id_path` 参数为你自己的声学模型的 `phone_id_map.txt` 。 + +```bash ./run.sh +./run.sh --sentence "语音合成测试" +./run.sh --sentence "输出到指定的音频文件" 
--output_wav ./output/test.wav +./run.sh --help ``` -将把 [src/main.cpp](src/main.cpp) 里定义在 `sentencesToChoose` 数组中的十句话转换为 `wav` 文件,保存在 `output` 文件夹中。 +目前只支持中文合成,出现任何英文都会导致程序崩溃。 +如果未指定`--wav_file`,默认输出到`./output/tts.wav`。 ## 手动编译 Paddle Lite 库 diff --git a/demos/TTSArmLinux/build-depends.sh b/demos/TTSArmLinux/build-depends.sh new file mode 120000 index 000000000..fd3aec9c8 --- /dev/null +++ b/demos/TTSArmLinux/build-depends.sh @@ -0,0 +1 @@ +src/TTSCppFrontend/build-depends.sh \ No newline at end of file diff --git a/demos/TTSArmLinux/build.sh b/demos/TTSArmLinux/build.sh index c872e5749..5d31173ef 100755 --- a/demos/TTSArmLinux/build.sh +++ b/demos/TTSArmLinux/build.sh @@ -1,8 +1,11 @@ #!/bin/bash set -e +set -x cd "$(dirname "$(realpath "$0")")" +BASE_DIR="$PWD" + # load configure . ./config.sh @@ -10,11 +13,17 @@ cd "$(dirname "$(realpath "$0")")" echo "ARM_ABI is ${ARM_ABI}" echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}" -rm -rf build -mkdir -p build -cd build +echo "Build depends..." +./build-depends.sh "$@" +mkdir -p "$BASE_DIR/build" +cd "$BASE_DIR/build" cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src -make + +if [ "$*" = "" ]; then + make -j$(nproc) +else + make "$@" +fi echo "make successful!" diff --git a/demos/TTSArmLinux/clean.sh b/demos/TTSArmLinux/clean.sh index 1ea365566..2743801c3 100755 --- a/demos/TTSArmLinux/clean.sh +++ b/demos/TTSArmLinux/clean.sh @@ -1,8 +1,11 @@ #!/bin/bash set -e +set -x cd "$(dirname "$(realpath "$0")")" +BASE_DIR="$PWD" + # load configure . ./config.sh @@ -12,3 +15,9 @@ set -x rm -rf "$OUTPUT_DIR" rm -rf "$LIBS_DIR" rm -rf "$MODELS_DIR" +rm -rf "$BASE_DIR/build" + +"$BASE_DIR/src/TTSCppFrontend/clean.sh" + +# 符号连接 +rm "$BASE_DIR/dict" diff --git a/demos/TTSArmLinux/config.sh b/demos/TTSArmLinux/config.sh index 0a04f18ee..bf38d7d6d 100644 --- a/demos/TTSArmLinux/config.sh +++ b/demos/TTSArmLinux/config.sh @@ -10,5 +10,6 @@ OUTPUT_DIR="${PWD}/output" PADDLE_LITE_DIR="${LIBS_DIR}/inference_lite_lib.armlinux.${ARM_ABI}.gcc.with_extra.with_cv/cxx" #PADDLE_LITE_DIR="/path/to/Paddle-Lite/build.lite.linux.${ARM_ABI}.gcc/inference_lite_lib.armlinux.${ARM_ABI}/cxx" -AM_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb" -VOC_MODEL_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb" +ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb" +VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb" +FRONT_CONF="${PWD}/front.conf" diff --git a/demos/TTSArmLinux/download.sh b/demos/TTSArmLinux/download.sh index 560374bc9..7eaa836a5 100755 --- a/demos/TTSArmLinux/download.sh +++ b/demos/TTSArmLinux/download.sh @@ -3,6 +3,8 @@ set -e cd "$(dirname "$(realpath "$0")")" +BASE_DIR="$PWD" + # load configure . ./config.sh @@ -38,6 +40,10 @@ download() { echo '=======================' } +######################################## + +echo "Download models..." + download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ 'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ '39e0c6604f97c70f5d13c573d7e709b9' \ @@ -54,3 +60,11 @@ download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \ "$MODELS_DIR" echo "Done." + +######################################## + +echo "Download dictionary files..." 
+ +ln -s src/TTSCppFrontend/front_demo/dict "$BASE_DIR/" + +"$BASE_DIR/src/TTSCppFrontend/download.sh" diff --git a/demos/TTSArmLinux/front.conf b/demos/TTSArmLinux/front.conf new file mode 100644 index 000000000..04bd2d97f --- /dev/null +++ b/demos/TTSArmLinux/front.conf @@ -0,0 +1,21 @@ +# jieba conf +--jieba_dict_path=./dict/jieba/jieba.dict.utf8 +--jieba_hmm_path=./dict/jieba/hmm_model.utf8 +--jieba_user_dict_path=./dict/jieba/user.dict.utf8 +--jieba_idf_path=./dict/jieba/idf.utf8 +--jieba_stop_word_path=./dict/jieba/stop_words.utf8 + +# dict conf fastspeech2_0.4 +--seperate_tone=false +--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict +--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict + +# dict conf speedyspeech_0.5 +#--seperate_tone=true +#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict +#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt +#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt + +# dict of tranditional_to_simplified +--trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt diff --git a/demos/TTSArmLinux/run.sh b/demos/TTSArmLinux/run.sh index efcb61b5b..d0860f044 100755 --- a/demos/TTSArmLinux/run.sh +++ b/demos/TTSArmLinux/run.sh @@ -7,12 +7,13 @@ cd "$(dirname "$(realpath "$0")")" . ./config.sh # create dir -rm -rf "$OUTPUT_DIR" mkdir -p "$OUTPUT_DIR" # run -for i in {1..10}; do - (set -x; ./build/paddlespeech_tts_demo "$AM_MODEL_PATH" "$VOC_MODEL_PATH" $i "$OUTPUT_DIR/$i.wav") -done - -ls -lh "$OUTPUT_DIR"/*.wav +set -x +./build/paddlespeech_tts_demo \ + --front_conf "$FRONT_CONF" \ + --acoustic_model "$ACOUSTIC_MODEL_PATH" \ + --vocoder "$VOCODER_PATH" \ + "$@" +# end diff --git a/demos/TTSArmLinux/src/CMakeLists.txt b/demos/TTSArmLinux/src/CMakeLists.txt index e1076af92..f8240d0ce 100644 --- a/demos/TTSArmLinux/src/CMakeLists.txt +++ b/demos/TTSArmLinux/src/CMakeLists.txt @@ -1,4 +1,18 @@ cmake_minimum_required(VERSION 3.10) +project(paddlespeech_tts_demo) + + +########## Global Options ########## + +option(WITH_FRONT_DEMO "Build front demo" OFF) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(ABSL_PROPAGATE_CXX_STD ON) + + +########## ARM Options ########## + set(CMAKE_SYSTEM_NAME Linux) if(ARM_ABI STREQUAL "armv8") set(CMAKE_SYSTEM_PROCESSOR aarch64) @@ -13,14 +27,16 @@ else() return() endif() -project(paddlespeech_tts_demo) + +########## Paddle Lite Options ########## + message(STATUS "TARGET ARCH ABI: ${ARM_ABI}") message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}") include_directories(${PADDLE_LITE_DIR}/include) link_directories(${PADDLE_LITE_DIR}/libs/${ARM_ABI}) link_directories(${PADDLE_LITE_DIR}/lib) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + if(ARM_ABI STREQUAL "armv8") set(CMAKE_CXX_FLAGS "-march=armv8-a ${CMAKE_CXX_FLAGS}") set(CMAKE_C_FLAGS "-march=armv8-a ${CMAKE_C_FLAGS}") @@ -29,6 +45,9 @@ elseif(ARM_ABI STREQUAL "armv7hf") set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) endif() + +########## Dependencies ########## + find_package(OpenMP REQUIRED) if(OpenMP_FOUND OR OpenMP_CXX_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") @@ -43,5 +62,19 @@ else() return() endif() + +############### tts cpp frontend ############### + +add_subdirectory(TTSCppFrontend) + +include_directories( + TTSCppFrontend/src + third-party/build/src/cppjieba/include + 
third-party/build/src/limonp/include +) + + +############### paddlespeech_tts_demo ############### + add_executable(paddlespeech_tts_demo main.cc) -target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared) +target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared paddlespeech_tts_front) diff --git a/demos/TTSArmLinux/src/Predictor.hpp b/demos/TTSArmLinux/src/Predictor.hpp index 221d51fc1..f173abb5c 100644 --- a/demos/TTSArmLinux/src/Predictor.hpp +++ b/demos/TTSArmLinux/src/Predictor.hpp @@ -1,7 +1,20 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include #include -#include #include +#include #include #include #include @@ -9,32 +22,84 @@ using namespace paddle::lite_api; -typedef int16_t WavDataType; +class PredictorInterface { + public: + virtual ~PredictorInterface() = 0; + virtual bool Init(const std::string &AcousticModelPath, + const std::string &VocoderPath, + PowerMode cpuPowerMode, + int cpuThreadNum, + // WAV采样率(必须与模型输出匹配) + // 如果播放速度和音调异常,请修改采样率 + // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 + uint32_t wavSampleRate) = 0; + virtual std::shared_ptr LoadModel( + const std::string &modelPath, + int cpuThreadNum, + PowerMode cpuPowerMode) = 0; + virtual void ReleaseModel() = 0; + virtual bool RunModel(const std::vector &phones) = 0; + virtual std::unique_ptr GetAcousticModelOutput( + const std::vector &phones) = 0; + virtual std::unique_ptr GetVocoderOutput( + std::unique_ptr &&amOutput) = 0; + virtual void VocoderOutputToWav( + std::unique_ptr &&vocOutput) = 0; + virtual void SaveFloatWav(float *floatWav, int64_t size) = 0; + virtual bool IsLoaded() = 0; + virtual float GetInferenceTime() = 0; + virtual int GetWavSize() = 0; + // 获取WAV持续时间(单位:毫秒) + virtual float GetWavDuration() = 0; + // 获取RTF(合成时间 / 音频时长) + virtual float GetRTF() = 0; + virtual void ReleaseWav() = 0; + virtual bool WriteWavToFile(const std::string &wavPath) = 0; +}; -class Predictor { -public: - bool Init(const std::string &AMModelPath, const std::string &VOCModelPath, int cpuThreadNum, const std::string &cpuPowerMode) { +PredictorInterface::~PredictorInterface() {} + +// WavDataType: WAV数据类型 +// 可在 int16_t 和 float 之间切换, +// 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV +template +class Predictor : public PredictorInterface { + public: + bool Init(const std::string &AcousticModelPath, + const std::string &VocoderPath, + PowerMode cpuPowerMode, + int cpuThreadNum, + // WAV采样率(必须与模型输出匹配) + // 如果播放速度和音调异常,请修改采样率 + // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 + uint32_t wavSampleRate) override { // Release model if exists ReleaseModel(); - AM_predictor_ = LoadModel(AMModelPath, cpuThreadNum, cpuPowerMode); - if (AM_predictor_ == nullptr) { + acoustic_model_predictor_ = + LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode); + if (acoustic_model_predictor_ == nullptr) { return false; } - VOC_predictor_ = LoadModel(VOCModelPath, cpuThreadNum, cpuPowerMode); - if (VOC_predictor_ == nullptr) { + 
vocoder_predictor_ = LoadModel(VocoderPath, cpuThreadNum, cpuPowerMode); + if (vocoder_predictor_ == nullptr) { return false; } + wav_sample_rate_ = wavSampleRate; + return true; } - ~Predictor() { + virtual ~Predictor() { ReleaseModel(); ReleaseWav(); } - std::shared_ptr LoadModel(const std::string &modelPath, int cpuThreadNum, const std::string &cpuPowerMode) { + std::shared_ptr LoadModel( + const std::string &modelPath, + int cpuThreadNum, + PowerMode cpuPowerMode) override { if (modelPath.empty()) { return nullptr; } @@ -43,33 +108,17 @@ public: MobileConfig config; config.set_model_from_file(modelPath); config.set_threads(cpuThreadNum); - - if (cpuPowerMode == "LITE_POWER_HIGH") { - config.set_power_mode(PowerMode::LITE_POWER_HIGH); - } else if (cpuPowerMode == "LITE_POWER_LOW") { - config.set_power_mode(PowerMode::LITE_POWER_LOW); - } else if (cpuPowerMode == "LITE_POWER_FULL") { - config.set_power_mode(PowerMode::LITE_POWER_FULL); - } else if (cpuPowerMode == "LITE_POWER_NO_BIND") { - config.set_power_mode(PowerMode::LITE_POWER_NO_BIND); - } else if (cpuPowerMode == "LITE_POWER_RAND_HIGH") { - config.set_power_mode(PowerMode::LITE_POWER_RAND_HIGH); - } else if (cpuPowerMode == "LITE_POWER_RAND_LOW") { - config.set_power_mode(PowerMode::LITE_POWER_RAND_LOW); - } else { - std::cerr << "Unknown cpu power mode!" << std::endl; - return nullptr; - } + config.set_power_mode(cpuPowerMode); return CreatePaddlePredictor(config); } - void ReleaseModel() { - AM_predictor_ = nullptr; - VOC_predictor_ = nullptr; + void ReleaseModel() override { + acoustic_model_predictor_ = nullptr; + vocoder_predictor_ = nullptr; } - bool RunModel(const std::vector &phones) { + bool RunModel(const std::vector &phones) override { if (!IsLoaded()) { return false; } @@ -78,28 +127,29 @@ public: auto start = std::chrono::system_clock::now(); // 执行推理 - VOCOutputToWav(GetAMOutput(phones)); + VocoderOutputToWav(GetVocoderOutput(GetAcousticModelOutput(phones))); // 计时结束 auto end = std::chrono::system_clock::now(); // 计算用时 std::chrono::duration duration = end - start; - inference_time_ = duration.count() * 1000; // 单位:毫秒 + inference_time_ = duration.count() * 1000; // 单位:毫秒 return true; } - std::unique_ptr GetAMOutput(const std::vector &phones) { - auto phones_handle = AM_predictor_->GetInput(0); + std::unique_ptr GetAcousticModelOutput( + const std::vector &phones) override { + auto phones_handle = acoustic_model_predictor_->GetInput(0); phones_handle->Resize({static_cast(phones.size())}); phones_handle->CopyFromCpu(phones.data()); - AM_predictor_->Run(); + acoustic_model_predictor_->Run(); // 获取输出Tensor - auto am_output_handle = AM_predictor_->GetOutput(0); + auto am_output_handle = acoustic_model_predictor_->GetOutput(0); // 打印输出Tensor的shape - std::cout << "AM Output shape: "; + std::cout << "Acoustic Model Output shape: "; auto shape = am_output_handle->shape(); for (auto s : shape) { std::cout << s << ", "; @@ -109,75 +159,91 @@ public: return am_output_handle; } - void VOCOutputToWav(std::unique_ptr &&input) { - auto mel_handle = VOC_predictor_->GetInput(0); + std::unique_ptr GetVocoderOutput( + std::unique_ptr &&amOutput) override { + auto mel_handle = vocoder_predictor_->GetInput(0); // [?, 80] - auto dims = input->shape(); + auto dims = amOutput->shape(); mel_handle->Resize(dims); - auto am_output_data = input->mutable_data(); + auto am_output_data = amOutput->mutable_data(); mel_handle->CopyFromCpu(am_output_data); - VOC_predictor_->Run(); + vocoder_predictor_->Run(); // 获取输出Tensor - auto voc_output_handle = 
VOC_predictor_->GetOutput(0); + auto voc_output_handle = vocoder_predictor_->GetOutput(0); // 打印输出Tensor的shape - std::cout << "VOC Output shape: "; + std::cout << "Vocoder Output shape: "; auto shape = voc_output_handle->shape(); for (auto s : shape) { std::cout << s << ", "; } std::cout << std::endl; + return voc_output_handle; + } + + void VocoderOutputToWav( + std::unique_ptr &&vocOutput) override { // 获取输出Tensor的数据 int64_t output_size = 1; - for (auto dim : voc_output_handle->shape()) { + for (auto dim : vocOutput->shape()) { output_size *= dim; } - auto output_data = voc_output_handle->mutable_data(); + auto output_data = vocOutput->mutable_data(); SaveFloatWav(output_data, output_size); } - inline float Abs(float number) { - return (number < 0) ? -number : number; - } + void SaveFloatWav(float *floatWav, int64_t size) override; - void SaveFloatWav(float *floatWav, int64_t size) { - wav_.resize(size); - float maxSample = 0.01; - // 寻找最大采样值 - for (int64_t i=0; i maxSample) { - maxSample = sample; - } - } - // 把采样值缩放到 int_16 范围 - for (int64_t i=0; i &GetWav() { return wav_; } - const std::vector & GetWav() { - return wav_; - } + int GetWavSize() override { return wav_.size() * sizeof(WavDataType); } - int GetWavSize() { - return wav_.size() * sizeof(WavDataType); + // 获取WAV持续时间(单位:毫秒) + float GetWavDuration() override { + return static_cast(GetWavSize()) / sizeof(WavDataType) / + static_cast(wav_sample_rate_) * 1000; } - void ReleaseWav() { - wav_.clear(); + // 获取RTF(合成时间 / 音频时长) + float GetRTF() override { return GetInferenceTime() / GetWavDuration(); } + + void ReleaseWav() override { wav_.clear(); } + + bool WriteWavToFile(const std::string &wavPath) override { + std::ofstream fout(wavPath, std::ios::binary); + if (!fout.is_open()) { + return false; + } + + // 写入头信息 + WavHeader header; + header.audio_format = GetWavAudioFormat(); + header.data_size = GetWavSize(); + header.size = sizeof(header) - 8 + header.data_size; + header.sample_rate = wav_sample_rate_; + header.byte_rate = header.sample_rate * header.num_channels * + header.bits_per_sample / 8; + header.block_align = header.num_channels * header.bits_per_sample / 8; + fout.write(reinterpret_cast(&header), sizeof(header)); + + // 写入wav数据 + fout.write(reinterpret_cast(wav_.data()), + header.data_size); + + fout.close(); + return true; } + protected: struct WavHeader { // RIFF 头 char riff[4] = {'R', 'I', 'F', 'F'}; @@ -187,15 +253,11 @@ public: // FMT 头 char fmt[4] = {'f', 'm', 't', ' '}; uint32_t fmt_size = 16; - uint16_t audio_format = 1; // 1为整数编码,3为浮点编码 + uint16_t audio_format = 0; uint16_t num_channels = 1; - - // 如果播放速度和音调异常,请修改采样率 - // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 - uint32_t sample_rate = 24000; - - uint32_t byte_rate = 64000; - uint16_t block_align = 2; + uint32_t sample_rate = 0; + uint32_t byte_rate = 0; + uint16_t block_align = 0; uint16_t bits_per_sample = sizeof(WavDataType) * 8; // DATA 头 @@ -203,30 +265,56 @@ public: uint32_t data_size = 0; }; - bool WriteWavToFile(const std::string &wavPath) { - std::ofstream fout(wavPath, std::ios::binary); - if (!fout.is_open()) { - return false; - } - - // 写入头信息 - WavHeader header; - header.data_size = GetWavSize(); - header.size = sizeof(header) - 8 + header.data_size; - header.byte_rate = header.sample_rate * header.num_channels * header.bits_per_sample / 8; - header.block_align = header.num_channels * header.bits_per_sample / 8; - fout.write(reinterpret_cast(&header), sizeof(header)); + enum WavAudioFormat { + WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式 + 
WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float 格式 + }; - // 写入wav数据 - fout.write(reinterpret_cast(wav_.data()), header.data_size); + protected: + // 返回值通过模板特化由 WavDataType 决定 + inline uint16_t GetWavAudioFormat(); - fout.close(); - return true; - } + inline float Abs(float number) { return (number < 0) ? -number : number; } -private: + protected: float inference_time_ = 0; - std::shared_ptr AM_predictor_ = nullptr; - std::shared_ptr VOC_predictor_ = nullptr; + uint32_t wav_sample_rate_ = 0; std::vector wav_; + std::shared_ptr acoustic_model_predictor_ = nullptr; + std::shared_ptr vocoder_predictor_ = nullptr; }; + +template <> +uint16_t Predictor::GetWavAudioFormat() { + return Predictor::WAV_FORMAT_16BIT_PCM; +} + +template <> +uint16_t Predictor::GetWavAudioFormat() { + return Predictor::WAV_FORMAT_32BIT_FLOAT; +} + +// 保存 16-bit PCM 格式 WAV +template <> +void Predictor::SaveFloatWav(float *floatWav, int64_t size) { + wav_.resize(size); + float maxSample = 0.01; + // 寻找最大采样值 + for (int64_t i = 0; i < size; i++) { + float sample = Abs(floatWav[i]); + if (sample > maxSample) { + maxSample = sample; + } + } + // 把采样值缩放到 int_16 范围 + for (int64_t i = 0; i < size; i++) { + wav_[i] = floatWav[i] * 32767.0f / maxSample; + } +} + +// 保存 32-bit IEEE float 格式 WAV +template <> +void Predictor::SaveFloatWav(float *floatWav, int64_t size) { + wav_.resize(size); + std::copy_n(floatWav, size, wav_.data()); +} diff --git a/demos/TTSArmLinux/src/TTSCppFrontend b/demos/TTSArmLinux/src/TTSCppFrontend new file mode 120000 index 000000000..25953976d --- /dev/null +++ b/demos/TTSArmLinux/src/TTSCppFrontend @@ -0,0 +1 @@ +../../TTSCppFrontend/ \ No newline at end of file diff --git a/demos/TTSArmLinux/src/main.cc b/demos/TTSArmLinux/src/main.cc index 0bf78a7de..0b8e26bc4 100644 --- a/demos/TTSArmLinux/src/main.cc +++ b/demos/TTSArmLinux/src/main.cc @@ -1,72 +1,162 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
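The refactored `Predictor.hpp` above fills the WAV header fields from `wav_sample_rate_`, `num_channels` and `bits_per_sample`, and derives duration and RTF from the buffer size. Below is a small standalone check of that arithmetic, not part of the patch; the sample count and inference time are hypothetical, and the defaults (24000 Hz, mono, 16-bit PCM) are taken from `main.cc` and the `WavHeader` definition.

```cpp
// Standalone illustration of the WAV header / duration / RTF arithmetic used
// by Predictor::WriteWavToFile, GetWavDuration and GetRTF above.
// Assumes the demo defaults: 24000 Hz, mono, WavDataType = int16_t.
#include <cstdint>
#include <iostream>

int main() {
    const uint32_t sample_rate = 24000;      // --wav_sample_rate default
    const uint16_t num_channels = 1;
    const uint16_t bits_per_sample = sizeof(int16_t) * 8;  // 16
    const int64_t num_samples = 72000;       // hypothetical: 3 s of audio

    const uint32_t byte_rate = sample_rate * num_channels * bits_per_sample / 8;  // 48000
    const uint16_t block_align = num_channels * bits_per_sample / 8;              // 2
    const uint32_t data_size = num_samples * sizeof(int16_t);                     // 144000 bytes

    // GetWavDuration(): bytes -> samples -> milliseconds
    const float duration_ms =
        static_cast<float>(data_size) / sizeof(int16_t) / sample_rate * 1000;     // 3000 ms

    // GetRTF(): synthesis time divided by audio duration
    const float inference_time_ms = 600.0f;  // hypothetical value
    const float rtf = inference_time_ms / duration_ms;                            // 0.2

    std::cout << "byte_rate=" << byte_rate << " block_align=" << block_align
              << " duration=" << duration_ms << "ms RTF=" << rtf << std::endl;
    return 0;
}
```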
+ +#include +#include +#include +#include #include #include +#include #include -#include "paddle_api.h" +#include #include "Predictor.hpp" using namespace paddle::lite_api; -std::vector> sentencesToChoose = { - // 009901 昨日,这名“伤者”与医生全部被警方依法刑事拘留。 - {261, 231, 175, 116, 179, 262, 44, 154, 126, 177, 19, 262, 42, 241, 72, 177, 56, 174, 245, 37, 186, 37, 49, 151, 127, 69, 19, 179, 72, 69, 4, 260, 126, 177, 116, 151, 239, 153, 141}, - // 009902 钱伟长想到上海来办学校是经过深思熟虑的。 - {174, 83, 213, 39, 20, 260, 89, 40, 30, 177, 22, 71, 9, 153, 8, 37, 17, 260, 251, 260, 99, 179, 177, 116, 151, 125, 70, 233, 177, 51, 176, 108, 177, 184, 153, 242, 40, 45}, - // 009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 - {182, 2, 151, 85, 232, 73, 151, 123, 154, 52, 151, 143, 154, 5, 179, 39, 113, 69, 17, 177, 114, 105, 154, 5, 179, 154, 5, 40, 45, 232, 182, 8, 37, 186, 174, 74, 182, 168}, - // 009904 李述德在离开之前,只说了一句“柱驼杀父亲了”。 - {153, 74, 177, 186, 40, 42, 261, 10, 153, 73, 152, 7, 262, 113, 174, 83, 179, 262, 115, 177, 230, 153, 45, 73, 151, 242, 180, 262, 186, 182, 231, 177, 2, 69, 186, 174, 124, 153, 45}, - // 009905 这种车票和保险单捆绑出售属于重复性购买。 - {262, 44, 262, 163, 39, 41, 173, 99, 71, 42, 37, 28, 260, 84, 40, 14, 179, 152, 220, 37, 21, 39, 183, 177, 170, 179, 177, 185, 240, 39, 162, 69, 186, 260, 128, 70, 170, 154, 9}, - // 009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 - {40, 10, 173, 49, 155, 72, 40, 45, 155, 15, 142, 260, 72, 154, 74, 153, 186, 179, 151, 103, 39, 22, 174, 126, 70, 41, 179, 175, 22, 182, 2, 69, 46, 39, 20, 152, 7, 260, 120}, - // 009907 观大势、谋大局、出大策始终是该院的办院方针。 - {70, 199, 40, 5, 177, 116, 154, 168, 40, 5, 151, 240, 179, 39, 183, 40, 5, 38, 44, 179, 177, 115, 262, 161, 177, 116, 70, 7, 247, 40, 45, 37, 17, 247, 69, 19, 262, 51}, - // 009908 他们骑着摩托回家,正好为农忙时的父母帮忙。 - {182, 2, 154, 55, 174, 73, 262, 45, 154, 157, 182, 230, 71, 212, 151, 77, 180, 262, 59, 71, 29, 214, 155, 162, 154, 20, 177, 114, 40, 45, 69, 186, 154, 185, 37, 19, 154, 20}, - // 009909 但是因为还没到退休年龄,只能掰着指头捱日子。 - {40, 17, 177, 116, 120, 214, 71, 8, 154, 47, 40, 30, 182, 214, 260, 140, 155, 83, 153, 126, 180, 262, 115, 155, 57, 37, 7, 262, 45, 262, 115, 182, 171, 8, 175, 116, 261, 112}, - // 009910 这几天雨水不断,人们恨不得待在家里不出门。 - {262, 44, 151, 74, 182, 82, 240, 177, 213, 37, 184, 40, 202, 180, 175, 52, 154, 55, 71, 54, 37, 186, 40, 42, 40, 7, 261, 10, 151, 77, 153, 74, 37, 186, 39, 183, 154, 52}, -}; - -void usage(const char *binName) { - std::cerr << "Usage:" << std::endl - << "\t" << binName << " " << std::endl; -} +DEFINE_string( + sentence, + "你好,欢迎使用语音合成服务", + "Text to be synthesized (Chinese only. 
English will crash the program.)"); +DEFINE_string(front_conf, "./front.conf", "Front configuration file"); +DEFINE_string(acoustic_model, + "./models/cpu/fastspeech2_csmsc_arm.nb", + "Acoustic model .nb file"); +DEFINE_string(vocoder, + "./models/cpu/fastspeech2_csmsc_arm.nb", + "vocoder .nb file"); +DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file"); +DEFINE_string(wav_bit_depth, + "16", + "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)"); +DEFINE_string(wav_sample_rate, + "24000", + "WAV sample rate, should match the output of the vocoder"); +DEFINE_string(cpu_thread, "1", "CPU thread numbers"); int main(int argc, char *argv[]) { - if (argc < 5) { - usage(argv[0]); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + PredictorInterface *predictor; + + if (FLAGS_wav_bit_depth == "16") { + predictor = new Predictor(); + } else if (FLAGS_wav_bit_depth == "32") { + predictor = new Predictor(); + } else { + LOG(ERROR) << "Unsupported WAV bit depth: " << FLAGS_wav_bit_depth; return -1; } - const char *AMModelPath = argv[1]; - const char *VOCModelPath = argv[2]; - int sentencesIndex = atoi(argv[3]) - 1; - const char *outputWavPath = argv[4]; - if (sentencesIndex < 0 || sentencesIndex >= sentencesToChoose.size()) { - std::cerr << "sentences-index out of range" << std::endl; + + /////////////////////////// 前端:文本转音素 /////////////////////////// + + // 实例化文本前端引擎 + ppspeech::FrontEngineInterface *front_inst = nullptr; + front_inst = new ppspeech::FrontEngineInterface(FLAGS_front_conf); + if ((!front_inst) || (front_inst->init())) { + LOG(ERROR) << "Creater tts engine failed!"; + if (front_inst != nullptr) { + delete front_inst; + } + front_inst = nullptr; return -1; } - Predictor predictor; - if (!predictor.Init(AMModelPath, VOCModelPath, 1, "LITE_POWER_HIGH")) { - std::cerr << "predictor init failed" << std::endl; + std::wstring ws_sentence = ppspeech::utf8string2wstring(FLAGS_sentence); + + // 繁体转简体 + std::wstring sentence_simp; + front_inst->Trand2Simp(ws_sentence, &sentence_simp); + ws_sentence = sentence_simp; + + std::string s_sentence; + std::vector sentence_part; + std::vector phoneids = {}; + std::vector toneids = {}; + + // 根据标点进行分句 + LOG(INFO) << "Start to segment sentences by punctuation"; + front_inst->SplitByPunc(ws_sentence, &sentence_part); + LOG(INFO) << "Segment sentences through punctuation successfully"; + + // 分句后获取音素id + LOG(INFO) + << "Start to get the phoneme and tone id sequence of each sentence"; + for (int i = 0; i < sentence_part.size(); i++) { + LOG(INFO) << "Raw sentence is: " + << ppspeech::wstring2utf8string(sentence_part[i]); + front_inst->SentenceNormalize(&sentence_part[i]); + s_sentence = ppspeech::wstring2utf8string(sentence_part[i]); + LOG(INFO) << "After normalization sentence is: " << s_sentence; + + if (0 != front_inst->GetSentenceIds(s_sentence, &phoneids, &toneids)) { + LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed"; + return -1; + } + } + LOG(INFO) << "The phoneids of the sentence is: " + << limonp::Join(phoneids.begin(), phoneids.end(), " "); + LOG(INFO) << "The toneids of the sentence is: " + << limonp::Join(toneids.begin(), toneids.end(), " "); + LOG(INFO) << "Get the phoneme id sequence of each sentence successfully"; + + + /////////////////////////// 后端:音素转音频 /////////////////////////// + + // WAV采样率(必须与模型输出匹配) + // 如果播放速度和音调异常,请修改采样率 + // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 + const uint32_t wavSampleRate = std::stoul(FLAGS_wav_sample_rate); + + // CPU线程数 + const int cpuThreadNum = 
std::stol(FLAGS_cpu_thread); + + // CPU电源模式 + const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH; + + if (!predictor->Init(FLAGS_acoustic_model, + FLAGS_vocoder, + cpuPowerMode, + cpuThreadNum, + wavSampleRate)) { + LOG(ERROR) << "predictor init failed" << std::endl; return -1; } - if (!predictor.RunModel(sentencesToChoose[sentencesIndex])) { - std::cerr << "predictor run model failed" << std::endl; + std::vector phones(phoneids.size()); + std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { + return static_cast(x); + }); + + if (!predictor->RunModel(phones)) { + LOG(ERROR) << "predictor run model failed" << std::endl; return -1; } - std::cout << "Inference time: " << predictor.GetInferenceTime() << " ms, " - << "WAV size (without header): " << predictor.GetWavSize() << " bytes" << std::endl; + LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, " + << "WAV size (without header): " << predictor->GetWavSize() + << " bytes, " + << "WAV duration: " << predictor->GetWavDuration() << " ms, " + << "RTF: " << predictor->GetRTF() << std::endl; - if (!predictor.WriteWavToFile(outputWavPath)) { - std::cerr << "write wav file failed" << std::endl; + if (!predictor->WriteWavToFile(FLAGS_output_wav)) { + LOG(ERROR) << "write wav file failed" << std::endl; return -1; } + delete predictor; + return 0; } diff --git a/demos/TTSArmLinux/src/third-party b/demos/TTSArmLinux/src/third-party new file mode 120000 index 000000000..851b2c1ec --- /dev/null +++ b/demos/TTSArmLinux/src/third-party @@ -0,0 +1 @@ +TTSCppFrontend/third-party \ No newline at end of file diff --git a/demos/TTSCppFrontend/.gitignore b/demos/TTSCppFrontend/.gitignore new file mode 100644 index 000000000..0075a9011 --- /dev/null +++ b/demos/TTSCppFrontend/.gitignore @@ -0,0 +1,2 @@ +build/ +dict/ diff --git a/demos/TTSCppFrontend/CMakeLists.txt b/demos/TTSCppFrontend/CMakeLists.txt new file mode 100644 index 000000000..14245372b --- /dev/null +++ b/demos/TTSCppFrontend/CMakeLists.txt @@ -0,0 +1,63 @@ +cmake_minimum_required(VERSION 3.10) +project(paddlespeech_tts_cpp) + + +########## Global Options ########## + +option(WITH_FRONT_DEMO "Build front demo" ON) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(ABSL_PROPAGATE_CXX_STD ON) + + +########## Dependencies ########## + +set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/third-party/build/lib/pkgconfig:${CMAKE_SOURCE_DIR}/third-party/build/lib64/pkgconfig") +find_package(PkgConfig REQUIRED) + +# It is hard to load xxx-config.cmake in a custom location, so use pkgconfig instead. 
+pkg_check_modules(ABSL REQUIRED absl_strings IMPORTED_TARGET) +pkg_check_modules(GFLAGS REQUIRED gflags IMPORTED_TARGET) +pkg_check_modules(GLOG REQUIRED libglog IMPORTED_TARGET) + +# load header-only libraries +include_directories( + ${CMAKE_SOURCE_DIR}/third-party/build/src/cppjieba/include + ${CMAKE_SOURCE_DIR}/third-party/build/src/limonp/include +) + +find_package(Threads REQUIRED) + + +########## paddlespeech_tts_front ########## + +include_directories(src) + +file(GLOB FRONT_SOURCES + ./src/base/*.cpp + ./src/front/*.cpp +) +add_library(paddlespeech_tts_front STATIC ${FRONT_SOURCES}) + +target_link_libraries( + paddlespeech_tts_front + PUBLIC + PkgConfig::GFLAGS + PkgConfig::GLOG + PkgConfig::ABSL + Threads::Threads +) + + +########## tts_front_demo ########## + +if (WITH_FRONT_DEMO) + + file(GLOB FRONT_DEMO_SOURCES front_demo/*.cpp) + add_executable(tts_front_demo ${FRONT_DEMO_SOURCES}) + + target_include_directories(tts_front_demo PRIVATE ./front_demo) + target_link_libraries(tts_front_demo PRIVATE paddlespeech_tts_front) + +endif (WITH_FRONT_DEMO) diff --git a/demos/TTSCppFrontend/README.md b/demos/TTSCppFrontend/README.md new file mode 100644 index 000000000..c179fdd04 --- /dev/null +++ b/demos/TTSCppFrontend/README.md @@ -0,0 +1,56 @@ +# PaddleSpeech TTS CPP Frontend + +A TTS frontend that implements text-to-phoneme conversion. + +Currently it only supports Chinese, any English word will crash the demo. + +## Install Build Tools + +```bash +# Ubuntu +sudo apt install build-essential cmake pkg-config + +# CentOS +sudo yum groupinstall "Development Tools" +sudo yum install cmake +``` + +If your cmake version is too old, you can go here to download a precompiled new version: https://cmake.org/download/ + +## Build + +```bash +# Build with all CPU cores +./build.sh + +# Build with 1 core +./build.sh -j1 +``` + +Dependent libraries will be automatically downloaded to the `third-party/build` folder. + +If the download speed is too slow, you can open [third-party/CMakeLists.txt](third-party/CMakeLists.txt) and modify `GIT_REPOSITORY` URLs. + +## Download dictionary files + +```bash +./download.sh +``` + +## Run +You can change `--phone2id_path` in `./front_demo/front.conf` to the `phone_id_map.txt` of your own acoustic model. + +```bash +./run_front_demo.sh +./run_front_demo.sh --help +./run_front_demo.sh --sentence "这是语音合成服务的文本前端,用于将文本转换为音素序号数组。" +./run_front_demo.sh --front_conf ./front_demo/front.conf --sentence "你还需要一个语音合成后端才能将其转换为实际的声音。" +``` + +## Clean + +```bash +./clean.sh +``` + +The folders `front_demo/dict`, `build` and `third-party/build` will be deleted. diff --git a/demos/TTSCppFrontend/build-depends.sh b/demos/TTSCppFrontend/build-depends.sh new file mode 100755 index 000000000..c5f2ca125 --- /dev/null +++ b/demos/TTSCppFrontend/build-depends.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +set -x + +cd "$(dirname "$(realpath "$0")")" + +cd ./third-party + +mkdir -p build +cd build + +cmake .. + +if [ "$*" = "" ]; then + make -j$(nproc) +else + make "$@" +fi + +echo "Done." diff --git a/demos/TTSCppFrontend/build.sh b/demos/TTSCppFrontend/build.sh new file mode 100755 index 000000000..a136cb936 --- /dev/null +++ b/demos/TTSCppFrontend/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e +set -x + +cd "$(dirname "$(realpath "$0")")" + +echo "************* Download & Build Dependencies *************" +./build-depends.sh "$@" + +echo "************* Build Front Lib and Demo *************" +mkdir -p ./build +cd ./build +cmake .. 
+ +if [ "$*" = "" ]; then + make -j$(nproc) +else + make "$@" +fi + +echo "Done." diff --git a/demos/TTSCppFrontend/clean.sh b/demos/TTSCppFrontend/clean.sh new file mode 100755 index 000000000..efbb28871 --- /dev/null +++ b/demos/TTSCppFrontend/clean.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -e +set -x + +cd "$(dirname "$(realpath "$0")")" +rm -rf "./front_demo/dict" +rm -rf "./build" +rm -rf "./third-party/build" + +echo "Done." diff --git a/demos/TTSCppFrontend/download.sh b/demos/TTSCppFrontend/download.sh new file mode 100755 index 000000000..0953e3a59 --- /dev/null +++ b/demos/TTSCppFrontend/download.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -e + +cd "$(dirname "$(realpath "$0")")" + +download() { + file="$1" + url="$2" + md5="$3" + dir="$4" + + cd "$dir" + + if [ -f "$file" ] && [ "$(md5sum "$file" | awk '{ print $1 }')" = "$md5" ]; then + echo "File $file (MD5: $md5) has been downloaded." + else + echo "Downloading $file..." + wget -O "$file" "$url" + + # MD5 verify + fileMd5="$(md5sum "$file" | awk '{ print $1 }')" + if [ "$fileMd5" == "$md5" ]; then + echo "File $file (MD5: $md5) has been downloaded." + else + echo "MD5 mismatch, file may be corrupt" + echo "$file MD5: $fileMd5, it should be $md5" + fi + fi + + echo "Extracting $file..." + echo '-----------------------' + tar -vxf "$file" + echo '=======================' +} + +######################################## + +DIST_DIR="$PWD/front_demo/dict" + +mkdir -p "$DIST_DIR" + +download 'fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \ + '7bf1bab1737375fa123c413eb429c573' \ + "$DIST_DIR" + +download 'speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \ + '0b7754b21f324789aef469c61f4d5b8f' \ + "$DIST_DIR" + +download 'jieba.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/jieba.tar.gz' \ + '6d30f426bd8c0025110a483f051315ca' \ + "$DIST_DIR" + +download 'tranditional_to_simplified.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/tranditional_to_simplified.tar.gz' \ + '258f5b59d5ebfe96d02007ca1d274a7f' \ + "$DIST_DIR" + +echo "Done." 
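Before the frontend demo sources that follow, here is a condensed sketch of how the `paddlespeech_tts_front` library built above is driven, distilled from `front_demo/front_demo.cpp` and `TTSArmLinux/src/main.cc` elsewhere in this diff. It is illustrative only: the element types of `parts`, `phoneids` and `toneids` are assumed to be `std::wstring` and `int` respectively, and `front.conf` is assumed to point at valid dictionaries downloaded by `download.sh`.

```cpp
// Minimal driver sketch for ppspeech::FrontEngineInterface (illustrative).
#include <string>
#include <vector>
#include "front/front_interface.h"  // from src/, as set up in CMakeLists.txt

int main() {
    ppspeech::FrontEngineInterface front("./front_demo/front.conf");
    if (front.init() != 0) return -1;  // init() returns 0 on success

    // UTF-8 text -> wide string, then traditional -> simplified
    std::wstring ws = ppspeech::utf8string2wstring("你好,欢迎使用语音合成服务");
    std::wstring simplified;
    front.Trand2Simp(ws, &simplified);

    // Split on punctuation, normalize each part, collect phone/tone ids
    std::vector<std::wstring> parts;     // assumed element type
    front.SplitByPunc(simplified, &parts);

    std::vector<int> phoneids, toneids;  // assumed element type
    for (auto &part : parts) {
        front.SentenceNormalize(&part);
        const std::string utf8 = ppspeech::wstring2utf8string(part);
        if (front.GetSentenceIds(utf8, &phoneids, &toneids) != 0) return -1;
    }
    // phoneids then feed the acoustic model (see TTSArmLinux/src/main.cc)
    return 0;
}
```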
diff --git a/demos/TTSCppFrontend/front_demo/front.conf b/demos/TTSCppFrontend/front_demo/front.conf new file mode 100644 index 000000000..e9ce1c94d --- /dev/null +++ b/demos/TTSCppFrontend/front_demo/front.conf @@ -0,0 +1,21 @@ +# jieba conf +--jieba_dict_path=./front_demo/dict/jieba/jieba.dict.utf8 +--jieba_hmm_path=./front_demo/dict/jieba/hmm_model.utf8 +--jieba_user_dict_path=./front_demo/dict/jieba/user.dict.utf8 +--jieba_idf_path=./front_demo/dict/jieba/idf.utf8 +--jieba_stop_word_path=./front_demo/dict/jieba/stop_words.utf8 + +# dict conf fastspeech2_0.4 +--seperate_tone=false +--word2phone_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict +--phone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +--tone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict + +# dict conf speedyspeech_0.5 +#--seperate_tone=true +#--word2phone_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict +#--phone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt +#--tone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt + +# dict of tranditional_to_simplified +--trand2simpd_path=./front_demo/dict/tranditional_to_simplified/trand2simp.txt diff --git a/demos/TTSCppFrontend/front_demo/front_demo.cpp b/demos/TTSCppFrontend/front_demo/front_demo.cpp new file mode 100644 index 000000000..19f16758b --- /dev/null +++ b/demos/TTSCppFrontend/front_demo/front_demo.cpp @@ -0,0 +1,79 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include "front/front_interface.h" + +DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized"); +DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file"); +// DEFINE_string(seperate_tone, "true", "If true, get phoneids and tonesid"); + + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + // 实例化文本前端引擎 + ppspeech::FrontEngineInterface* front_inst = nullptr; + front_inst = new ppspeech::FrontEngineInterface(FLAGS_front_conf); + if ((!front_inst) || (front_inst->init())) { + LOG(ERROR) << "Creater tts engine failed!"; + if (front_inst != nullptr) { + delete front_inst; + } + front_inst = nullptr; + return -1; + } + + std::wstring ws_sentence = ppspeech::utf8string2wstring(FLAGS_sentence); + + // 繁体转简体 + std::wstring sentence_simp; + front_inst->Trand2Simp(ws_sentence, &sentence_simp); + ws_sentence = sentence_simp; + + std::string s_sentence; + std::vector sentence_part; + std::vector phoneids = {}; + std::vector toneids = {}; + + // 根据标点进行分句 + LOG(INFO) << "Start to segment sentences by punctuation"; + front_inst->SplitByPunc(ws_sentence, &sentence_part); + LOG(INFO) << "Segment sentences through punctuation successfully"; + + // 分句后获取音素id + LOG(INFO) + << "Start to get the phoneme and tone id sequence of each sentence"; + for (int i = 0; i < sentence_part.size(); i++) { + LOG(INFO) << "Raw sentence is: " + << ppspeech::wstring2utf8string(sentence_part[i]); + front_inst->SentenceNormalize(&sentence_part[i]); + s_sentence = ppspeech::wstring2utf8string(sentence_part[i]); + LOG(INFO) << "After normalization sentence is: " << s_sentence; + + if (0 != front_inst->GetSentenceIds(s_sentence, &phoneids, &toneids)) { + LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed"; + return -1; + } + } + LOG(INFO) << "The phoneids of the sentence is: " + << limonp::Join(phoneids.begin(), phoneids.end(), " "); + LOG(INFO) << "The toneids of the sentence is: " + << limonp::Join(toneids.begin(), toneids.end(), " "); + LOG(INFO) << "Get the phoneme id sequence of each sentence successfully"; + + return EXIT_SUCCESS; +} diff --git a/demos/TTSCppFrontend/front_demo/gentools/gen_dict_paddlespeech.py b/demos/TTSCppFrontend/front_demo/gentools/gen_dict_paddlespeech.py new file mode 100644 index 000000000..5aaa6e345 --- /dev/null +++ b/demos/TTSCppFrontend/front_demo/gentools/gen_dict_paddlespeech.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import configparser + +from paddlespeech.t2s.frontend.zh_frontend import Frontend + + +def get_phone(frontend, + word, + merge_sentences=True, + print_info=False, + robot=False, + get_tone_ids=False): + phonemes = frontend.get_phonemes(word, merge_sentences, print_info, robot) + # Some optimizations + phones, tones = frontend._get_phone_tone(phonemes[0], get_tone_ids) + #print(type(phones), phones) + #print(type(tones), tones) + return phones, tones + + +def gen_word2phone_dict(frontend, + jieba_words_dict, + word2phone_dict, + get_tone=False): + with open(jieba_words_dict, "r") as f1, open(word2phone_dict, "w+") as f2: + for line in f1.readlines(): + word = line.split(" ")[0] + phone, tone = get_phone(frontend, word, get_tone_ids=get_tone) + phone_str = "" + + if tone: + assert (len(phone) == len(tone)) + for i in range(len(tone)): + phone_tone = phone[i] + tone[i] + phone_str += (" " + phone_tone) + phone_str = phone_str.strip("sp0").strip(" ") + else: + for x in phone: + phone_str += (" " + x) + phone_str = phone_str.strip("sp").strip(" ") + print(phone_str) + f2.write(word + " " + phone_str + "\n") + print("Generate word2phone dict successfully.") + + +def main(): + parser = argparse.ArgumentParser(description="Generate dictionary") + parser.add_argument( + "--config", type=str, default="./config.ini", help="config file.") + parser.add_argument( + "--am_type", + type=str, + default="fastspeech2", + help="fastspeech2 or speedyspeech") + args = parser.parse_args() + + # Read config + cf = configparser.ConfigParser() + cf.read(args.config) + jieba_words_dict_file = cf.get("jieba", + "jieba_words_dict") # get words dict + + am_type = args.am_type + if (am_type == "fastspeech2"): + phone2id_dict_file = cf.get(am_type, "phone2id_dict") + word2phone_dict_file = cf.get(am_type, "word2phone_dict") + + frontend = Frontend(phone_vocab_path=phone2id_dict_file) + print("frontend done!") + + gen_word2phone_dict( + frontend, + jieba_words_dict_file, + word2phone_dict_file, + get_tone=False) + + elif (am_type == "speedyspeech"): + phone2id_dict_file = cf.get(am_type, "phone2id_dict") + tone2id_dict_file = cf.get(am_type, "tone2id_dict") + word2phone_dict_file = cf.get(am_type, "word2phone_dict") + + frontend = Frontend( + phone_vocab_path=phone2id_dict_file, + tone_vocab_path=tone2id_dict_file) + print("frontend done!") + + gen_word2phone_dict( + frontend, + jieba_words_dict_file, + word2phone_dict_file, + get_tone=True) + + else: + print("Please set correct am type, fastspeech2 or speedyspeech.") + + +if __name__ == "__main__": + main() diff --git a/demos/TTSCppFrontend/front_demo/gentools/genid.py b/demos/TTSCppFrontend/front_demo/gentools/genid.py new file mode 100644 index 000000000..cf83623f0 --- /dev/null +++ b/demos/TTSCppFrontend/front_demo/gentools/genid.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +PHONESFILE = "./dict/phones.txt" +PHONES_ID_FILE = "./dict/phonesid.dict" +TONESFILE = "./dict/tones.txt" +TONES_ID_FILE = "./dict/tonesid.dict" + + +def GenIdFile(file, idfile): + id = 2 + with open(file, 'r') as f1, open(idfile, "w+") as f2: + f2.write(" 0\n") + f2.write(" 1\n") + for line in f1.readlines(): + phone = line.strip() + print(phone + " " + str(id) + "\n") + f2.write(phone + " " + str(id) + "\n") + id += 1 + + +if __name__ == "__main__": + GenIdFile(PHONESFILE, PHONES_ID_FILE) + GenIdFile(TONESFILE, TONES_ID_FILE) diff --git a/demos/TTSCppFrontend/front_demo/gentools/word2phones.py b/demos/TTSCppFrontend/front_demo/gentools/word2phones.py new file mode 100644 index 000000000..8726ee89c --- /dev/null +++ b/demos/TTSCppFrontend/front_demo/gentools/word2phones.py @@ -0,0 +1,55 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re + +from pypinyin import lazy_pinyin +from pypinyin import Style + +worddict = "./dict/jieba_part.dict.utf8" +newdict = "./dict/word_phones.dict" + + +def GenPhones(initials, finals, seperate=True): + + phones = [] + for c, v in zip(initials, finals): + if re.match(r'i\d', v): + if c in ['z', 'c', 's']: + v = re.sub('i', 'ii', v) + elif c in ['zh', 'ch', 'sh', 'r']: + v = re.sub('i', 'iii', v) + if c: + if seperate is True: + phones.append(c + '0') + elif seperate is False: + phones.append(c) + else: + print("Not sure whether phone and tone need to be separated") + if v: + phones.append(v) + return phones + + +with open(worddict, "r") as f1, open(newdict, "w+") as f2: + for line in f1.readlines(): + word = line.split(" ")[0] + initials = lazy_pinyin( + word, neutral_tone_with_five=True, style=Style.INITIALS) + finals = lazy_pinyin( + word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + + phones = GenPhones(initials, finals, True) + + temp = " ".join(phones) + f2.write(word + " " + temp + "\n") diff --git a/demos/TTSCppFrontend/run_front_demo.sh b/demos/TTSCppFrontend/run_front_demo.sh new file mode 100755 index 000000000..4dcded5c1 --- /dev/null +++ b/demos/TTSCppFrontend/run_front_demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -e +set -x + +cd "$(dirname "$(realpath "$0")")" + +./build/tts_front_demo "$@" diff --git a/demos/TTSCppFrontend/src/base/type_conv.cpp b/demos/TTSCppFrontend/src/base/type_conv.cpp new file mode 100644 index 000000000..b7ff63642 --- /dev/null +++ b/demos/TTSCppFrontend/src/base/type_conv.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#include "base/type_conv.h" + +namespace ppspeech { +// wstring to string +std::string wstring2utf8string(const std::wstring& str) { + static std::wstring_convert> strCnv; + return strCnv.to_bytes(str); +} + +// string to wstring +std::wstring utf8string2wstring(const std::string& str) { + static std::wstring_convert> strCnv; + return strCnv.from_bytes(str); +} +} // namespace ppspeech diff --git a/demos/TTSCppFrontend/src/base/type_conv.h b/demos/TTSCppFrontend/src/base/type_conv.h new file mode 100644 index 000000000..6aecfc438 --- /dev/null +++ b/demos/TTSCppFrontend/src/base/type_conv.h @@ -0,0 +1,31 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef BASE_TYPE_CONVC_H +#define BASE_TYPE_CONVC_H + +#include +#include +#include + + +namespace ppspeech { +// wstring to string +std::string wstring2utf8string(const std::wstring& str); + +// string to wstring +std::wstring utf8string2wstring(const std::string& str); +} + +#endif // BASE_TYPE_CONVC_H \ No newline at end of file diff --git a/demos/TTSCppFrontend/src/front/front_interface.cpp b/demos/TTSCppFrontend/src/front/front_interface.cpp new file mode 100644 index 000000000..8bd466d28 --- /dev/null +++ b/demos/TTSCppFrontend/src/front/front_interface.cpp @@ -0,0 +1,1130 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
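The `type_conv` helpers added above are what the frontend uses to hop between UTF-8 `std::string` (logging, dictionary keys) and `std::wstring` (per-character processing such as `Trand2Simp` and `SplitByPunc`). A minimal round-trip sketch, assuming `src/` is on the include path as configured in `CMakeLists.txt`:

```cpp
// Round-trip through the two helpers declared in base/type_conv.h above.
#include <iostream>
#include <string>
#include "base/type_conv.h"

int main() {
    const std::string utf8 = "你好,欢迎使用语音合成服务";

    // UTF-8 -> wide string: done once per input sentence before
    // traditional->simplified conversion and punctuation splitting.
    const std::wstring wide = ppspeech::utf8string2wstring(utf8);

    // Wide string -> UTF-8: done before logging and GetSentenceIds().
    const std::string back = ppspeech::wstring2utf8string(wide);

    std::cout << (back == utf8 ? "round-trip OK" : "mismatch") << std::endl;
    return 0;
}
```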
+#include "front/front_interface.h" + +namespace ppspeech { + +int FrontEngineInterface::init() { + if (_initialed) { + return 0; + } + if (0 != ReadConfFile()) { + LOG(ERROR) << "Read front conf file failed"; + return -1; + } + + _jieba = new cppjieba::Jieba(_jieba_dict_path, + _jieba_hmm_path, + _jieba_user_dict_path, + _jieba_idf_path, + _jieba_stop_word_path); + + _punc = {",", + "。", + "、", + "?", + ":", + ";", + "~", + "!", + ",", + ".", + "?", + "!", + ":", + ";", + "/", + "\\"}; + _punc_omit = {"“", "”", "\"", "\""}; + + // 需要儿化音处理的词语 + must_erhua = { + "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"}; + not_erhua = {"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", + "有儿", "一儿", "我儿", "俺儿", "妻儿", "拐儿", + "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", + "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿", + "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", + "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", + "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", + "狗儿"}; + + must_not_neural_tone_words = { + "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子"}; + // 需要轻声处理的词语 + must_neural_tone_words = { + "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", + "馄饨", "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", + "铃铛", "铁匠", "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", + "迷糊", "连累", "这么", "这个", "运气", "过去", "软和", "转悠", "踏实", + "跳蚤", "跟头", "趔趄", "财主", "豆腐", "讲究", "记性", "记号", "认识", + "规矩", "见识", "裁缝", "补丁", "衣裳", "衣服", "衙门", "街坊", "行李", + "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", "萝卜", "荸荠", "苗条", + "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在", "膏药", "脾气", + "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同", "聪明", + "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太", + "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", + "精神", "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", + "笑话", "窟窿", "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", + "秀才", "福气", "祖宗", "砚台", "码头", "石榴", "石头", "石匠", "知识", + "眼睛", "眯缝", "眨巴", "眉毛", "相声", "盘算", "白净", "痢疾", "痛快", + "疟疾", "疙瘩", "疏忽", "畜生", "生意", "甘蔗", "琵琶", "琢磨", "琉璃", + "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", "牲口", "牙碜", "牌楼", + "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", "炊帚", "灯笼", + "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头", "活泼", + "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃", + "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", + "朋友", "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", + "收成", "提防", "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", + "招牌", "招呼", "抬举", "护士", "折腾", "扫帚", "打量", "打算", "打点", + "打扮", "打听", "打发", "扎实", "扁担", "戒指", "懒得", "意识", "意思", + "情形", "悟性", "怪物", "思量", "怎么", "念头", "念叨", "快活", "忙活", + "志气", "心思", "得罪", "张罗", "弟兄", "开通", "应酬", "庄稼", "干事", + "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", "差事", "工夫", + "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", "对付", + "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆", + "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", + "妖精", "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", + "多少", "多么", "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", + "嘱咐", "嘟囔", "嘀咕", "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", + "哈欠", "哆嗦", "咳嗽", "和尚", "告诉", "告示", "含糊", "吓唬", "后头", + "名字", "名堂", "合同", "吆喝", "叫唤", "口袋", "厚道", "厉害", "千斤", + "包袱", "包涵", "匀称", "勤快", "动静", "动弹", "功夫", "力气", "前头", + "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", "出息", "凑合", + "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", "便宜", + "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么", + "人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头", + "丧气", "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", + "上头", "上司", "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", + "咕噜", "邋遢", "费用", "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", + "幸福", "熟悉", "计划", "扑腾", "蜡烛", "姥爷", 
"照顾", "喉咙", "吉他", + "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜", "糟蹋", "倒腾", "报复", "逻辑", + "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记"}; + + + // 生成词典(词到音素的映射) + if (0 != GenDict(_word2phone_path, &word_phone_map)) { + LOG(ERROR) << "Genarate word2phone dict failed"; + return -1; + } + + // 生成音素字典(音素到音素id的映射) + if (0 != GenDict(_phone2id_path, &phone_id_map)) { + LOG(ERROR) << "Genarate phone2id dict failed"; + return -1; + } + + // 生成音调字典(音调到音调id的映射) + if (_seperate_tone == "true") { + if (0 != GenDict(_tone2id_path, &tone_id_map)) { + LOG(ERROR) << "Genarate tone2id dict failed"; + return -1; + } + } + + // 生成繁简字典(繁体到简体id的映射) + if (0 != GenDict(_trand2simp_path, &trand_simp_map)) { + LOG(ERROR) << "Genarate trand2simp dict failed"; + return -1; + } + + _initialed = true; + return 0; +} + +int FrontEngineInterface::ReadConfFile() { + std::ifstream is(_conf_file.c_str(), std::ifstream::in); + if (!is.good()) { + LOG(ERROR) << "Cannot open config file: " << _conf_file; + return -1; + } + std::string line, key, value; + while (std::getline(is, line)) { + if (line.substr(0, 2) == "--") { + size_t pos = line.find_first_of("=", 0); + std::string key = line.substr(2, pos - 2); + std::string value = line.substr(pos + 1); + conf_map[key] = value; + LOG(INFO) << "Key: " << key << "; Value: " << value; + } + } + + // jieba conf path + _jieba_dict_path = conf_map["jieba_dict_path"]; + _jieba_hmm_path = conf_map["jieba_hmm_path"]; + _jieba_user_dict_path = conf_map["jieba_user_dict_path"]; + _jieba_idf_path = conf_map["jieba_idf_path"]; + _jieba_stop_word_path = conf_map["jieba_stop_word_path"]; + + // dict path + _seperate_tone = conf_map["seperate_tone"]; + _word2phone_path = conf_map["word2phone_path"]; + _phone2id_path = conf_map["phone2id_path"]; + _tone2id_path = conf_map["tone2id_path"]; + _trand2simp_path = conf_map["trand2simpd_path"]; + + return 0; +} + +int FrontEngineInterface::Trand2Simp(const std::wstring &sentence, + std::wstring *sentence_simp) { + // sentence_simp = sentence; + for (int i = 0; i < sentence.length(); i++) { + std::wstring temp(1, sentence[i]); + std::string sigle_word = ppspeech::wstring2utf8string(temp); + // 单个字是否在繁转简的字典里 + if (trand_simp_map.find(sigle_word) == trand_simp_map.end()) { + sentence_simp->append(temp); + } else { + sentence_simp->append( + (ppspeech::utf8string2wstring(trand_simp_map[sigle_word]))); + } + } + + return 0; +} + +int FrontEngineInterface::GenDict(const std::string &dict_file, + std::map *map) { + std::ifstream is(dict_file.c_str(), std::ifstream::in); + if (!is.good()) { + LOG(ERROR) << "Cannot open dict file: " << dict_file; + return -1; + } + std::string line, key, value; + while (std::getline(is, line)) { + size_t pos = line.find_first_of(" ", 0); + key = line.substr(0, pos); + value = line.substr(pos + 1); + (*map)[key] = value; + } + return 0; +} + +int FrontEngineInterface::GetSegResult( + std::vector> *seg, + std::vector *seg_words) { + std::vector>::iterator iter; + for (iter = seg->begin(); iter != seg->end(); iter++) { + seg_words->push_back((*iter).first); + } + return 0; +} + +int FrontEngineInterface::GetSentenceIds(const std::string &sentence, + std::vector *phoneids, + std::vector *toneids) { + std::vector> + cut_result; //分词结果包含词和词性 + if (0 != Cut(sentence, &cut_result)) { + LOG(ERROR) << "Cut sentence: \"" << sentence << "\" failed"; + return -1; + } + + if (0 != GetWordsIds(cut_result, phoneids, toneids)) { + LOG(ERROR) << "Get words phoneids failed"; + return -1; + } + return 0; +} + +int FrontEngineInterface::GetWordsIds( + const std::vector> 
&cut_result, + std::vector *phoneids, + std::vector *toneids) { + std::string word; + std::string pos; + std::vector word_initials; + std::vector word_finals; + std::string phone; + for (int i = 0; i < cut_result.size(); i++) { + word = cut_result[i].first; + pos = cut_result[i].second; + if (std::find(_punc_omit.begin(), _punc_omit.end(), word) == + _punc_omit.end()) { // 非可忽略的标点 + word_initials = {}; + word_finals = {}; + phone = ""; + // 判断是否在标点符号集合中 + if (std::find(_punc.begin(), _punc.end(), word) == + _punc.end()) { // 文字 + // 获取字词的声母韵母列表 + if (0 != + GetInitialsFinals(word, &word_initials, &word_finals)) { + LOG(ERROR) + << "Genarate the word_initials and word_finals of " + << word << " failed"; + return -1; + } + + // 对读音进行修改 + if (0 != ModifyTone(word, pos, &word_finals)) { + LOG(ERROR) << "Failed to modify tone."; + } + + // 对儿化音进行修改 + std::vector> new_initals_finals = + MergeErhua(word_initials, word_finals, word, pos); + word_initials = new_initals_finals[0]; + word_finals = new_initals_finals[1]; + + // 将声母和韵母合并成音素 + assert(word_initials.size() == word_finals.size()); + std::string temp_phone; + for (int j = 0; j < word_initials.size(); j++) { + if (word_initials[j] != "") { + temp_phone = word_initials[j] + " " + word_finals[j]; + } else { + temp_phone = word_finals[j]; + } + if (j == 0) { + phone += temp_phone; + } else { + phone += (" " + temp_phone); + } + } + } else { // 标点符号 + if (_seperate_tone == "true") { + phone = "sp0"; // speedyspeech + } else { + phone = "sp"; // fastspeech2 + } + } + + // 音素到音素id + if (0 != Phone2Phoneid(phone, phoneids, toneids)) { + LOG(ERROR) << "Genarate the phone id of " << word << " failed"; + return -1; + } + } + } + return 0; +} + +int FrontEngineInterface::Cut( + const std::string &sentence, + std::vector> *cut_result) { + std::vector> cut_result_jieba; + + // 结巴分词 + _jieba->Tag(sentence, cut_result_jieba); + + // 对分词后结果进行整合 + if (0 != MergeforModify(&cut_result_jieba, cut_result)) { + LOG(ERROR) << "Failed to modify for word segmentation result."; + return -1; + } + + return 0; +} + +int FrontEngineInterface::GetPhone(const std::string &word, + std::string *phone) { + // 判断 word 在不在 词典里,如果不在,进行CutAll分词 + if (word_phone_map.find(word) == word_phone_map.end()) { + std::vector wordcut; + _jieba->CutAll(word, wordcut); + phone->assign(word_phone_map[wordcut[0]]); + for (int i = 1; i < wordcut.size(); i++) { + phone->assign((*phone) + (" " + word_phone_map[wordcut[i]])); + } + } else { + phone->assign(word_phone_map[word]); + } + + return 0; +} + +int FrontEngineInterface::Phone2Phoneid(const std::string &phone, + std::vector *phoneid, + std::vector *toneid) { + std::vector phone_vec; + phone_vec = absl::StrSplit(phone, " "); + std::string temp_phone; + for (int i = 0; i < phone_vec.size(); i++) { + temp_phone = phone_vec[i]; + if (_seperate_tone == "true") { + phoneid->push_back(atoi( + (phone_id_map[temp_phone.substr(0, temp_phone.length() - 1)]) + .c_str())); + toneid->push_back( + atoi((tone_id_map[temp_phone.substr(temp_phone.length() - 1, + temp_phone.length())]) + .c_str())); + } else { + phoneid->push_back(atoi((phone_id_map[temp_phone]).c_str())); + } + } + return 0; +} + + +// 根据韵母判断该词中每个字的读音都为第三声。true表示词中每个字都是第三声 +bool FrontEngineInterface::AllToneThree( + const std::vector &finals) { + bool flags = true; + for (int i = 0; i < finals.size(); i++) { + if (static_cast(finals[i].back()) != 51) { //如果读音不为第三声 + flags = false; + } + } + return flags; +} + +// 判断词是否是叠词 +bool FrontEngineInterface::IsReduplication(const std::string &word) 
{ + bool flags = false; + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + int len = word_wstr.length(); + if (len == 2 && word_wstr[0] == word_wstr[1]) { + flags = true; + } + return flags; +} + +// 获取每个字词的声母和韵母列表, word_initials 为声母列表,word_finals +// 为韵母列表 +int FrontEngineInterface::GetInitialsFinals( + const std::string &word, + std::vector *word_initials, + std::vector *word_finals) { + std::string phone; + GetPhone(word, &phone); //获取字词对应的音素 + std::vector phone_vec = absl::StrSplit(phone, " "); + //获取韵母,每个字的音素有1或者2个,start为单个字音素的起始位置。 + int start = 0; + while (start < phone_vec.size()) { + if (phone_vec[start] == "sp" || phone_vec[start] == "sp0") { + start += 1; + } else if (isdigit(phone_vec[start].back()) == 0 || + static_cast(phone_vec[start].back()) == 48) { + word_initials->push_back(phone_vec[start]); + word_finals->push_back(phone_vec[start + 1]); + start += 2; + } else { + word_initials->push_back(""); + word_finals->push_back(phone_vec[start]); + start += 1; + } + } + + assert(word_finals->size() == ppspeech::utf8string2wstring(word).length() && + word_finals->size() == word_initials->size()); + + return 0; +} + +// 获取每个字词的韵母列表 +int FrontEngineInterface::GetFinals(const std::string &word, + std::vector *word_finals) { + std::vector word_initials; + if (0 != GetInitialsFinals(word, &word_initials, word_finals)) { + LOG(ERROR) << "Failed to get word finals"; + return -1; + } + + return 0; +} + +int FrontEngineInterface::Word2WordVec(const std::string &word, + std::vector *wordvec) { + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + for (int i = 0; i < word_wstr.length(); i++) { + std::wstring word_sigle(1, word_wstr[i]); + wordvec->push_back(word_sigle); + } + return 0; +} + +// yuantian01解释:把一个词再进行分词找到。例子:小雨伞 --> 小 雨伞 或者 小雨 伞 +int FrontEngineInterface::SplitWord(const std::string &word, + std::vector *new_word_vec) { + std::vector word_vec; + std::string second_subword; + _jieba->CutForSearch(word, word_vec); + // 升序 + std::sort(word_vec.begin(), + word_vec.end(), + [](std::string a, std::string b) { return a.size() > b.size(); }); + std::string first_subword = word_vec[0]; // 提取长度最短的字符串 + int first_begin_idx = word.find_first_of(first_subword); + if (first_begin_idx == 0) { + second_subword = word.substr(first_subword.length()); + new_word_vec->push_back(first_subword); + new_word_vec->push_back(second_subword); + } else { + second_subword = word.substr(0, word.length() - first_subword.length()); + new_word_vec->push_back(second_subword); + new_word_vec->push_back(first_subword); + } + + return 0; +} + + +// example: 不 一起 --> 不一起 +std::vector> FrontEngineInterface::MergeBu( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + std::string last_word = ""; + + for (int i = 0; i < seg_result->size(); i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if (last_word == "不") { + word = last_word + word; + } + if (word != "不") { + result.push_back(make_pair(word, pos)); + } + last_word = word; + } + + if (last_word == "不") { + result.push_back(make_pair(last_word, "d")); + last_word = ""; + } + + return result; +} + +std::vector> FrontEngineInterface::Mergeyi( + std::vector> *seg_result) { + std::vector> *result_temp = + new std::vector>(); + std::string word; + std::string pos; + // function 1 example: 听 一 听 --> 听一听 + for (int i = 0; i < seg_result->size(); i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + + if ((i - 1 >= 0) && (word == "一") 
&& (i + 1 < seg_result->size()) && + (std::get<0>((*seg_result)[i - 1]) == + std::get<0>((*seg_result)[i + 1])) && + std::get<1>((*seg_result)[i - 1]) == "v") { + std::get<0>((*result_temp)[i - 1]) = + std::get<0>((*result_temp)[i - 1]) + "一" + + std::get<0>((*result_temp)[i - 1]); + } else { + if ((i - 2 >= 0) && (std::get<0>((*seg_result)[i - 1]) == "一") && + (std::get<0>((*seg_result)[i - 2]) == word) && (pos == "v")) { + continue; + } else { + result_temp->push_back(make_pair(word, pos)); + } + } + } + + // function 2 example: 一 你 --> 一你 + std::vector> result = {}; + for (int j = 0; j < result_temp->size(); j++) { + word = std::get<0>((*result_temp)[j]); + pos = std::get<1>((*result_temp)[j]); + if ((result.size() != 0) && (result.back().first == "一")) { + result.back().first = result.back().first + word; + } else { + result.push_back(make_pair(word, pos)); + } + } + + return result; +} + +// example: 你 你 --> 你你 +std::vector> +FrontEngineInterface::MergeReduplication( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + + for (int i = 0; i < seg_result->size(); i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if ((result.size() != 0) && (word == result.back().first)) { + result.back().first = + result.back().first + std::get<0>((*seg_result)[i]); + } else { + result.push_back(make_pair(word, pos)); + } + } + + return result; +} + +// the first and the second words are all_tone_three +std::vector> +FrontEngineInterface::MergeThreeTones( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + std::vector> finals; //韵母数组 + std::vector word_final; + std::vector merge_last(seg_result->size(), false); + + // 判断最后一个分词结果是不是标点,不看标点的声母韵母 + int word_num = seg_result->size() - 1; + + // seg_result[word_num].first + if (std::find( + _punc.begin(), _punc.end(), std::get<0>((*seg_result)[word_num])) == + _punc.end()) { // 最后一个分词结果不是标点 + word_num += 1; + } + + // 获取韵母数组 + for (int i = 0; i < word_num; i++) { + word_final = {}; + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if (std::find(_punc_omit.begin(), _punc_omit.end(), word) == + _punc_omit.end()) { // 非可忽略的标点,即文字 + if (0 != GetFinals(word, &word_final)) { + LOG(ERROR) << "Failed to get the final of word."; + } + } + + finals.push_back(word_final); + } + assert(word_num == finals.size()); + + // 对第三声读音的字词分词结果进行处理 + for (int i = 0; i < word_num; i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if (i - 1 >= 0 && AllToneThree(finals[i - 1]) && + AllToneThree(finals[i]) && !merge_last[i - 1]) { + // if the last word is reduplication, not merge, because + // reduplication need to be _neural_sandhi + // seg_result[i - 1].first + if (!IsReduplication(std::get<0>((*seg_result)[i - 1])) && + (ppspeech::utf8string2wstring( + std::get<0>((*seg_result)[i - 1]))) + .length() + + (ppspeech::utf8string2wstring(word)).length() <= + 3) { + result.back().first = + result.back().first + std::get<0>((*seg_result)[i]); + merge_last[i] = true; + } else { + result.push_back(make_pair(word, pos)); + } + } else { + result.push_back(make_pair(word, pos)); + } + } + + //把标点的分词结果补上 + if (word_num < seg_result->size()) { + result.push_back( + // seg_result[word_num].first seg_result[word_num].second + // std::get<0>((*seg_result)[word_num]) + make_pair(std::get<0>((*seg_result)[word_num]), + std::get<1>((*seg_result)[word_num]))); + } + + return result; +} + +// the last char of first 
word and the first char of second word is tone_three +std::vector> +FrontEngineInterface::MergeThreeTones2( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + std::vector> finals; //韵母数组 + std::vector word_final; + std::vector merge_last(seg_result->size(), false); + + // 判断最后一个分词结果是不是标点 + int word_num = seg_result->size() - 1; + if (std::find( + _punc.begin(), _punc.end(), std::get<0>((*seg_result)[word_num])) == + _punc.end()) { // 最后一个分词结果不是标点 + word_num += 1; + } + + // 获取韵母数组 + for (int i = 0; i < word_num; i++) { + word_final = {}; + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + // 如果是文字,则获取韵母,如果是可忽略的标点,例如引号,则跳过 + if (std::find(_punc_omit.begin(), _punc_omit.end(), word) == + _punc_omit.end()) { + if (0 != GetFinals(word, &word_final)) { + LOG(ERROR) << "Failed to get the final of word."; + } + } + + finals.push_back(word_final); + } + assert(word_num == finals.size()); + + // 对第三声读音的字词分词结果进行处理 + for (int i = 0; i < word_num; i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if (i - 1 >= 0 && !finals[i - 1].empty() && + absl::EndsWith(finals[i - 1].back(), "3") == true && + !finals[i].empty() && + absl::EndsWith(finals[i].front(), "3") == true && + !merge_last[i - 1]) { + // if the last word is reduplication, not merge, because + // reduplication need to be _neural_sandhi + // seg_result[i - 1].first + if (!IsReduplication(std::get<0>((*seg_result)[i - 1])) && + (ppspeech::utf8string2wstring( + std::get<0>((*seg_result)[i - 1]))) + .length() + + ppspeech::utf8string2wstring(word).length() <= + 3) { + result.back().first = + result.back().first + std::get<0>((*seg_result)[i]); + merge_last[i] = true; + } else { + result.push_back(make_pair(word, pos)); + } + } else { + result.push_back(make_pair(word, pos)); + } + } + + //把标点的分词结果补上 + if (word_num < seg_result->size()) { + result.push_back(make_pair(std::get<0>((*seg_result)[word_num]), + std::get<1>((*seg_result)[word_num]))); + } + + return result; +} + +// example: 吃饭 儿 --> 吃饭儿 +std::vector> FrontEngineInterface::MergeEr( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + + for (int i = 0; i < seg_result->size(); i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if ((i - 1 >= 0) && (word == "儿")) { + result.back().first = + result.back().first + std::get<0>((*seg_result)[i]); + } else { + result.push_back(make_pair(word, pos)); + } + } + + return result; +} + +int FrontEngineInterface::MergeforModify( + std::vector> *seg_word_type, + std::vector> *modify_seg_word_type) { + std::vector seg_result; + GetSegResult(seg_word_type, &seg_result); + LOG(INFO) << "Before merge, seg result is: " + << limonp::Join(seg_result.begin(), seg_result.end(), "/"); + std::vector> tmp; + tmp = MergeBu(seg_word_type); + *modify_seg_word_type = tmp; + tmp = Mergeyi(modify_seg_word_type); + *modify_seg_word_type = tmp; + tmp = MergeReduplication(modify_seg_word_type); + *modify_seg_word_type = tmp; + tmp = MergeThreeTones(modify_seg_word_type); + *modify_seg_word_type = tmp; + tmp = MergeThreeTones2(modify_seg_word_type); + *modify_seg_word_type = tmp; + tmp = MergeEr(modify_seg_word_type); + *modify_seg_word_type = tmp; + seg_result = {}; + + GetSegResult(modify_seg_word_type, &seg_result); + LOG(INFO) << "After merge, seg result is: " + << limonp::Join(seg_result.begin(), seg_result.end(), "/"); + + return 0; +} + + +int FrontEngineInterface::BuSandi(const 
std::string &word, + std::vector *finals) { + std::wstring bu = L"不"; + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + return -1; + } + + // e.g. 看不懂 b u4 --> b u5, 将韵母的最后一位替换成 5 + if (wordvec.size() == 3 && wordvec[1] == bu) { + (*finals)[1] = (*finals)[1].replace((*finals)[1].length() - 1, 1, "5"); + } else { + // e.g. 不怕 b u4 --> b u2, 将韵母的最后一位替换成 2 + for (int i = 0; i < wordvec.size(); i++) { + if (wordvec[i] == bu && i + 1 < wordvec.size() && + absl::EndsWith((*finals)[i + 1], "4") == true) { + (*finals)[i] = + (*finals)[i].replace((*finals)[i].length() - 1, 1, "2"); + } + } + } + + return 0; +} + + +int FrontEngineInterface::YiSandhi(const std::string &word, + std::vector *finals) { + std::wstring yi = L"一"; + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + return -1; + } + + //情况1:"一" in number sequences, e.g. 一零零, 二一零 + std::wstring num_wstr = L"零一二三四六七八九"; + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + if (word_wstr.find(yi) != word_wstr.npos && wordvec.back() != yi) { + int flags = 0; + for (int j = 0; j < wordvec.size(); j++) { + if (num_wstr.find(wordvec[j]) == num_wstr.npos) { + flags = -1; + break; + } + } + if (flags == 0) { + return 0; + } + } else if (wordvec.size() == 3 && wordvec[1] == yi && + wordvec[0] == wordvec[2]) { + // "一" between reduplication words shold be yi5, e.g. 看一看 + (*finals)[1] = (*finals)[1].replace((*finals)[1].length() - 1, 1, "5"); + } else if (wordvec[0] == L"第" && wordvec[1] == yi) { //以第一位开始 + (*finals)[1] = (*finals)[1].replace((*finals)[1].length() - 1, 1, "1"); + } else { + for (int i = 0; i < wordvec.size(); i++) { + if (wordvec[i] == yi && i + 1 < wordvec.size()) { + if (absl::EndsWith((*finals)[i + 1], "4") == true) { + // "一" before tone4 should be yi2, e.g. 一段 + (*finals)[i] = + (*finals)[i].replace((*finals)[i].length() - 1, 1, "2"); + } else { + // "一" before non-tone4 should be yi4, e.g. 一天 + (*finals)[i] = + (*finals)[i].replace((*finals)[i].length() - 1, 1, "4"); + } + } + } + } + + return 0; +} + +int FrontEngineInterface::NeuralSandhi(const std::string &word, + const std::string &pos, + std::vector *finals) { + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + return -1; + } + int word_num = wordvec.size(); + assert(word_num == word_wstr.length()); + + // 情况1:reduplication words for n. and v. e.g. 
奶奶, 试试, 旺旺 + for (int j = 0; j < wordvec.size(); j++) { + std::string inits = "nva"; + if (j - 1 >= 0 && wordvec[j] == wordvec[j - 1] && + inits.find(pos[0]) != inits.npos) { + (*finals)[j] = + (*finals)[j].replace((*finals)[j].length() - 1, 1, "5"); + } + } + + // 情况2:对下述词的处理 + std::wstring yuqici = L"吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶"; + std::wstring de = L"的地得"; + std::wstring le = L"了着过"; + std::vector le_pos = {"ul", "uz", "ug"}; + std::wstring men = L"们子"; + std::vector men_pos = {"r", "n"}; + std::wstring weizhi = L"上下里"; + std::vector weizhi_pos = {"s", "l", "f"}; + std::wstring dong = L"来去"; + std::wstring fangxiang = L"上下进出回过起开"; + std::wstring ge = L"个"; + std::wstring xiushi = L"几有两半多各整每做是零一二三四六七八九"; + auto ge_idx = word_wstr.find_first_of(ge); // 出现“个”的第一个位置 + + if (word_num >= 1 && yuqici.find(wordvec.back()) != yuqici.npos) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num >= 1 && de.find(wordvec.back()) != de.npos) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num == 1 && le.find(wordvec[0]) != le.npos && + find(le_pos.begin(), le_pos.end(), pos) != le_pos.end()) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num > 1 && men.find(wordvec.back()) != men.npos && + find(men_pos.begin(), men_pos.end(), pos) != men_pos.end() && + find(must_not_neural_tone_words.begin(), + must_not_neural_tone_words.end(), + word) != must_not_neural_tone_words.end()) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num > 1 && weizhi.find(wordvec.back()) != weizhi.npos && + find(weizhi_pos.begin(), weizhi_pos.end(), pos) != + weizhi_pos.end()) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num > 1 && dong.find(wordvec.back()) != dong.npos && + fangxiang.find(wordvec[word_num - 2]) != fangxiang.npos) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if ((ge_idx != word_wstr.npos && ge_idx >= 1 && + xiushi.find(wordvec[ge_idx - 1]) != xiushi.npos) || + word_wstr == ge) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else { + if (find(must_neural_tone_words.begin(), + must_neural_tone_words.end(), + word) != must_neural_tone_words.end() || + (word_num >= 2 && + find(must_neural_tone_words.begin(), + must_neural_tone_words.end(), + ppspeech::wstring2utf8string(word_wstr.substr( + word_num - 2))) != must_neural_tone_words.end())) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } + } + + // 进行进一步分词,把长词切分更短些 + std::vector word_list; + if (0 != SplitWord(word, &word_list)) { + LOG(ERROR) << "Failed to split word."; + return -1; + } + // 创建对应的 韵母列表 + std::vector> finals_list; + std::vector finals_temp; + finals_temp.assign((*finals).begin(), + (*finals).begin() + + ppspeech::utf8string2wstring(word_list[0]).length()); + finals_list.push_back(finals_temp); + finals_temp.assign( + (*finals).begin() + ppspeech::utf8string2wstring(word_list[0]).length(), + (*finals).end()); + finals_list.push_back(finals_temp); + + finals = new std::vector(); + for (int i = 0; i < word_list.size(); i++) { + std::wstring temp_wstr = ppspeech::utf8string2wstring(word_list[i]); + if ((find(must_neural_tone_words.begin(), + must_neural_tone_words.end(), + word_list[i]) != must_neural_tone_words.end()) || + 
(temp_wstr.length() >= 2 && + find(must_neural_tone_words.begin(), + must_neural_tone_words.end(), + ppspeech::wstring2utf8string( + temp_wstr.substr(temp_wstr.length() - 2))) != + must_neural_tone_words.end())) { + finals_list[i].back() = finals_list[i].back().replace( + finals_list[i].back().length() - 1, 1, "5"); + } + (*finals).insert( + (*finals).end(), finals_list[i].begin(), finals_list[i].end()); + } + + return 0; +} + +int FrontEngineInterface::ThreeSandhi(const std::string &word, + std::vector *finals) { + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + std::vector> finals_list; + std::vector finals_temp; + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + return -1; + } + int word_num = wordvec.size(); + assert(word_num == word_wstr.length()); + + if (word_num == 2 && AllToneThree((*finals))) { + (*finals)[0] = (*finals)[0].replace((*finals)[0].length() - 1, 1, "2"); + } else if (word_num == 3) { + // 进行进一步分词,把长词切分更短些 + std::vector word_list; + if (0 != SplitWord(word, &word_list)) { + LOG(ERROR) << "Failed to split word."; + return -1; + } + if (AllToneThree((*finals))) { + std::wstring temp_wstr = ppspeech::utf8string2wstring(word_list[0]); + // disyllabic + monosyllabic, e.g. 蒙古/包 + if (temp_wstr.length() == 2) { + (*finals)[0] = + (*finals)[0].replace((*finals)[0].length() - 1, 1, "2"); + (*finals)[1] = + (*finals)[1].replace((*finals)[1].length() - 1, 1, "2"); + } else if (temp_wstr.length() == + 1) { // monosyllabic + disyllabic, e.g. 纸/老虎 + (*finals)[1] = + (*finals)[1].replace((*finals)[1].length() - 1, 1, "2"); + } + } else { + // 创建对应的 韵母列表 + finals_temp = {}; + finals_list = {}; + finals_temp.assign( + (*finals).begin(), + (*finals).begin() + + ppspeech::utf8string2wstring(word_list[0]).length()); + finals_list.push_back(finals_temp); + finals_temp.assign( + (*finals).begin() + + ppspeech::utf8string2wstring(word_list[0]).length(), + (*finals).end()); + finals_list.push_back(finals_temp); + + finals = new std::vector(); + for (int i = 0; i < finals_list.size(); i++) { + // e.g. 
所有/人 + if (AllToneThree(finals_list[i]) && + finals_list[i].size() == 2) { + finals_list[i][0] = finals_list[i][0].replace( + finals_list[i][0].length() - 1, 1, "2"); + } else if (i == 1 && !(AllToneThree(finals_list[i])) && + absl::EndsWith(finals_list[i][0], "3") == true && + absl::EndsWith(finals_list[0].back(), "3") == true) { + finals_list[0].back() = finals_list[0].back().replace( + finals_list[0].back().length() - 1, 1, "2"); + } + } + (*finals).insert( + (*finals).end(), finals_list[0].begin(), finals_list[0].end()); + (*finals).insert( + (*finals).end(), finals_list[1].begin(), finals_list[1].end()); + } + + } else if (word_num == 4) { //将成语拆分为两个长度为 2 的单词 + // 创建对应的 韵母列表 + finals_temp = {}; + finals_list = {}; + finals_temp.assign((*finals).begin(), (*finals).begin() + 2); + finals_list.push_back(finals_temp); + finals_temp.assign((*finals).begin() + 2, (*finals).end()); + finals_list.push_back(finals_temp); + + finals = new std::vector(); + for (int j = 0; j < finals_list.size(); j++) { + if (AllToneThree(finals_list[j])) { + finals_list[j][0] = finals_list[j][0].replace( + finals_list[j][0].length() - 1, 1, "2"); + } + (*finals).insert( + (*finals).end(), finals_list[j].begin(), finals_list[j].end()); + } + } + + return 0; +} + +int FrontEngineInterface::ModifyTone(const std::string &word, + const std::string &pos, + std::vector *finals) { + if ((0 != BuSandi(word, finals)) || (0 != YiSandhi(word, finals)) || + (0 != NeuralSandhi(word, pos, finals)) || + (0 != ThreeSandhi(word, finals))) { + LOG(ERROR) << "Failed to modify tone of the word: " << word; + return -1; + } + + return 0; +} + +std::vector> FrontEngineInterface::MergeErhua( + const std::vector &initials, + const std::vector &finals, + const std::string &word, + const std::string &pos) { + std::vector new_initials = {}; + std::vector new_finals = {}; + std::vector> new_initials_finals; + std::vector specified_pos = {"a", "j", "nr"}; + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + } + int word_num = wordvec.size(); + + if ((find(must_erhua.begin(), must_erhua.end(), word) == + must_erhua.end()) && + ((find(not_erhua.begin(), not_erhua.end(), word) != not_erhua.end()) || + (find(specified_pos.begin(), specified_pos.end(), pos) != + specified_pos.end()))) { + new_initials_finals.push_back(initials); + new_initials_finals.push_back(finals); + return new_initials_finals; + } + if (finals.size() != word_num) { + new_initials_finals.push_back(initials); + new_initials_finals.push_back(finals); + return new_initials_finals; + } + + assert(finals.size() == word_num); + for (int i = 0; i < finals.size(); i++) { + if (i == finals.size() - 1 && wordvec[i] == L"儿" && + (finals[i] == "er2" || finals[i] == "er5") && word_num >= 2 && + find(not_erhua.begin(), + not_erhua.end(), + ppspeech::wstring2utf8string(word_wstr.substr( + word_wstr.length() - 2))) == not_erhua.end() && + !new_finals.empty()) { + new_finals.back() = + new_finals.back().substr(0, new_finals.back().length() - 1) + + "r" + new_finals.back().substr(new_finals.back().length() - 1); + } else { + new_initials.push_back(initials[i]); + new_finals.push_back(finals[i]); + } + } + new_initials_finals.push_back(new_initials); + new_initials_finals.push_back(new_finals); + + return new_initials_finals; +} +} // namespace ppspeech diff --git a/demos/TTSCppFrontend/src/front/front_interface.h 
b/demos/TTSCppFrontend/src/front/front_interface.h new file mode 100644 index 000000000..fc33a4de6 --- /dev/null +++ b/demos/TTSCppFrontend/src/front/front_interface.h @@ -0,0 +1,198 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H +#define PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H + +#include +#include +#include +#include +#include +//#include "utils/dir_utils.h" +#include +#include "absl/strings/str_split.h" +#include "front/text_normalize.h" + + +namespace ppspeech { + +class FrontEngineInterface : public TextNormalizer { + public: + explicit FrontEngineInterface(std::string conf) : _conf_file(conf) { + TextNormalizer(); + _jieba = nullptr; + _initialed = false; + init(); + } + + int init(); + ~FrontEngineInterface() {} + + // 读取配置文件 + int ReadConfFile(); + + // 简体转繁体 + int Trand2Simp(const std::wstring &sentence, std::wstring *sentence_simp); + + // 生成字典 + int GenDict(const std::string &file, + std::map *map); + + // 由 词+词性的分词结果转为仅包含词的结果 + int GetSegResult(std::vector> *seg, + std::vector *seg_words); + + // 生成句子的音素,音调id。如果音素和音调未分开,则 toneids + // 为空(fastspeech2),反之则不为空(speedyspeech) + int GetSentenceIds(const std::string &sentence, + std::vector *phoneids, + std::vector *toneids); + + // 根据分词结果获取词的音素,音调id,并对读音进行适当修改 + // (ModifyTone)。如果音素和音调未分开,则 toneids + // 为空(fastspeech2),反之则不为空(speedyspeech) + int GetWordsIds( + const std::vector> &cut_result, + std::vector *phoneids, + std::vector *toneids); + + // 结巴分词生成包含词和词性的分词结果,再对分词结果进行适当修改 + // (MergeforModify) + int Cut(const std::string &sentence, + std::vector> *cut_result); + + // 字词到音素的映射,查找字典 + int GetPhone(const std::string &word, std::string *phone); + + // 音素到音素id + int Phone2Phoneid(const std::string &phone, + std::vector *phoneid, + std::vector *toneids); + + + // 根据韵母判断该词中每个字的读音都为第三声。true表示词中每个字都是第三声 + bool AllToneThree(const std::vector &finals); + + // 判断词是否是叠词 + bool IsReduplication(const std::string &word); + + // 获取每个字词的声母韵母列表 + int GetInitialsFinals(const std::string &word, + std::vector *word_initials, + std::vector *word_finals); + + // 获取每个字词的韵母列表 + int GetFinals(const std::string &word, + std::vector *word_finals); + + // 整个词转成向量形式,向量的每个元素对应词的一个字 + int Word2WordVec(const std::string &word, + std::vector *wordvec); + + // 将整个词重新进行 full cut,分词后,各个词会在词典中 + int SplitWord(const std::string &word, + std::vector *fullcut_word); + + // 对分词结果进行处理:对包含“不”字的分词结果进行整理 + std::vector> MergeBu( + std::vector> *seg_result); + + // 对分词结果进行处理:对包含“一”字的分词结果进行整理 + std::vector> Mergeyi( + std::vector> *seg_result); + + // 对分词结果进行处理:对前后相同的两个字进行合并 + std::vector> MergeReduplication( + std::vector> *seg_result); + + // 对一个词和后一个词他们的读音均为第三声的两个词进行合并 + std::vector> MergeThreeTones( + std::vector> *seg_result); + + // 对一个词的最后一个读音和后一个词的第一个读音为第三声的两个词进行合并 + std::vector> MergeThreeTones2( + std::vector> *seg_result); + + // 对分词结果进行处理:对包含“儿”字的分词结果进行整理 + std::vector> MergeEr( + std::vector> *seg_result); + + // 对分词结果进行处理、修改 + int 
MergeforModify( + std::vector> *seg_result, + std::vector> *merge_seg_result); + + + // 对包含“不”字的相关词音调进行修改 + int BuSandi(const std::string &word, std::vector *finals); + + // 对包含“一”字的相关词音调进行修改 + int YiSandhi(const std::string &word, std::vector *finals); + + // 对一些特殊词(包括量词,语助词等)的相关词音调进行修改 + int NeuralSandhi(const std::string &word, + const std::string &pos, + std::vector *finals); + + // 对包含第三声的相关词音调进行修改 + int ThreeSandhi(const std::string &word, std::vector *finals); + + // 对字词音调进行处理、修改 + int ModifyTone(const std::string &word, + const std::string &pos, + std::vector *finals); + + + // 对儿化音进行处理 + std::vector> MergeErhua( + const std::vector &initials, + const std::vector &finals, + const std::string &word, + const std::string &pos); + + + private: + bool _initialed; + cppjieba::Jieba *_jieba; + std::vector _punc; + std::vector _punc_omit; + + std::string _conf_file; + std::map conf_map; + std::map word_phone_map; + std::map phone_id_map; + std::map tone_id_map; + std::map trand_simp_map; + + + std::string _jieba_dict_path; + std::string _jieba_hmm_path; + std::string _jieba_user_dict_path; + std::string _jieba_idf_path; + std::string _jieba_stop_word_path; + + std::string _seperate_tone; + std::string _word2phone_path; + std::string _phone2id_path; + std::string _tone2id_path; + std::string _trand2simp_path; + + std::vector must_erhua; + std::vector not_erhua; + + std::vector must_not_neural_tone_words; + std::vector must_neural_tone_words; +}; +} // namespace ppspeech +#endif \ No newline at end of file diff --git a/demos/TTSCppFrontend/src/front/text_normalize.cpp b/demos/TTSCppFrontend/src/front/text_normalize.cpp new file mode 100644 index 000000000..8420e8407 --- /dev/null +++ b/demos/TTSCppFrontend/src/front/text_normalize.cpp @@ -0,0 +1,542 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "front/text_normalize.h" + +namespace ppspeech { + +// 初始化 digits_map and unit_map +int TextNormalizer::InitMap() { + digits_map["0"] = "零"; + digits_map["1"] = "一"; + digits_map["2"] = "二"; + digits_map["3"] = "三"; + digits_map["4"] = "四"; + digits_map["5"] = "五"; + digits_map["6"] = "六"; + digits_map["7"] = "七"; + digits_map["8"] = "八"; + digits_map["9"] = "九"; + + units_map[1] = "十"; + units_map[2] = "百"; + units_map[3] = "千"; + units_map[4] = "万"; + units_map[8] = "亿"; + + return 0; +} + +// 替换 +int TextNormalizer::Replace(std::wstring *sentence, + const int &pos, + const int &len, + const std::wstring &repstr) { + // 删除原来的 + sentence->erase(pos, len); + // 插入新的 + sentence->insert(pos, repstr); + return 0; +} + +// 根据标点符号切分句子 +int TextNormalizer::SplitByPunc(const std::wstring &sentence, + std::vector *sentence_part) { + std::wstring temp = sentence; + std::wregex reg(L"[:,;。?!,;?!]"); + std::wsmatch match; + + while (std::regex_search(temp, match, reg)) { + sentence_part->push_back( + temp.substr(0, match.position(0) + match.length(0))); + Replace(&temp, 0, match.position(0) + match.length(0), L""); + } + // 如果最后没有标点符号 + if (temp != L"") { + sentence_part->push_back(temp); + } + return 0; +} + +// 数字转文本,10200 - > 一万零二百 +std::string TextNormalizer::CreateTextValue(const std::string &num_str, + bool use_zero) { + std::string num_lstrip = + std::string(absl::StripPrefix(num_str, "0")).data(); + int len = num_lstrip.length(); + + if (len == 0) { + return ""; + } else if (len == 1) { + if (use_zero && (len < num_str.length())) { + return digits_map["0"] + digits_map[num_lstrip]; + } else { + return digits_map[num_lstrip]; + } + } else { + int largest_unit = 0; // 最大单位 + std::string first_part; + std::string second_part; + + if (len > 1 && len <= 2) { + largest_unit = 1; + } else if (len > 2 && len <= 3) { + largest_unit = 2; + } else if (len > 3 && len <= 4) { + largest_unit = 3; + } else if (len > 4 && len <= 8) { + largest_unit = 4; + } else if (len > 8) { + largest_unit = 8; + } + + first_part = num_str.substr(0, num_str.length() - largest_unit); + second_part = num_str.substr(num_str.length() - largest_unit); + + return CreateTextValue(first_part, use_zero) + units_map[largest_unit] + + CreateTextValue(second_part, use_zero); + } +} + +// 数字一个一个对应,可直接用于年份,电话,手机, +std::string TextNormalizer::SingleDigit2Text(const std::string &num_str, + bool alt_one) { + std::string text = ""; + if (alt_one) { + digits_map["1"] = "幺"; + } else { + digits_map["1"] = "一"; + } + + for (size_t i = 0; i < num_str.size(); i++) { + std::string num_int(1, num_str[i]); + if (digits_map.find(num_int) == digits_map.end()) { + LOG(ERROR) << "digits_map doesn't have key: " << num_int; + } + text += digits_map[num_int]; + } + + return text; +} + +std::string TextNormalizer::SingleDigit2Text(const std::wstring &num, + bool alt_one) { + std::string num_str = wstring2utf8string(num); + return SingleDigit2Text(num_str, alt_one); +} + +// 数字整体对应,可直接用于月份,日期,数值整数部分 +std::string TextNormalizer::MultiDigit2Text(const std::string &num_str, + bool alt_one, + bool use_zero) { + LOG(INFO) << "aaaaaaaaaaaaaaaa: " << alt_one << use_zero; + if (alt_one) { + digits_map["1"] = "幺"; + } else { + digits_map["1"] = "一"; + } + + std::wstring result = + utf8string2wstring(CreateTextValue(num_str, use_zero)); + std::wstring result_0(1, result[0]); + std::wstring result_1(1, result[1]); + // 一十八 --> 十八 + if ((result_0 == utf8string2wstring(digits_map["1"])) && + (result_1 == utf8string2wstring(units_map[1]))) { + return 
wstring2utf8string(result.substr(1, result.length())); + } else { + return wstring2utf8string(result); + } +} + +std::string TextNormalizer::MultiDigit2Text(const std::wstring &num, + bool alt_one, + bool use_zero) { + std::string num_str = wstring2utf8string(num); + return MultiDigit2Text(num_str, alt_one, use_zero); +} + +// 数字转文本,包括整数和小数 +std::string TextNormalizer::Digits2Text(const std::string &num_str) { + std::string text; + std::vector integer_decimal; + integer_decimal = absl::StrSplit(num_str, "."); + + if (integer_decimal.size() == 1) { // 整数 + text = MultiDigit2Text(integer_decimal[0]); + } else if (integer_decimal.size() == 2) { // 小数 + if (integer_decimal[0] == "") { // 无整数的小数类型,例如:.22 + text = "点" + + SingleDigit2Text( + std::string(absl::StripSuffix(integer_decimal[1], "0")) + .data()); + } else { // 常规小数类型,例如:12.34 + text = MultiDigit2Text(integer_decimal[0]) + "点" + + SingleDigit2Text( + std::string(absl::StripSuffix(integer_decimal[1], "0")) + .data()); + } + } else { + return "The value does not conform to the numeric format"; + } + + return text; +} + +std::string TextNormalizer::Digits2Text(const std::wstring &num) { + std::string num_str = wstring2utf8string(num); + return Digits2Text(num_str); +} + +// 日期,2021年8月18日 --> 二零二一年八月十八日 +int TextNormalizer::ReData(std::wstring *sentence) { + std::wregex reg( + L"(\\d{4}|\\d{2})年((0?[1-9]|1[0-2])月)?(((0?[1-9])|((1|2)[0-9])|30|31)" + L"([日号]))?"); + std::wsmatch match; + std::string rep; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + rep += SingleDigit2Text(match[1]) + "年"; + if (match[3] != L"") { + rep += MultiDigit2Text(match[3], false, false) + "月"; + } + if (match[5] != L"") { + rep += MultiDigit2Text(match[5], false, false) + + wstring2utf8string(match[9]); + } + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + + +// XX-XX-XX or XX/XX/XX 例如:2021/08/18 --> 二零二一年八月十八日 +int TextNormalizer::ReData2(std::wstring *sentence) { + std::wregex reg( + L"(\\d{4})([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])"); + std::wsmatch match; + std::string rep; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + rep += (SingleDigit2Text(match[1]) + "年"); + rep += (MultiDigit2Text(match[3], false, false) + "月"); + rep += (MultiDigit2Text(match[4], false, false) + "日"); + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// XX:XX:XX 09:09:02 --> 九点零九分零二秒 +int TextNormalizer::ReTime(std::wstring *sentence) { + std::wregex reg(L"([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?"); + std::wsmatch match; + std::string rep; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + rep += (MultiDigit2Text(match[1], false, false) + "点"); + if (absl::StartsWith(wstring2utf8string(match[2]), "0")) { + rep += "零"; + } + rep += (MultiDigit2Text(match[2]) + "分"); + if (absl::StartsWith(wstring2utf8string(match[4]), "0")) { + rep += "零"; + } + rep += (MultiDigit2Text(match[4]) + "秒"); + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 温度,例如:-24.3℃ --> 零下二十四点三度 +int TextNormalizer::ReTemperature(std::wstring *sentence) { + std::wregex reg(L"(-?)(\\d+(\\.\\d+)?)(°C|℃|度|摄氏度)"); + std::wsmatch match; + std::string rep; + std::string sign; + std::vector integer_decimal; + std::string unit; + + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? sign = "负" : sign = ""; + match[4] == L"摄氏度" ? 
unit = "摄氏度" : unit = "度"; + rep = sign + Digits2Text(match[2]) + unit; + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 分数,例如: 1/3 --> 三分之一 +int TextNormalizer::ReFrac(std::wstring *sentence) { + std::wregex reg(L"(-?)(\\d+)/(\\d+)"); + std::wsmatch match; + std::string sign; + std::string rep; + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? sign = "负" : sign = ""; + rep = sign + MultiDigit2Text(match[3]) + "分之" + + MultiDigit2Text(match[2]); + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 百分数,例如:45.5% --> 百分之四十五点五 +int TextNormalizer::RePercentage(std::wstring *sentence) { + std::wregex reg(L"(-?)(\\d+(\\.\\d+)?)%"); + std::wsmatch match; + std::string sign; + std::string rep; + std::vector integer_decimal; + + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? sign = "负" : sign = ""; + rep = sign + "百分之" + Digits2Text(match[2]); + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 手机号码,例如:+86 18883862235 --> 八六幺八八八三八六二二三五 +int TextNormalizer::ReMobilePhone(std::wstring *sentence) { + std::wregex reg( + L"(\\d)?((\\+?86 ?)?1([38]\\d|5[0-35-9]|7[678]|9[89])\\d{8})(\\d)?"); + std::wsmatch match; + std::string rep; + std::vector country_phonenum; + + while (std::regex_search(*sentence, match, reg)) { + country_phonenum = absl::StrSplit(wstring2utf8string(match[0]), "+"); + rep = ""; + for (int i = 0; i < country_phonenum.size(); i++) { + LOG(INFO) << country_phonenum[i]; + rep += SingleDigit2Text(country_phonenum[i], true); + } + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 座机号码,例如:010-51093154 --> 零幺零五幺零九三幺五四 +int TextNormalizer::RePhone(std::wstring *sentence) { + std::wregex reg( + L"(\\d)?((0(10|2[1-3]|[3-9]\\d{2})-?)?[1-9]\\d{6,7})(\\d)?"); + std::wsmatch match; + std::vector zone_phonenum; + std::string rep; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + zone_phonenum = absl::StrSplit(wstring2utf8string(match[0]), "-"); + for (int i = 0; i < zone_phonenum.size(); i++) { + rep += SingleDigit2Text(zone_phonenum[i], true); + } + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 范围,例如:60~90 --> 六十到九十 +int TextNormalizer::ReRange(std::wstring *sentence) { + std::wregex reg( + L"((-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+)))[-~]((-?)((\\d+)(\\.\\d+)?)|(\\.(" + L"\\d+)))"); + std::wsmatch match; + std::string rep; + std::string sign1; + std::string sign2; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + match[2] == L"-" ? sign1 = "负" : sign1 = ""; + if (match[6] != L"") { + rep += sign1 + Digits2Text(match[6]) + "到"; + } else { + rep += sign1 + Digits2Text(match[3]) + "到"; + } + match[9] == L"-" ? 
sign2 = "负" : sign2 = ""; + if (match[13] != L"") { + rep += sign2 + Digits2Text(match[13]); + } else { + rep += sign2 + Digits2Text(match[10]); + } + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 带负号的整数,例如:-10 --> 负十 +int TextNormalizer::ReInterger(std::wstring *sentence) { + std::wregex reg(L"(-)(\\d+)"); + std::wsmatch match; + std::string rep; + while (std::regex_search(*sentence, match, reg)) { + rep = "负" + MultiDigit2Text(match[2]); + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 纯小数 +int TextNormalizer::ReDecimalNum(std::wstring *sentence) { + std::wregex reg(L"(-?)((\\d+)(\\.\\d+))|(\\.(\\d+))"); + std::wsmatch match; + std::string sign; + std::string rep; + // std::vector integer_decimal; + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? sign = "负" : sign = ""; + if (match[5] != L"") { + rep = sign + Digits2Text(match[5]); + } else { + rep = sign + Digits2Text(match[2]); + } + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 正整数 + 量词 +int TextNormalizer::RePositiveQuantifiers(std::wstring *sentence) { + std::wstring common_quantifiers = + L"(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|" + L"担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|" + L"溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|" + L"本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|" + L"毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|" + L"合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|" + L"卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|" + L"夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|" + L"元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|" + L"百万|万|千|百|)块|角|毛|分)"; + std::wregex reg(L"(\\d+)([多余几])?" + common_quantifiers); + std::wsmatch match; + std::string rep; + while (std::regex_search(*sentence, match, reg)) { + rep = MultiDigit2Text(match[1]); + Replace(sentence, + match.position(1), + match.length(1), + utf8string2wstring(rep)); + } + + return 0; +} + +// 编号类数字,例如: 89757 --> 八九七五七 +int TextNormalizer::ReDefalutNum(std::wstring *sentence) { + std::wregex reg(L"\\d{3}\\d*"); + std::wsmatch match; + while (std::regex_search(*sentence, match, reg)) { + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(SingleDigit2Text(match[0]))); + } + + return 0; +} + +int TextNormalizer::ReNumber(std::wstring *sentence) { + std::wregex reg(L"(-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+))"); + std::wsmatch match; + std::string sign; + std::string rep; + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? 
sign = "负" : sign = ""; + if (match[5] != L"") { + rep = sign + Digits2Text(match[5]); + } else { + rep = sign + Digits2Text(match[2]); + } + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + return 0; +} + +// 整体正则,按顺序 +int TextNormalizer::SentenceNormalize(std::wstring *sentence) { + ReData(sentence); + ReData2(sentence); + ReTime(sentence); + ReTemperature(sentence); + ReFrac(sentence); + RePercentage(sentence); + ReMobilePhone(sentence); + RePhone(sentence); + ReRange(sentence); + ReInterger(sentence); + ReDecimalNum(sentence); + RePositiveQuantifiers(sentence); + ReDefalutNum(sentence); + ReNumber(sentence); + return 0; +} +} // namespace ppspeech \ No newline at end of file diff --git a/demos/TTSCppFrontend/src/front/text_normalize.h b/demos/TTSCppFrontend/src/front/text_normalize.h new file mode 100644 index 000000000..4383fa1b4 --- /dev/null +++ b/demos/TTSCppFrontend/src/front/text_normalize.h @@ -0,0 +1,77 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H +#define PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H + +#include +#include +#include +#include +#include +#include "absl/strings/str_split.h" +#include "absl/strings/strip.h" +#include "base/type_conv.h" + +namespace ppspeech { + +class TextNormalizer { + public: + TextNormalizer() { InitMap(); } + ~TextNormalizer() {} + + int InitMap(); + int Replace(std::wstring *sentence, + const int &pos, + const int &len, + const std::wstring &repstr); + int SplitByPunc(const std::wstring &sentence, + std::vector *sentence_part); + + std::string CreateTextValue(const std::string &num, bool use_zero = true); + std::string SingleDigit2Text(const std::string &num_str, + bool alt_one = false); + std::string SingleDigit2Text(const std::wstring &num, bool alt_one = false); + std::string MultiDigit2Text(const std::string &num_str, + bool alt_one = false, + bool use_zero = true); + std::string MultiDigit2Text(const std::wstring &num, + bool alt_one = false, + bool use_zero = true); + std::string Digits2Text(const std::string &num_str); + std::string Digits2Text(const std::wstring &num); + + int ReData(std::wstring *sentence); + int ReData2(std::wstring *sentence); + int ReTime(std::wstring *sentence); + int ReTemperature(std::wstring *sentence); + int ReFrac(std::wstring *sentence); + int RePercentage(std::wstring *sentence); + int ReMobilePhone(std::wstring *sentence); + int RePhone(std::wstring *sentence); + int ReRange(std::wstring *sentence); + int ReInterger(std::wstring *sentence); + int ReDecimalNum(std::wstring *sentence); + int RePositiveQuantifiers(std::wstring *sentence); + int ReDefalutNum(std::wstring *sentence); + int ReNumber(std::wstring *sentence); + int SentenceNormalize(std::wstring *sentence); + + + private: + std::map digits_map; + std::map units_map; +}; +} // namespace ppspeech + +#endif \ No newline at end of file diff --git 
a/demos/TTSCppFrontend/third-party/CMakeLists.txt b/demos/TTSCppFrontend/third-party/CMakeLists.txt new file mode 100644 index 000000000..0579b8f24 --- /dev/null +++ b/demos/TTSCppFrontend/third-party/CMakeLists.txt @@ -0,0 +1,64 @@ +cmake_minimum_required(VERSION 3.10) +project(tts_third_party_libs) + +include(ExternalProject) + +# gflags +ExternalProject_Add(gflags + GIT_REPOSITORY https://github.com/gflags/gflags.git + GIT_TAG v2.2.2 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_STATIC_LIBS=OFF + -DBUILD_SHARED_LIBS=ON +) + +# glog +ExternalProject_Add( + glog + GIT_REPOSITORY https://github.com/google/glog.git + GIT_TAG v0.6.0 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + DEPENDS gflags +) + +# abseil +ExternalProject_Add( + abseil + GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git + GIT_TAG 20230125.1 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DABSL_PROPAGATE_CXX_STD=ON +) + +# cppjieba (header-only) +ExternalProject_Add( + cppjieba + GIT_REPOSITORY https://github.com/yanyiwu/cppjieba.git + GIT_TAG v5.0.3 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +# limonp (header-only) +ExternalProject_Add( + limonp + GIT_REPOSITORY https://github.com/yanyiwu/limonp.git + GIT_TAG v0.6.6 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 1d33b694b..c15d0601c 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -9,7 +9,7 @@ This demo is an implementation of starting the streaming speech service and acce Streaming ASR server only support `websocket` protocol, and doesn't support `http` protocol. -服务接口定义请参考: +For service interface definitions, please refer to: - [PaddleSpeech Streaming Server WebSocket API](https://github.com/PaddlePaddle/PaddleSpeech/wiki/PaddleSpeech-Server-WebSocket-API) ## Usage @@ -23,7 +23,7 @@ You can choose one way from easy, meduim and hard to install paddlespeech. **If you install in easy mode, you need to prepare the yaml file by yourself, you can refer to ### 2. Prepare config File -The configuration file can be found in `conf/ws_application.yaml` 和 `conf/ws_conformer_wenetspeech_application.yaml`. +The configuration file can be found in `conf/ws_application.yaml` or `conf/ws_conformer_wenetspeech_application.yaml`. At present, the speech tasks integrated by the model include: DeepSpeech2 and conformer. 
@@ -87,7 +87,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav server_executor = ServerExecutor() server_executor( - config_file="./conf/ws_conformer_wenetspeech_application.yaml", + config_file="./conf/ws_conformer_wenetspeech_application_faster.yaml", log_file="./log/paddlespeech.log") ``` diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 1902a2fa9..26a6ce404 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -90,7 +90,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav server_executor = ServerExecutor() server_executor( - config_file="./conf/ws_conformer_wenetspeech_application", + config_file="./conf/ws_conformer_wenetspeech_application_faster.yaml", log_file="./log/paddlespeech.log") ``` diff --git a/docs/images/note_map.png b/docs/images/note_map.png new file mode 100644 index 000000000..f280d98c4 Binary files /dev/null and b/docs/images/note_map.png differ diff --git a/docs/requirements.txt b/docs/requirements.txt index c2d56bf91..30622230b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -38,8 +38,8 @@ sphinx-markdown-tables sphinx_rtd_theme textgrid timer -ToJyutping -typeguard +ToJyutping==0.2.1 +typeguard==2.13.3 webrtcvad websockets yacs~=0.1.8 diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 634be7b7f..9e9221779 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -25,7 +25,7 @@ Model | Pre-Train Method | Pre-Train Data | Finetune Data | Size | Descriptions [Wav2vec2-large-960h-lv60-self Model](https://paddlespeech.bj.bcebos.com/wav2vec/wav2vec2-large-960h-lv60-self.pdparams) | wav2vec2 | Librispeech and LV-60k Dataset (5.3w h) | - | 1.18 GB |Pre-trained Wav2vec2.0 Model | - | - | - | [Wav2vec2ASR-large-960h-librispeech Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.1.model.tar.gz) | wav2vec2 | Librispeech and LV-60k Dataset (5.3w h) | Librispeech (960 h) | 718 MB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | - | 0.0189 | [Wav2vecASR Librispeech ASR3](../../examples/librispeech/asr3) | [Wav2vec2-large-wenetspeech-self Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2-large-wenetspeech-self_ckpt_1.3.0.model.tar.gz) | wav2vec2 | Wenetspeech Dataset (1w h) | - | 714 MB |Pre-trained Wav2vec2.0 Model | - | - | - | -[Wav2vec2ASR-large-aishell1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz) | wav2vec2 | Wenetspeech Dataset (1w h) | aishell1 (train set) | 1.17 GB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | 0.0453 | - | - | +[Wav2vec2ASR-large-aishell1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz) | wav2vec2 | Wenetspeech Dataset (1w h) | aishell1 (train set) | 1.18 GB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | 0.0510 | - | - | ### Whisper Model Demo Link | Training Data | Size | Descriptions | CER | Model diff --git a/docs/source/tts/svs_music_score.md b/docs/source/tts/svs_music_score.md new file mode 100644 index 000000000..9f351c001 --- /dev/null +++ b/docs/source/tts/svs_music_score.md @@ -0,0 +1,183 @@ +本人非音乐专业人士,如文档中有误欢迎指正。 + +# 一、常见基础 +## 1.1 简谱和音名(note) +

+(图:钢琴键盘与音名对照示意图)

+
+上图从左往右的黑键音名分别是:C#/Db,D#/Eb,F#/Gb,G#/Ab,A#/Bb
+钢琴 88 键如下图,分为大字一组、大字组、小字组、小字一组、小字二组、小字三组、小字四组,分别对应音名的后缀 1 2 3 4 5 6 7。例如小字一组(C 大调)包含的键分别为:C4,C#4/Db4,D4,D#4/Eb4,E4,F4,F#4/Gb4,G4,G#4/Ab4,A4,A#4/Bb4,B4
+钢琴八度音就是 12345671 这八个音,最后一个音是高音 1。**遵循:全全半全全全半** 就会得到 1 2 3 4 5 6 7 (高)1 的音。
+

+(图:钢琴 88 键及音组划分示意图)
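+
+下面用一段示意性的 Python 片段(非本仓库代码,函数名与变量名均为示意)按组号后缀列出某一音组内 12 个键的音名,结果与上文小字一组的例子一致:
+
+```python
+# 一个八度内的 12 个半音,黑键同时给出升号与降号两种写法
+NAMES = ["C", "C#/Db", "D", "D#/Eb", "E", "F",
+         "F#/Gb", "G", "G#/Ab", "A", "A#/Bb", "B"]
+
+def group_keys(octave: int):
+    """列出后缀为 octave 的音组内 12 个键,如 octave=4 对应小字一组。"""
+    keys = []
+    for name in NAMES:
+        # 黑键形如 "C#/Db",两种写法都要带上组号后缀
+        keys.append("/".join(f"{part}{octave}" for part in name.split("/")))
+    return keys
+
+print(group_keys(4))
+# ['C4', 'C#4/Db4', 'D4', 'D#4/Eb4', 'E4', 'F4',
+#  'F#4/Gb4', 'G4', 'G#4/Ab4', 'A4', 'A#4/Bb4', 'B4']
+```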

+ +## 1.2 十二大调 +“#”表示升调 + +

+(图:带升号(#)的大调示意图)

+ +“b”表示降调 + +

+(图:带降号(b)的大调示意图)

+ +什么大调表示Do(简谱1) 这个音从哪个键开始,例如D大调,则用D这个键来表示 Do这个音。 +下图是十二大调下简谱与音名的对应表。 + +

+(图:十二大调下简谱与音名对应表)
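+
+十二大调下简谱与音名的对应关系也可以按“全全半全全全半”直接推出。下面是一段示意性的 Python 片段(非本仓库代码,函数名为示意,黑键仅用升号写法):
+
+```python
+# 以升号写法表示的 12 个半音
+CHROMATIC = ["C", "C#", "D", "D#", "E", "F",
+             "F#", "G", "G#", "A", "A#", "B"]
+# 大调音阶的音程关系:全全半全全全半(以半音个数表示)
+MAJOR_STEPS = [2, 2, 1, 2, 2, 2, 1]
+
+def major_scale(tonic: str):
+    """返回以 tonic 为 Do(简谱 1)的大调中 1~7 对应的音名。"""
+    idx = CHROMATIC.index(tonic)
+    scale = [tonic]
+    for step in MAJOR_STEPS[:-1]:  # 最后一个“半”回到高音 1,不再列出
+        idx = (idx + step) % 12
+        scale.append(CHROMATIC[idx])
+    return scale
+
+print(major_scale("D"))   # ['D', 'E', 'F#', 'G', 'A', 'B', 'C#'],即 D 大调用 D 表示 Do
+print(major_scale("D#"))  # D#(等音 bE)大调,即降E大调下 1~7 对应的音名
+```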

+
+
+## 1.3 Tempo
+Tempo 用于表示速度(Speed of the beat/pulse),即一分钟里面有几拍(beats per minute,BPM),各类音符对应的拍数及换算示例见下。
+

+(图:音符时值示意图)

+ +whole note --> 4 beats
+half note --> 2 beats
+quarter note --> 1 beat
+eighth note --> 1/2 beat
+sixteenth note --> 1/4 beat
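+
+结合上面的拍数关系,可以由 BPM 直接算出各音符的时长(即下文 music score 中的 note_dur,单位:秒)。下面是一段示意性的 Python 片段(非本仓库代码,函数名为示意),以下文 2.1 节用到的 95 BPM、四分音符为一拍为例:
+
+```python
+# 各类音符对应的拍数,与上面的列表一致
+NOTE_BEATS = {
+    "whole": 4.0,       # 全音符
+    "half": 2.0,        # 二分音符
+    "quarter": 1.0,     # 四分音符
+    "eighth": 0.5,      # 八分音符
+    "sixteenth": 0.25,  # 十六分音符
+}
+
+def note_dur(bpm: float, note: str) -> float:
+    """一拍时长 = 60 / bpm 秒,再乘以该音符对应的拍数。"""
+    return 60.0 / bpm * NOTE_BEATS[note]
+
+print(note_dur(95, "quarter"))  # 0.6315...,即 2.1 节中的一拍时长 0.631578s
+print(note_dur(95, "eighth"))   # 0.3157...,即半拍时长 0.315789s
+```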
+ + +# 二、应用试验 +## 2.1 从谱中获取 music scores +music scores 包含:note,note_dur,is_slur + +
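+
+在代码中,music score 可以组织成每个 phone 对应一条 (note, note_dur, is_slur) 记录的形式。下面是一段示意(非本仓库的实际数据格式,取值来自下文表格的前两个字“小酒”):
+
+```python
+# 每个元素为 (phone, note, note_dur, is_slur)
+music_score = [
+    ("x",   "A#3/Bb3", 0.315789, 0),
+    ("iao", "A#3/Bb3", 0.315789, 0),
+    ("j",   "D#4/Eb4", 0.315789, 0),
+    ("iu",  "D#4/Eb4", 0.315789, 0),
+]
+```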

+(图:待分析歌曲的简谱片段,左上角标有 bE、quarter note = 95 与 4/4 等谱信息)

+
+从左上角的谱信息 *bE* 可以得出该谱子是 **降E大调**,可以对照 1.2 小节的十二大调简谱音名对照表,根据简谱获取对应的 note
+从左上角的谱信息 *quarter note* 可以得出该谱子的速度是 **一分钟95拍(beat)**,一拍的时长 = **60/95 = 0.631578s**
+从左上角的谱信息 *4/4* 可以得出该谱子表示四分音符为一拍(分母的4),每小节有4拍(分子的4)
+
+从该简谱上可以获取 music score 如下:
+
+|text |phone |简谱(辅助)后面的点表示高八度 |note (从小字组开始算) |几拍(辅助) |note_dur |is_slur|
+| :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+|小 |x |5 |A#3/Bb3 |半 |0.315789 |0 |
+| |iao |5 |A#3/Bb3 |半 |0.315789 |0 |
+|酒 |j |1. |D#4/Eb4 |半 |0.315789 |0 |
+| |iu |1. |D#4/Eb4 |半 |0.315789 |0 |
+|窝 |w |2. |F4 |半 |0.315789 |0 |
+| |o |2. |F4 |半 |0.315789 |0 |
+|长 |ch |3. |G4 |半 |0.315789 |0 |
+| |ang |3. |G4 |半 |0.315789 |0 |
+| |ang |1. |D#4/Eb4 |半 |0.315789 |1 |
+|睫 |j |1. |D#4/Eb4 |半 |0.315789 |0 |
+| |ie |1. |D#4/Eb4 |半 |0.315789 |0 |
+| |ie |5 |A#3/Bb3 |半 |0.315789 |1 |
+|毛 |m |5 |A#3/Bb3 |一 |0.631578 |0 |
+| |ao |5 |A#3/Bb3 |一 |0.631578 |0 |
+|是 |sh |5 |A#3/Bb3 |半 |0.315789 |0 |
+| |i |5 |A#3/Bb3 |半 |0.315789 |0 |
+|你 |n |3. |G4 |半 |0.315789 |0 |
+| |i |3. |G4 |半 |0.315789 |0 |
+|最 |z |2. |F4 |半 |0.315789 |0 |
+| |ui |2. |F4 |半 |0.315789 |0 |
+|美 |m |3. |G4 |半 |0.315789 |0 |
+| |ei |3. |G4 |半 |0.315789 |0 |
+|的 |d |2. |F4 |半 |0.315789 |0 |
+| |e |2. |F4 |半 |0.315789 |0 |
+|记 |j |7 |D4 |半 |0.315789 |0 |
+| |i |7 |D4 |半 |0.315789 |0 |
+|号 |h |5 |A#3/Bb3 |半 |0.315789 |0 |
+| |ao |5 |A#3/Bb3 |半 |0.315789 |0 |
+
+
+## 2.2 一些实验
+
+| 序号 | 说明 | 合成音频(diffsinger_opencpop + pwgan_opencpop)|
+| :-----: | :----- | :-----: |
+| 1 | 原始 opencpop 标注的 notes,note_durs,is_slurs,升F大调,起始在小字组(第3组) | (音频略) |
+| 2 | 原始 opencpop 标注的 notes 和 is_slurs,note_durs 改变(从谱子获取) | (音频略) |
+| 3 | 原始 opencpop 标注的 notes 去掉 rest(毛字一拍),is_slurs 和 note_durs 改变(从谱子获取) | (音频略) |
+| 4 | 从谱子获取 notes,note_durs,is_slurs,不含 rest(毛字一拍),起始在小字一组(第3组) | (音频略) |
+| 5 | 从谱子获取 notes,note_durs,is_slurs,加上 rest(毛字半拍,rest 半拍),起始在小字一组(第3组) | (音频略) |
+| 6 | 从谱子获取 notes,is_slurs,包含 rest,note_durs 从原始标注获取,起始在小字一组(第3组) | (音频略) |
+| 7 | 从谱子获取 notes,note_durs,is_slurs,不含 rest(毛字一拍),起始在小字一组(第4组) | (音频略) |
+
+ + +上述实验表明通过该方法来提取 music score 是可行的,但是在应用中可以**灵活地在歌词中加"AP"(用来表示吸气声)和"SP"(用来表示停顿声)**,对应的在 **note 上加 rest**,会使得整体的歌声合成更自然。 +除此之外,还要考虑哪一个大调并且以哪一组为起始**得到的 note 在训练数据集中出现过**,如若推理时传入训练数据中没有见过的 note, 合成出来的音频可能不是我们期待的音调。 + + +# 三、其他 +## 3.1 读取midi + +```python +import mido +mid = mido.MidiFile('2093.midi') +``` diff --git a/examples/aishell/asr1/conf/chunk_squeezeformer.yaml b/examples/aishell/asr1/conf/chunk_squeezeformer.yaml new file mode 100644 index 000000000..35a90b7d6 --- /dev/null +++ b/examples/aishell/asr1/conf/chunk_squeezeformer.yaml @@ -0,0 +1,98 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: squeezeformer +encoder_conf: + encoder_dim: 256 # dimension of attention + output_size: 256 # dimension of output + attention_heads: 4 + num_blocks: 12 # the number of encoder blocks + reduce_idx: 5 + recover_idx: 11 + feed_forward_expansion_factor: 8 + input_dropout_rate: 0.1 + feed_forward_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + adaptive_scale: true + cnn_module_kernel: 31 + normalize_before: false + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + time_reduction_layer_type: 'stream' + causal: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false + +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### + +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + + +########################################### +# Dataloader # +########################################### + +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 1 +global_grad_clip: 5.0 +dist_sampler: True +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/squeezeformer.yaml b/examples/aishell/asr1/conf/squeezeformer.yaml new file mode 100644 index 000000000..b7841aca5 --- /dev/null +++ b/examples/aishell/asr1/conf/squeezeformer.yaml @@ -0,0 +1,93 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related 
+encoder: squeezeformer +encoder_conf: + encoder_dim: 256 # dimension of attention + output_size: 256 # dimension of output + attention_heads: 4 + num_blocks: 12 # the number of encoder blocks + reduce_idx: 5 + recover_idx: 11 + feed_forward_expansion_factor: 8 + input_dropout_rate: 0.1 + feed_forward_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + adaptive_scale: true + cnn_module_kernel: 31 + normalize_before: false + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + time_reduction_layer_type: 'conv1d' + +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 150 +accum_grad: 8 +global_grad_clip: 5.0 +dist_sampler: False +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr3/README.md b/examples/aishell/asr3/README.md index e5806d621..6b587e12f 100644 --- a/examples/aishell/asr3/README.md +++ b/examples/aishell/asr3/README.md @@ -164,8 +164,8 @@ using the `tar` scripts to unpack the model and then you can use the script to t For example: ```bash -wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz -tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz +wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz +tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz source path.sh # If you have process the data and get the manifest file, you can skip the following 2 steps bash local/data.sh --stage -1 --stop_stage -1 @@ -185,14 +185,14 @@ In some situations, you want to use the trained model to do the inference for th ``` you can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model through the script below: ```bash -wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz -tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz 
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz +tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz ``` You can download the audio demo: ```bash -wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ ``` You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below. ```bash -CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav +CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_01_03.wav ``` diff --git a/examples/aishell/asr3/RESULT.md b/examples/aishell/asr3/RESULT.md new file mode 100644 index 000000000..42edeac11 --- /dev/null +++ b/examples/aishell/asr3/RESULT.md @@ -0,0 +1,18 @@ +# AISHELL + +## Version + +* paddle version: develop (commit id: daea892c67e85da91906864de40ce9f6f1b893ae) +* paddlespeech version: develop (commit id: c14b4238b256693281e59605abff7c9435b3e2b2) +* paddlenlp version: 2.5.2 + +## Device +* python: 3.7 +* cuda: 10.2 +* cudnn: 7.6 + +## Result +train: Epoch 80, 2*V100-32G, batchsize:5 +| Model | Params | Config | Augmentation| Test set | Decode method | WER | +| --- | --- | --- | --- | --- | --- | --- | +| wav2vec2ASR | 324.49 M | conf/wav2vec2ASR.yaml | spec_aug | test-set | greedy search | 5.1009 | diff --git a/examples/aishell/asr3/conf/train_with_wav2vec.yaml b/examples/aishell/asr3/conf/train_with_wav2vec.yaml index 77b3762ef..273175d27 100755 --- a/examples/aishell/asr3/conf/train_with_wav2vec.yaml +++ b/examples/aishell/asr3/conf/train_with_wav2vec.yaml @@ -83,7 +83,7 @@ dnn_neurons: 1024 freeze_wav2vec: False dropout: 0.15 -tokenizer: !apply:transformers.BertTokenizer.from_pretrained +tokenizer: !apply:paddlenlp.transformers.AutoTokenizer.from_pretrained pretrained_model_name_or_path: bert-base-chinese # bert-base-chinese tokens length output_neurons: 21128 diff --git a/examples/aishell/asr3/conf/wav2vec2ASR.yaml b/examples/aishell/asr3/conf/wav2vec2ASR.yaml old mode 100755 new mode 100644 index cdb04f8c1..4a1274688 --- a/examples/aishell/asr3/conf/wav2vec2ASR.yaml +++ b/examples/aishell/asr3/conf/wav2vec2ASR.yaml @@ -107,6 +107,7 @@ vocab_filepath: data/lang_char/vocab.txt ########################################### unit_type: 'char' +tokenizer: bert-base-chinese mean_std_filepath: preprocess_config: conf/preprocess.yaml sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs @@ -139,12 +140,10 @@ n_epoch: 80 accum_grad: 1 global_grad_clip: 5.0 -model_optim: adadelta +model_optim: sgd model_optim_conf: lr: 1.0 weight_decay: 0.0 - rho: 0.95 - epsilon: 1.0e-8 wav2vec2_optim: adam wav2vec2_optim_conf: @@ -165,3 +164,4 @@ log_interval: 1 checkpoint: kbest_n: 50 latest_n: 5 + diff --git a/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml b/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml new file mode 100755 index 000000000..ec287f0c6 --- /dev/null +++ b/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml @@ -0,0 +1,168 @@ +############################################ +# Network Architecture # +############################################ +freeze_wav2vec2: False +normalize_wav: True 
+output_norm: True +init_type: 'kaiming_uniform' # !Warning: need to convergence +enc: + input_shape: 1024 + dnn_blocks: 3 + dnn_neurons: 1024 + activation: True + normalization: True + dropout_rate: [0.15, 0.15, 0.0] +ctc: + enc_n_units: 1024 + blank_id: 0 + dropout_rate: 0.0 + +audio_augment: + speeds: [90, 100, 110] + +spec_augment: + time_warp: True + time_warp_window: 5 + time_warp_mode: bicubic + freq_mask: True + n_freq_mask: 2 + time_mask: True + n_time_mask: 2 + replace_with_zero: False + freq_mask_width: 30 + time_mask_width: 40 +wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams + + +############################################ +# Wav2Vec2.0 # +############################################ +# vocab_size: 1000000 +hidden_size: 1024 +num_hidden_layers: 24 +num_attention_heads: 16 +intermediate_size: 4096 +hidden_act: gelu +hidden_dropout: 0.1 +activation_dropout: 0.0 +attention_dropout: 0.1 +feat_proj_dropout: 0.1 +feat_quantizer_dropout: 0.0 +final_dropout: 0.0 +layerdrop: 0.1 +initializer_range: 0.02 +layer_norm_eps: 1e-5 +feat_extract_norm: layer +feat_extract_activation: gelu +conv_dim: [512, 512, 512, 512, 512, 512, 512] +conv_stride: [5, 2, 2, 2, 2, 2, 2] +conv_kernel: [10, 3, 3, 3, 3, 2, 2] +conv_bias: True +num_conv_pos_embeddings: 128 +num_conv_pos_embedding_groups: 16 +do_stable_layer_norm: True +apply_spec_augment: False +mask_channel_length: 10 +mask_channel_min_space: 1 +mask_channel_other: 0.0 +mask_channel_prob: 0.0 +mask_channel_selection: static +mask_feature_length: 10 +mask_feature_min_masks: 0 +mask_feature_prob: 0.0 +mask_time_length: 10 +mask_time_min_masks: 2 +mask_time_min_space: 1 +mask_time_other: 0.0 +mask_time_prob: 0.075 +mask_time_selection: static +num_codevectors_per_group: 320 +num_codevector_groups: 2 +contrastive_logits_temperature: 0.1 +num_negatives: 100 +codevector_dim: 256 +proj_codevector_dim: 256 +diversity_loss_weight: 0.1 +use_weighted_layer_sum: False +# pad_token_id: 0 +# bos_token_id: 1 +# eos_token_id: 2 +add_adapter: False +adapter_kernel_size: 3 +adapter_stride: 2 +num_adapter_layers: 3 +output_hidden_size: None + +########################################### +# Data # +########################################### + +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +vocab_filepath: data/lang_char/vocab.txt + +########################################### +# Dataloader # +########################################### + +unit_type: 'char' +tokenizer: bert-base-chinese +mean_std_filepath: +preprocess_config: conf/preprocess.yaml +sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 5 # Different batch_size may cause large differences in results +maxlen_in: 51200000000 # if input length > maxlen-in batchsize is automatically reduced +maxlen_out: 1500000 # if output length > maxlen-out batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 6 +subsampling_factor: 1 +num_encs: 1 +dist_sampler: True +shortest_first: True +return_lens_rate: True + +########################################### +# use speechbrain dataloader # +########################################### +use_sb_pipeline: True # whether use speechbrain pipeline. Default is True. 
+sb_pipeline_conf: conf/train_with_wav2vec.yaml + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +global_grad_clip: 5.0 + +model_optim: adadelta +model_optim_conf: + lr: 1.0 + weight_decay: 0.0 + rho: 0.95 + epsilon: 1.0e-8 + +wav2vec2_optim: adam +wav2vec2_optim_conf: + lr: 0.0001 + weight_decay: 0.0 + +model_scheduler: newbobscheduler +model_scheduler_conf: + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +wav2vec2_scheduler: newbobscheduler +wav2vec2_scheduler_conf: + improvement_threshold: 0.0025 + annealing_factor: 0.9 + patient: 0 +log_interval: 1 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr3/local/aishell_prepare.py b/examples/aishell/asr3/local/aishell_prepare.py index a25735791..2a7ba5c6c 100644 --- a/examples/aishell/asr3/local/aishell_prepare.py +++ b/examples/aishell/asr3/local/aishell_prepare.py @@ -21,7 +21,7 @@ import glob import logging import os -from paddlespeech.s2t.models.wav2vec2.io.dataio import read_audio +from paddlespeech.s2t.io.speechbrain.dataio import read_audio logger = logging.getLogger(__name__) diff --git a/examples/aishell/asr3/local/data.sh b/examples/aishell/asr3/local/data.sh index 1a468f546..bd26c1e78 100755 --- a/examples/aishell/asr3/local/data.sh +++ b/examples/aishell/asr3/local/data.sh @@ -1,7 +1,7 @@ #!/bin/bash stage=-1 -stop_stage=-1 +stop_stage=3 dict_dir=data/lang_char . ${MAIN_ROOT}/utils/parse_options.sh || exit -1; diff --git a/examples/aishell/asr3/local/test.sh b/examples/aishell/asr3/local/test.sh index 9d4b84291..91e1c5457 100755 --- a/examples/aishell/asr3/local/test.sh +++ b/examples/aishell/asr3/local/test.sh @@ -8,9 +8,7 @@ echo "using $ngpu gpus..." expdir=exp datadir=data -train_set=train_960 -recog_set="test-clean test-other dev-clean dev-other" -recog_set="test-clean" +train_set=train config_path=$1 decode_config_path=$2 @@ -75,7 +73,7 @@ for type in ctc_prefix_beam_search; do --trans_hyp ${ckpt_prefix}.${type}.rsl.text python3 utils/compute-wer.py --char=1 --v=1 \ - data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error + data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." done diff --git a/examples/aishell/asr3/local/test_wav.sh b/examples/aishell/asr3/local/test_wav.sh index fdf3589f4..7ccef6945 100755 --- a/examples/aishell/asr3/local/test_wav.sh +++ b/examples/aishell/asr3/local/test_wav.sh @@ -14,7 +14,7 @@ ckpt_prefix=$3 audio_file=$4 mkdir -p data -wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ if [ $? -ne 0 ]; then exit 1 fi diff --git a/examples/aishell/asr3/run.sh b/examples/aishell/asr3/run.sh index 9b0a3c472..557ca0fcd 100755 --- a/examples/aishell/asr3/run.sh +++ b/examples/aishell/asr3/run.sh @@ -15,11 +15,11 @@ resume= # xx e.g. 30 export FLAGS_cudnn_deterministic=1 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -audio_file=data/demo_002_en.wav +audio_file=data/demo_01_03.wav avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') -echo "checkpoint name ${ckpt}"git revert -v +echo "checkpoint name ${ckpt}" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh index b5da076b2..8dcecaa03 100755 --- a/examples/aishell3/tts3/run.sh +++ b/examples/aishell3/tts3/run.sh @@ -43,10 +43,7 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # install paddle2onnx - version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') - if [[ -z "$version" || ${version} != '1.0.0' ]]; then - pip install paddle2onnx==1.0.0 - fi + pip install paddle2onnx --upgrade ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_aishell3 # considering the balance between speed and quality, we recommend that you use hifigan as vocoder ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3 diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh index 3a3dfe0a5..acfc50223 100755 --- a/examples/canton/tts3/run.sh +++ b/examples/canton/tts3/run.sh @@ -46,10 +46,7 @@ fi # we have only tested the following models so far if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # install paddle2onnx - version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') - if [[ -z "$version" || ${version} != '1.0.0' ]]; then - pip install paddle2onnx==1.0.0 - fi + pip install paddle2onnx --upgrade ../../csmsc/tts3/local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_canton # considering the balance between speed and quality, we recommend that you use hifigan as vocoder # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc diff --git a/examples/csmsc/tts2/run.sh b/examples/csmsc/tts2/run.sh index 6279ec579..5732ea3c7 100755 --- a/examples/csmsc/tts2/run.sh +++ b/examples/csmsc/tts2/run.sh @@ -45,10 +45,7 @@ fi # we have only tested the following models so far if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # install paddle2onnx - version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') - if [[ -z "$version" || ${version} != '1.0.0' ]]; then - pip install paddle2onnx==1.0.0 - fi + pip install paddle2onnx --upgrade ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc # considering the balance between speed and quality, we recommend that you use hifigan as vocoder ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index dd8c9f3e6..a7b4e4239 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -45,10 +45,7 @@ fi # we have only tested the following models so far if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # install paddle2onnx - version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') - if [[ -z "$version" || ${version} != '1.0.0' ]]; then - pip install paddle2onnx==1.0.0 - fi + pip install paddle2onnx --upgrade ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc # considering the balance between speed and quality, we recommend that you use hifigan as vocoder ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc diff --git a/examples/csmsc/tts3/run_cnndecoder.sh b/examples/csmsc/tts3/run_cnndecoder.sh index 96b446c52..f356f3133 100755 --- a/examples/csmsc/tts3/run_cnndecoder.sh +++ b/examples/csmsc/tts3/run_cnndecoder.sh @@ -58,10 +58,7 @@ fi # paddle2onnx non 
streaming if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # install paddle2onnx - version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') - if [[ -z "$version" || ${version} != '1.0.0' ]]; then - pip install paddle2onnx==1.0.0 - fi + pip install paddle2onnx --upgrade ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc # considering the balance between speed and quality, we recommend that you use hifigan as vocoder ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc @@ -77,10 +74,7 @@ fi # paddle2onnx streaming if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then # install paddle2onnx - version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') - if [[ -z "$version" || ${version} != '1.0.0' ]]; then - pip install paddle2onnx==1.0.0 - fi + pip install paddle2onnx --upgrade # streaming acoustic model ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_encoder_infer ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_decoder diff --git a/examples/csmsc/vits/local/paddle2onnx.sh b/examples/csmsc/vits/local/paddle2onnx.sh new file mode 120000 index 000000000..87c46634d --- /dev/null +++ b/examples/csmsc/vits/local/paddle2onnx.sh @@ -0,0 +1 @@ +../../tts3/local/paddle2onnx.sh \ No newline at end of file diff --git a/examples/csmsc/vits/run.sh b/examples/csmsc/vits/run.sh index ac190bfa8..03c59702b 100755 --- a/examples/csmsc/vits/run.sh +++ b/examples/csmsc/vits/run.sh @@ -39,3 +39,31 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ${add_blank}|| exit -1 fi + +# # not ready yet for operator missing in Paddle2ONNX +# # paddle2onnx, please make sure the static models are in ${train_output_path}/inference first +# # we have only tested the following models so far +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # install paddle2onnx +# pip install paddle2onnx --upgrade +# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx vits_csmsc +# fi + +# # inference with onnxruntime +# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then +# ./local/ort_predict.sh ${train_output_path} +# fi + +# # not ready yet for operator missing in Paddle-Lite +# # must run after stage 3 (which stage generated static models) +# if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then +# # NOTE by yuantian 2022.11.21: please compile develop version of Paddle-Lite to export and run TTS models, +# # cause TTS models are supported by https://github.com/PaddlePaddle/Paddle-Lite/pull/9587 +# # and https://github.com/PaddlePaddle/Paddle-Lite/pull/9706 +# ./local/export2lite.sh ${train_output_path} inference pdlite vits_csmsc x86 +# fi + +# if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then +# CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1 +# fi + diff --git a/examples/librispeech/asr3/local/data.sh b/examples/librispeech/asr3/local/data.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/local/test.sh b/examples/librispeech/asr3/local/test.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/local/test_wav.sh b/examples/librispeech/asr3/local/test_wav.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/local/train.sh b/examples/librispeech/asr3/local/train.sh old mode 100644 new mode 100755 diff --git 
a/examples/librispeech/asr3/run.sh b/examples/librispeech/asr3/run.sh index 05ad505c7..f52266a1a 100644 --- a/examples/librispeech/asr3/run.sh +++ b/examples/librispeech/asr3/run.sh @@ -6,7 +6,7 @@ set -e gpus=0 stage=0 -stop_stage=0 +stop_stage=4 conf_path=conf/wav2vec2ASR.yaml ips= #xx.xx.xx.xx,xx.xx.xx.xx decode_conf_path=conf/tuning/decode.yaml diff --git a/examples/ljspeech/tts3/run.sh b/examples/ljspeech/tts3/run.sh index aacd4cc03..0d8da920c 100755 --- a/examples/ljspeech/tts3/run.sh +++ b/examples/ljspeech/tts3/run.sh @@ -45,10 +45,7 @@ fi # we have only tested the following models so far if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # install paddle2onnx - version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') - if [[ -z "$version" || ${version} != '1.0.0' ]]; then - pip install paddle2onnx==1.0.0 - fi + pip install paddle2onnx --upgrade ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_ljspeech # considering the balance between speed and quality, we recommend that you use hifigan as vocoder ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_ljspeech diff --git a/examples/opencpop/README.md b/examples/opencpop/README.md new file mode 100644 index 000000000..5a574dc80 --- /dev/null +++ b/examples/opencpop/README.md @@ -0,0 +1,6 @@ + +# Opencpop + +* svs1 - DiffSinger +* voc1 - Parallel WaveGAN +* voc5 - HiFiGAN diff --git a/examples/opencpop/svs1/README.md b/examples/opencpop/svs1/README.md new file mode 100644 index 000000000..1600d0c76 --- /dev/null +++ b/examples/opencpop/svs1/README.md @@ -0,0 +1,276 @@ +([简体中文](./README_cn.md)|English) +# DiffSinger with Opencpop +This example contains code used to train a [DiffSinger](https://arxiv.org/abs/2105.02446) model with [Mandarin singing corpus](https://wenet.org.cn/opencpop/). + +## Dataset +### Download and Extract +Download Opencpop from it's [Official Website](https://wenet.org.cn/opencpop/download/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/Opencpop`. + +## Get Started +Assume the path to the dataset is `~/datasets/Opencpop`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - (Supporting) synthesize waveform from a text file. +5. (Supporting) inference using the static model. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── energy_stats.npy + ├── norm + ├── pitch_stats.npy + ├── raw + ├── speech_stats.npy + └── speech_stretchs.npy + +``` +The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech, pitch and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. 
`speech_stretchs.npy` contains the minimum and maximum values of each dimension of the mel spectrum, which is used for linear stretching before training/inference of the diffusion module. +Note: Since the training effect of non-norm features is due to norm, the features saved under `norm` are features that have not been normed. + + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains utterance id, speaker id, phones, text_lengths, speech_lengths, phone durations, the path of speech features, the path of pitch features, the path of energy features, note, note durations, slur. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--speech-stretchs SPEECH_STRETCHS] + +Train a FastSpeech2 model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG diffsinger config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu=0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. + --speaker-dict SPEAKER_DICT + speaker id map file for multiple speaker model. + --speech-stretchs SPEECH_STRETCHS + min amd max mel for stretching. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. +6. `--speech-stretchs` is the path of mel's min-max data file. + +### Synthesizing +We use parallel wavegan as the neural vocoder. +Download pretrained parallel wavegan model from [pwgan_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip) and unzip it. +```bash +unzip pwgan_opencpop_ckpt_1.4.0.zip +``` +Parallel WaveGAN checkpoint contains files listed below. +```text +pwgan_opencpop_ckpt_1.4.0.zip +├── default.yaml # default config used to train parallel wavegan +├── snapshot_iter_100000.pdz # model parameters of parallel wavegan +└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] + [--am {diffsinger_opencpop}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--voc {pwgan_opencpop}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + [--speech_stretchs SPEECH_STRETCHS] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} + Choose acoustic model type of tts task. + {diffsinger_opencpop} Choose acoustic model type of svs task. + --am_config AM_CONFIG + Config of acoustic model. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --voice-cloning VOICE_CLONING + whether training voice cloning model. + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} + Choose vocoder type of tts task. + {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task. + --voc_config VOC_CONFIG + Config of voc. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. + --speech-stretchs SPEECH_STRETCHS + The min and max values of the mel spectrum, using on diffusion of diffsinger. +``` + +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. +`local/pinyin_to_phone.txt` comes from the readme of the opencpop dataset, indicating the mapping from pinyin to phonemes in opencpop. + +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + [--pinyin_phone PINYIN_PHONE] + [--speech_stretchs SPEECH_STRETCHS] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} + Choose acoustic model type of tts task. + {diffsinger_opencpop} Choose acoustic model type of svs task. 
+ --am_config AM_CONFIG + Config of acoustic model. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --spk_id SPK_ID spk id for multi speaker acoustic model + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} + Choose vocoder type of tts task. + {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task. + --voc_config VOC_CONFIG + Config of voc. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --lang LANG {zh, en, mix, canton} Choose language type of tts task. + {sing} Choose language type of svs task. + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize file, a 'utt_id sentence' pair per line for tts task. + A '{ utt_id input_type (is word) text notes note_durs}' or '{utt_id input_type (is phoneme) phones notes note_durs is_slurs}' pair per line for svs task. + --output_dir OUTPUT_DIR + output dir. + --pinyin_phone PINYIN_PHONE + pinyin to phone map file, using on sing_frontend. + --speech_stretchs SPEECH_STRETCHS + The min and max values of the mel spectrum, using on diffusion of diffsinger. +``` +1. `--am` is acoustic model type with the format {model_name}_{dataset} +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the diffsinger pretrained model. +3. `--voc` is vocoder type with the format {model_name}_{dataset} +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +5. `--lang` is language. `zh`, `en`, `mix` and `canton` for tts task. `sing` for tts task. +6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +7. `--text` is the text file, which contains sentences to synthesize. +8. `--output_dir` is the directory to save synthesized audio files. +9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +10. `--inference_dir` is the directory to save static models. If this line is not added, it will not be generated and saved as a static model. +11. `--pinyin_phone` pinyin to phone map file, using on sing_frontend. +12. `--speech_stretchs` The min and max values of the mel spectrum, using on diffusion of diffsinger. + +Note: At present, the diffsinger model does not support dynamic to static, so do not add `--inference_dir`. + + +## Pretrained Model +Pretrained DiffSinger model: +- [diffsinger_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/diffsinger_opencpop_ckpt_1.4.0.zip) + +DiffSinger checkpoint contains files listed below. 
+```text +diffsinger_opencpop_ckpt_1.4.0.zip +├── default.yaml # default config used to train diffsinger +├── energy_stats.npy # statistics used to normalize energy when training diffsinger if norm is needed +├── phone_id_map.txt # phone vocabulary file when training diffsinger +├── pinyin_to_phone.txt # pinyin-to-phoneme mapping file when training diffsinger +├── pitch_stats.npy # statistics used to normalize pitch when training diffsinger if norm is needed +├── snapshot_iter_160000.pdz # model parameters of diffsinger +├── speech_stats.npy # statistics used to normalize mel when training diffsinger if norm is needed +└── speech_stretchs.npy # min and max values to use for mel spectral stretching before training diffusion + +``` + +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_sing.txt` using pretrained diffsinger and parallel wavegan models. + +```bash +source path.sh + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=diffsinger_opencpop \ + --am_config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \ + --am_ckpt=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \ + --am_stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \ + --voc=pwgan_opencpop \ + --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \ + --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \ + --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \ + --lang=sing \ + --text=${BIN_DIR}/../sentences_sing.txt \ + --output_dir=exp/default/test_e2e \ + --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \ + --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \ + --speech_stretchs=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy + +``` diff --git a/examples/opencpop/svs1/README_cn.md b/examples/opencpop/svs1/README_cn.md new file mode 100644 index 000000000..1435b42ec --- /dev/null +++ b/examples/opencpop/svs1/README_cn.md @@ -0,0 +1,280 @@ +(简体中文|[English](./README.md)) +# 用 Opencpop 数据集训练 DiffSinger 模型 + +本用例包含用于训练 [DiffSinger](https://arxiv.org/abs/2105.02446) 模型的代码,使用 [Mandarin singing corpus](https://wenet.org.cn/opencpop/) 数据集。 + +## 数据集 +### 下载并解压 +从 [官方网站](https://wenet.org.cn/opencpop/download/) 下载数据集 + +## 开始 +假设数据集的路径是 `~/datasets/Opencpop`. +运行下面的命令会进行如下操作: + +1. **设置原路径**。 +2. 对数据集进行预处理。 +3. 训练模型 +4. 合成波形 + - 从 `metadata.jsonl` 合成波形。 + - (支持中)从文本文件合成波形。 +5. 
(支持中)使用静态模型进行推理。 +```bash +./run.sh +``` +您可以选择要运行的一系列阶段,或者将 `stage` 设置为 `stop-stage` 以仅使用一个阶段,例如,运行以下命令只会预处理数据集。 +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### 数据预处理 +```bash +./local/preprocess.sh ${conf_path} +``` +当它完成时。将在当前目录中创建 `dump` 文件夹。转储文件夹的结构如下所示。 + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── energy_stats.npy + ├── norm + ├── pitch_stats.npy + ├── raw + ├── speech_stats.npy + └── speech_stretchs.npy +``` + +数据集分为三个部分,即 `train` 、 `dev` 和 `test` ,每个部分都包含一个 `norm` 和 `raw` 子文件夹。原始文件夹包含每个话语的语音、音调和能量特征,而 `norm` 文件夹包含规范化的特征。用于规范化特征的统计数据是从 `dump/train/*_stats.npy` 中的训练集计算出来的。`speech_stretchs.npy` 中包含 mel谱每个维度上的最小值和最大值,用于 diffusion 模块训练/推理前的线性拉伸。 +注意:由于非 norm 特征训练效果由于 norm,因此 `norm` 下保存的特征是未经过 norm 的特征。 + + +此外,还有一个 `metadata.jsonl` 在每个子文件夹中。它是一个类似表格的文件,包含话语id,音色id,音素、文本长度、语音长度、音素持续时间、语音特征路径、音调特征路径、能量特征路径、音调,音调持续时间,是否为转音。 + +### 模型训练 +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` 调用 `${BIN_DIR}/train.py` 。 +以下是完整的帮助信息。 + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--speech-stretchs SPEECH_STRETCHS] + +Train a DiffSinger model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG diffsinger config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu=0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. + --speaker-dict SPEAKER_DICT + speaker id map file for multiple speaker model. + --speech-stretchs SPEECH_STRETCHS + min amd max mel for stretching. +``` +1. `--config` 是一个 yaml 格式的配置文件,用于覆盖默认配置,位于 `conf/default.yaml`. +2. `--train-metadata` 和 `--dev-metadata` 应为 `dump` 文件夹中 `train` 和 `dev` 下的规范化元数据文件 +3. `--output-dir` 是保存结果的目录。 检查点保存在此目录中的 `checkpoints/` 目录下。 +4. `--ngpu` 要使用的 GPU 数,如果 ngpu==0,则使用 cpu 。 +5. `--phones-dict` 是音素词汇表文件的路径。 +6. 
`--speech-stretchs` mel的最小最大值数据的文件路径。 + +### 合成 +我们使用 parallel opencpop 作为神经声码器(vocoder)。 +从 [pwgan_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip) 下载预训练的 parallel wavegan 模型并将其解压。 + +```bash +unzip pwgan_opencpop_ckpt_1.4.0.zip +``` +Parallel WaveGAN 检查点包含如下文件。 +```text +pwgan_opencpop_ckpt_1.4.0.zip +├── default.yaml # 用于训练 parallel wavegan 的默认配置 +├── snapshot_iter_100000.pdz # parallel wavegan 的模型参数 +└── feats_stats.npy # 训练平行波形时用于规范化谱图的统计数据 +``` +`./local/synthesize.sh` 调用 `${BIN_DIR}/../synthesize.py` 即可从 `metadata.jsonl`中合成波形。 + +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] + [--am {diffsinger_opencpop}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--voc {pwgan_opencpop}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + [--speech_stretchs SPEECH_STRETCHS] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} + Choose acoustic model type of tts task. + {diffsinger_opencpop} Choose acoustic model type of svs task. + --am_config AM_CONFIG + Config of acoustic model. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --voice-cloning VOICE_CLONING + whether training voice cloning model. + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} + Choose vocoder type of tts task. + {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task. + --voc_config VOC_CONFIG + Config of voc. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. + --speech-stretchs SPEECH_STRETCHS + The min and max values of the mel spectrum, using on diffusion of diffsinger. 
+``` + +`./local/synthesize_e2e.sh` 调用 `${BIN_DIR}/../synthesize_e2e.py`,即可从文本文件中合成波形。 +`local/pinyin_to_phone.txt`来源于opencpop数据集中的README,表示opencpop中拼音到音素的映射。 + +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + [--pinyin_phone PINYIN_PHONE] + [--speech_stretchs SPEECH_STRETCHS] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} + Choose acoustic model type of tts task. + {diffsinger_opencpop} Choose acoustic model type of svs task. + --am_config AM_CONFIG + Config of acoustic model. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --spk_id SPK_ID spk id for multi speaker acoustic model + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} + Choose vocoder type of tts task. + {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task. + --voc_config VOC_CONFIG + Config of voc. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --lang LANG {zh, en, mix, canton} Choose language type of tts task. + {sing} Choose language type of svs task. + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize file, a 'utt_id sentence' pair per line for tts task. + A '{ utt_id input_type (is word) text notes note_durs}' or '{utt_id input_type (is phoneme) phones notes note_durs is_slurs}' pair per line for svs task. + --output_dir OUTPUT_DIR + output dir. + --pinyin_phone PINYIN_PHONE + pinyin to phone map file, using on sing_frontend. + --speech_stretchs SPEECH_STRETCHS + The min and max values of the mel spectrum, using on diffusion of diffsinger. +``` +1. `--am` 声学模型格式是否符合 {model_name}_{dataset} +2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 diffsinger 预训练模型中的 4 个文件。 +3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset} +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 +5. `--lang` tts对应模型的语言可以是 `zh`、`en`、`mix`和`canton`。 svs 对应的语言是 `sing` 。 +6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件、 +7. `--text` 是文本文件,其中包含要合成的句子。 +8. `--output_dir` 是保存合成音频文件的目录。 +9. `--ngpu` 要使用的GPU数,如果 ngpu==0,则使用 cpu。 +10. 
`--inference_dir` 静态模型保存的目录。如果不加这一行,就不会生并保存成静态模型。 +11. `--pinyin_phone` 拼音到音素的映射文件。 +12. `--speech_stretchs` mel谱的最大最小值用于diffsinger中diffusion之前的线性拉伸。 + +注意: 目前 diffsinger 模型还不支持动转静,所以不要加 `--inference_dir`。 + + +## 预训练模型 +预先训练的 DiffSinger 模型: +- [diffsinger_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/diffsinger_opencpop_ckpt_1.4.0.zip) + + +DiffSinger 检查点包含下列文件。 +```text +diffsinger_opencpop_ckpt_1.4.0.zip +├── default.yaml # 用于训练 diffsinger 的默认配置 +├── energy_stats.npy # 训练 diffsinger 时如若需要 norm energy 会使用到的统计数据 +├── phone_id_map.txt # 训练 diffsinger 时的音素词汇文件 +├── pinyin_to_phone.txt # 训练 diffsinger 时的拼音到音素映射文件 +├── pitch_stats.npy # 训练 diffsinger 时如若需要 norm pitch 会使用到的统计数据 +├── snapshot_iter_160000.pdz # 模型参数和优化器状态 +├── speech_stats.npy # 训练 diffsinger 时用于规范化频谱图的统计数据 +└── speech_stretchs.npy # 训练 diffusion 前用于 mel 谱拉伸的最小及最大值 + +``` +您可以使用以下脚本通过使用预训练的 diffsinger 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences_sing.txt` 合成句子 +```bash +source path.sh + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=diffsinger_opencpop \ + --am_config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \ + --am_ckpt=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \ + --am_stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \ + --voc=pwgan_opencpop \ + --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \ + --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \ + --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \ + --lang=sing \ + --text=${BIN_DIR}/../sentences_sing.txt \ + --output_dir=exp/default/test_e2e \ + --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \ + --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \ + --speech_stretchs=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy + +``` diff --git a/examples/opencpop/svs1/conf/default.yaml b/examples/opencpop/svs1/conf/default.yaml new file mode 100644 index 000000000..5d8060630 --- /dev/null +++ b/examples/opencpop/svs1/conf/default.yaml @@ -0,0 +1,159 @@ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 512 # FFT size (samples). +n_shift: 128 # Hop size (samples). 12.5ms +win_length: 512 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 30 # Minimum frequency of Mel basis. +fmax: 12000 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +# Only used for the model using pitch features (e.g. FastSpeech2) +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 750 # Maximum f0 for pitch extraction. 
+ + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 48 # batch size +num_workers: 1 # number of gpu + + +########################################################### +# MODEL SETTING # +########################################################### +model: + # music score related + note_num: 300 # number of note + is_slur_num: 2 # number of slur + # fastspeech2 module options + use_energy_pred: False # whether use energy predictor + use_postnet: False # whether use postnet + + # fastspeech2 module + fastspeech2_params: + adim: 256 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1024 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1024 # number of decoder ff units + positionwise_layer_type: conv1d-linear # type of position-wise layer + positionwise_conv_kernel_size: 9 # kernel size of position wise conv layer + transformer_enc_dropout_rate: 0.1 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.1 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.0 # dropout rate for transformer encoder attention layer + transformer_activation_type: "gelu" # Activation function type in transformer. + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + init_type: xavier_uniform # initialization type + init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding + init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding + use_scaled_pos_enc: True # whether to use scaled positional encoding + transformer_dec_dropout_rate: 0.1 # dropout rate for transformer decoder layer + transformer_dec_positional_dropout_rate: 0.1 # dropout rate for transformer decoder positional encoding + transformer_dec_attn_dropout_rate: 0.0 # dropout rate for transformer decoder attention layer + duration_predictor_layers: 5 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + duration_predictor_dropout_rate: 0.5 # dropout rate in energy predictor + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: False # whether 
to stop the gradient from energy predictor to encoder + postnet_layers: 5 # number of layers of postnet + postnet_filts: 5 # filter size of conv layers in postnet + postnet_chans: 256 # number of channels of conv layers in postnet + postnet_dropout_rate: 0.5 # dropout rate for postnet + + # denoiser module + denoiser_params: + in_channels: 80 # Number of channels of the input mel-spectrogram + out_channels: 80 # Number of channels of the output mel-spectrogram + kernel_size: 3 # Kernel size of the residual blocks inside + layers: 20 # Number of residual blocks inside + stacks: 5 # The number of groups to split the residual blocks into + residual_channels: 256 # Residual channel of the residual blocks + gate_channels: 512 # Gate channel of the residual blocks + skip_channels: 256 # Skip channel of the residual blocks + aux_channels: 256 # Auxiliary channel of the residual blocks + dropout: 0.1 # Dropout of the residual blocks + bias: True # Whether to use bias in residual blocks + use_weight_norm: False # Whether to use weight norm in all convolutions + init_type: "kaiming_normal" # Type of initialize weights of a neural network module + + + diffusion_params: + num_train_timesteps: 100 # The number of timesteps between the noise and the real during training + beta_start: 0.0001 # beta start parameter for the scheduler + beta_end: 0.06 # beta end parameter for the scheduler + beta_schedule: "linear" # beta schedule parameter for the scheduler + num_max_timesteps: 100 # The max timestep transition from real to noise + stretch: True # whether to stretch before diffusion + + +########################################################### +# UPDATER SETTING # +########################################################### +fs2_updater: + use_masking: True # whether to apply masking for padded part in loss calculation + +ds_updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +# fastspeech2 optimizer +fs2_optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +# diffusion optimizer +ds_optimizer_params: + beta1: 0.9 + beta2: 0.98 + weight_decay: 0.0 + +ds_scheduler_params: + learning_rate: 0.001 + gamma: 0.5 + step_size: 50000 +ds_grad_norm: 1 + + +########################################################### +# INTERVAL SETTING # +########################################################### +only_train_diffusion: True # Whether to freeze fastspeech2 parameters when training diffusion +ds_train_start_steps: 160000 # Number of steps to start to train diffusion module. +train_max_steps: 320000 # Number of training steps. +save_interval_steps: 2000 # Interval steps to save checkpoint. +eval_interval_steps: 2000 # Interval steps to evaluate the network. 
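The interval settings above describe a two-stage schedule: the FastSpeech2-style backbone is trained first, and from `ds_train_start_steps` onward the diffusion denoiser is trained, with the backbone frozen when `only_train_diffusion` is true. Below is a minimal sketch of that decision logic only; it is illustrative and is not the actual PaddleSpeech updater code:

```python
# Illustrative sketch of the training schedule implied by the settings above.
only_train_diffusion = True
ds_train_start_steps = 160000
train_max_steps = 320000

def modules_updated_at(step: int) -> list:
    """Return which sub-modules receive gradient updates at a given global step."""
    if step >= train_max_steps:
        return []                       # training has finished
    if step < ds_train_start_steps:
        return ["fastspeech2"]          # stage 1: train the FastSpeech2-style backbone
    if only_train_diffusion:
        return ["diffusion"]            # stage 2: backbone frozen, train the denoiser only
    return ["fastspeech2", "diffusion"]

print(modules_updated_at(10000))    # ['fastspeech2']
print(modules_updated_at(200000))   # ['diffusion']
```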
+num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/opencpop/svs1/local/pinyin_to_phone.txt b/examples/opencpop/svs1/local/pinyin_to_phone.txt new file mode 100644 index 000000000..34ed079d7 --- /dev/null +++ b/examples/opencpop/svs1/local/pinyin_to_phone.txt @@ -0,0 +1,418 @@ +a|a +ai|ai +an|an +ang|ang +ao|ao +ba|b a +bai|b ai +ban|b an +bang|b ang +bao|b ao +bei|b ei +ben|b en +beng|b eng +bi|b i +bian|b ian +biao|b iao +bie|b ie +bin|b in +bing|b ing +bo|b o +bu|b u +ca|c a +cai|c ai +can|c an +cang|c ang +cao|c ao +ce|c e +cei|c ei +cen|c en +ceng|c eng +cha|ch a +chai|ch ai +chan|ch an +chang|ch ang +chao|ch ao +che|ch e +chen|ch en +cheng|ch eng +chi|ch i +chong|ch ong +chou|ch ou +chu|ch u +chua|ch ua +chuai|ch uai +chuan|ch uan +chuang|ch uang +chui|ch ui +chun|ch un +chuo|ch uo +ci|c i +cong|c ong +cou|c ou +cu|c u +cuan|c uan +cui|c ui +cun|c un +cuo|c uo +da|d a +dai|d ai +dan|d an +dang|d ang +dao|d ao +de|d e +dei|d ei +den|d en +deng|d eng +di|d i +dia|d ia +dian|d ian +diao|d iao +die|d ie +ding|d ing +diu|d iu +dong|d ong +dou|d ou +du|d u +duan|d uan +dui|d ui +dun|d un +duo|d uo +e|e +ei|ei +en|en +eng|eng +er|er +fa|f a +fan|f an +fang|f ang +fei|f ei +fen|f en +feng|f eng +fo|f o +fou|f ou +fu|f u +ga|g a +gai|g ai +gan|g an +gang|g ang +gao|g ao +ge|g e +gei|g ei +gen|g en +geng|g eng +gong|g ong +gou|g ou +gu|g u +gua|g ua +guai|g uai +guan|g uan +guang|g uang +gui|g ui +gun|g un +guo|g uo +ha|h a +hai|h ai +han|h an +hang|h ang +hao|h ao +he|h e +hei|h ei +hen|h en +heng|h eng +hm|h m +hng|h ng +hong|h ong +hou|h ou +hu|h u +hua|h ua +huai|h uai +huan|h uan +huang|h uang +hui|h ui +hun|h un +huo|h uo +ji|j i +jia|j ia +jian|j ian +jiang|j iang +jiao|j iao +jie|j ie +jin|j in +jing|j ing +jiong|j iong +jiu|j iu +ju|j v +juan|j van +jue|j ve +jun|j vn +ka|k a +kai|k ai +kan|k an +kang|k ang +kao|k ao +ke|k e +kei|k ei +ken|k en +keng|k eng +kong|k ong +kou|k ou +ku|k u +kua|k ua +kuai|k uai +kuan|k uan +kuang|k uang +kui|k ui +kun|k un +kuo|k uo +la|l a +lai|l ai +lan|l an +lang|l ang +lao|l ao +le|l e +lei|l ei +leng|l eng +li|l i +lia|l ia +lian|l ian +liang|l iang +liao|l iao +lie|l ie +lin|l in +ling|l ing +liu|l iu +lo|l o +long|l ong +lou|l ou +lu|l u +luan|l uan +lun|l un +luo|l uo +lv|l v +lve|l ve +m|m +ma|m a +mai|m ai +man|m an +mang|m ang +mao|m ao +me|m e +mei|m ei +men|m en +meng|m eng +mi|m i +mian|m ian +miao|m iao +mie|m ie +min|m in +ming|m ing +miu|m iu +mo|m o +mou|m ou +mu|m u +n|n +na|n a +nai|n ai +nan|n an +nang|n ang +nao|n ao +ne|n e +nei|n ei +nen|n en +neng|n eng +ng|n g +ni|n i +nian|n ian +niang|n iang +niao|n iao +nie|n ie +nin|n in +ning|n ing +niu|n iu +nong|n ong +nou|n ou +nu|n u +nuan|n uan +nun|n un +nuo|n uo +nv|n v +nve|n ve +o|o +ou|ou +pa|p a +pai|p ai +pan|p an +pang|p ang +pao|p ao +pei|p ei +pen|p en +peng|p eng +pi|p i +pian|p ian +piao|p iao +pie|p ie +pin|p in +ping|p ing +po|p o +pou|p ou +pu|p u +qi|q i +qia|q ia +qian|q ian +qiang|q iang +qiao|q iao +qie|q ie +qin|q in +qing|q ing +qiong|q iong +qiu|q iu +qu|q v +quan|q van +que|q ve +qun|q vn +ran|r an +rang|r ang +rao|r ao +re|r e +ren|r en +reng|r eng +ri|r i +rong|r ong +rou|r ou +ru|r u +rua|r ua +ruan|r uan +rui|r ui +run|r un +ruo|r uo +sa|s a +sai|s ai +san|s an +sang|s ang +sao|s ao +se|s e +sen|s en +seng|s eng +sha|sh a +shai|sh ai +shan|sh an +shang|sh ang +shao|sh ao +she|sh e +shei|sh ei +shen|sh en +sheng|sh 
eng +shi|sh i +shou|sh ou +shu|sh u +shua|sh ua +shuai|sh uai +shuan|sh uan +shuang|sh uang +shui|sh ui +shun|sh un +shuo|sh uo +si|s i +song|s ong +sou|s ou +su|s u +suan|s uan +sui|s ui +sun|s un +suo|s uo +ta|t a +tai|t ai +tan|t an +tang|t ang +tao|t ao +te|t e +tei|t ei +teng|t eng +ti|t i +tian|t ian +tiao|t iao +tie|t ie +ting|t ing +tong|t ong +tou|t ou +tu|t u +tuan|t uan +tui|t ui +tun|t un +tuo|t uo +wa|w a +wai|w ai +wan|w an +wang|w ang +wei|w ei +wen|w en +weng|w eng +wo|w o +wu|w u +xi|x i +xia|x ia +xian|x ian +xiang|x iang +xiao|x iao +xie|x ie +xin|x in +xing|x ing +xiong|x iong +xiu|x iu +xu|x v +xuan|x van +xue|x ve +xun|x vn +ya|y a +yan|y an +yang|y ang +yao|y ao +ye|y e +yi|y i +yin|y in +ying|y ing +yo|y o +yong|y ong +you|y ou +yu|y v +yuan|y van +yue|y ve +yun|y vn +za|z a +zai|z ai +zan|z an +zang|z ang +zao|z ao +ze|z e +zei|z ei +zen|z en +zeng|z eng +zha|zh a +zhai|zh ai +zhan|zh an +zhang|zh ang +zhao|zh ao +zhe|zh e +zhei|zh ei +zhen|zh en +zheng|zh eng +zhi|zh i +zhong|zh ong +zhou|zh ou +zhu|zh u +zhua|zh ua +zhuai|zh uai +zhuan|zh uan +zhuang|zh uang +zhui|zh ui +zhun|zh un +zhuo|zh uo +zi|z i +zong|z ong +zou|z ou +zu|z u +zuan|z uan +zui|z ui +zun|z un +zuo|z uo \ No newline at end of file diff --git a/examples/opencpop/svs1/local/preprocess.sh b/examples/opencpop/svs1/local/preprocess.sh new file mode 100755 index 000000000..26fd44689 --- /dev/null +++ b/examples/opencpop/svs1/local/preprocess.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/preprocess.py \ + --dataset=opencpop \ + --rootdir=~/datasets/Opencpop/segments \ + --dumpdir=dump \ + --label-file=~/datasets/Opencpop/segments/transcriptions.txt \ + --config=${config_path} \ + --num-cpu=20 \ + --cut-sil=True +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="speech" + + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="pitch" + + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="energy" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize and covert phone/speaker to id, dev and test should use train's stats + echo "Normalize ..." 
+ python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --pitch-stats=dump/train/pitch_stats.npy \ + --energy-stats=dump/train/energy_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --pitch-stats=dump/train/pitch_stats.npy \ + --energy-stats=dump/train/energy_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --pitch-stats=dump/train/pitch_stats.npy \ + --energy-stats=dump/train/energy_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # Get feature(mel) extremum for diffusion stretch + echo "Get feature(mel) extremum ..." + python3 ${BIN_DIR}/get_minmax.py \ + --metadata=dump/train/norm/metadata.jsonl \ + --speech-stretchs=dump/train/speech_stretchs.npy +fi diff --git a/examples/opencpop/svs1/local/synthesize.sh b/examples/opencpop/svs1/local/synthesize.sh new file mode 100755 index 000000000..1159e0074 --- /dev/null +++ b/examples/opencpop/svs1/local/synthesize.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=diffsinger_opencpop \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_opencpop \ + --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \ + --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \ + --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --speech_stretchs=dump/train/speech_stretchs.npy +fi + diff --git a/examples/opencpop/svs1/local/synthesize_e2e.sh b/examples/opencpop/svs1/local/synthesize_e2e.sh new file mode 100755 index 000000000..b3dc29b11 --- /dev/null +++ b/examples/opencpop/svs1/local/synthesize_e2e.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=diffsinger_opencpop \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_opencpop \ + --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \ + --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \ + --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \ + --lang=sing \ + --text=${BIN_DIR}/../sentences_sing.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --speech_stretchs=dump/train/speech_stretchs.npy \ + --pinyin_phone=local/pinyin_to_phone.txt +fi + +# for more GAN Vocoders +# hifigan +if [ ${stage} 
-le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "in hifigan syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=diffsinger_opencpop \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_opencpop \ + --voc_config=hifigan_opencpop_ckpt_1.4.0/default.yaml \ + --voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \ + --voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \ + --lang=sing \ + --text=${BIN_DIR}/../sentences_sing.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --speech_stretchs=dump/train/speech_stretchs.npy \ + --pinyin_phone=local/pinyin_to_phone.txt + +fi diff --git a/examples/opencpop/svs1/local/train.sh b/examples/opencpop/svs1/local/train.sh new file mode 100755 index 000000000..5be624fc4 --- /dev/null +++ b/examples/opencpop/svs1/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 \ + --phones-dict=dump/phone_id_map.txt \ + --speech-stretchs=dump/train/speech_stretchs.npy diff --git a/examples/opencpop/svs1/path.sh b/examples/opencpop/svs1/path.sh new file mode 100755 index 000000000..8bda5dce6 --- /dev/null +++ b/examples/opencpop/svs1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=diffsinger +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/opencpop/svs1/run.sh b/examples/opencpop/svs1/run.sh new file mode 100755 index 000000000..bfe5b6594 --- /dev/null +++ b/examples/opencpop/svs1/run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_320000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan by default + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/opencpop/voc1/README.md b/examples/opencpop/voc1/README.md new file mode 100644 index 000000000..37570a648 --- /dev/null +++ b/examples/opencpop/voc1/README.md @@ -0,0 +1,139 @@ +# Parallel WaveGAN with Opencpop +This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [Mandarin singing corpus](https://wenet.org.cn/opencpop/). + +## Dataset +### Download and Extract +Download Opencpop from it's [Official Website](https://wenet.org.cn/opencpop/download/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/Opencpop`. + +## Get Started +Assume the path to the dataset is `~/datasets/Opencpop`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] + +Train a ParallelWaveGAN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG ParallelWaveGAN config file. + --train-metadata TRAIN_METADATA + training data. 
+ --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + +benchmark: + arguments related to benchmark. + + --batch-size BATCH_SIZE + batch size. + --max-iter MAX_ITER train max steps. + --run-benchmark RUN_BENCHMARK + runing benchmark or not, if True, use the --batch-size + and --max-iter. + --profiler_options PROFILER_OPTIONS + The option of profiler, which should be in format + "key1=value1;key2=value2;key3=value3". +``` + +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesizing +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] + [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] + +Synthesize with GANVocoder. + +optional arguments: + -h, --help show this help message and exit + --generator-type GENERATOR_TYPE + type of GANVocoder, should in {pwgan, mb_melgan, + style_melgan, } now + --config CONFIG GANVocoder config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + +1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Models +The pretrained model can be downloaded here: +- [pwgan_opencpop_ckpt_1.4.0](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip) + + +Parallel WaveGAN checkpoint contains files listed below. + +```text +pwgan_opencpop_ckpt_1.4.0 +├── default.yaml # default config used to train parallel wavegan +├── snapshot_iter_100000.pdz # generator parameters of parallel wavegan +└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/opencpop/voc1/conf/default.yaml b/examples/opencpop/voc1/conf/default.yaml new file mode 100644 index 000000000..ee99719dc --- /dev/null +++ b/examples/opencpop/voc1/conf/default.yaml @@ -0,0 +1,119 @@ +# This is the hyperparameter configuration file for Parallel WaveGAN. +# Please make sure this is adjusted for the CSMSC dataset. If you want to +# apply to the other dataset, you might need to carefully change some parameters. +# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN. 
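As described in the recipe above, `feats_stats.npy` holds the training-set statistics used to z-score the mel spectrograms. The sketch below shows how such a statistics file is typically applied, assuming it stores a `(2, n_mels)` array of per-bin mean and standard deviation (check the actual file if in doubt); the paths are placeholders for your own `dump/` layout:

```python
import numpy as np

# Placeholder path for illustration; adjust to your own dump/ directory.
stats = np.load("dump/train/feats_stats.npy")   # assumed shape: (2, n_mels)
mean, std = stats[0], stats[1]

def normalize(mel: np.ndarray) -> np.ndarray:
    """Z-score a (frames, n_mels) log-mel spectrogram with training-set statistics."""
    return (mel - mean) / std

def denormalize(mel_norm: np.ndarray) -> np.ndarray:
    """Invert the normalization, e.g. before inspecting or plotting the features."""
    return mel_norm * std + mean
```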
+ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 512 # FFT size (samples). +n_shift: 128 # Hop size (samples). 12.5ms +win_length: 512 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 30 # Minimum freq in mel basis calculation. (Hz) +fmax: 12000 # Maximum frequency in mel basis calculation. (Hz) + + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Kernel size of dilated convolution. + layers: 30 # Number of residual block layers. + stacks: 3 # Number of stacks i.e., dilation cycles. + residual_channels: 64 # Number of channels in residual conv. + gate_channels: 128 # Number of channels in gated conv. + skip_channels: 64 # Number of channels in skip conv. + aux_channels: 80 # Number of channels for auxiliary feature conv. + # Must be the same as num_mels. + aux_context_window: 2 # Context window size for auxiliary feature. + # If set to 2, previous 2 and future 2 frames will be considered. + dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. + bias: True # use bias in residual blocks + use_weight_norm: True # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + use_causal_conv: False # use causal conv in residual blocks and upsample layers + upsample_scales: [8, 4, 2, 2] # Upsampling scales. Prodcut of these must be the same as hop size. + interpolate_mode: "nearest" # upsample net interpolate mode + freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis + nonlinear_activation: null + nonlinear_activation_params: {} + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Number of output channels. + layers: 10 # Number of conv layers. + conv_channels: 64 # Number of chnn layers. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. + nonlinear_activation_params: # Nonlinear function parameters + negative_slope: 0.2 # Alpha in leakyrelu. + +########################################################### +# STFT LOSS SETTING # +########################################################### +stft_loss_params: + fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. + hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss + win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_adv: 4.0 # Loss balancing coefficient. 
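As the comment on `upsample_scales` notes, the generator's upsampling factors must multiply out to the hop size used for feature extraction (`n_shift`). A quick standalone check of that constraint for this config:

```python
import math

n_shift = 128                   # hop size from the FEATURE EXTRACTION SETTING above
upsample_scales = [8, 4, 2, 2]  # from generator_params

assert math.prod(upsample_scales) == n_shift, (
    f"product of upsample_scales must equal the hop size, "
    f"got {math.prod(upsample_scales)} vs {n_shift}"
)
print("upsample_scales are consistent with n_shift")
```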
+ +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 8 # Batch size. +batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by n_shift. +num_workers: 1 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-6 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 0.0001 # Generator's learning rate. + step_size: 200000 # Generator's scheduler step size. + gamma: 0.5 # Generator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +generator_grad_norm: 10 # Generator's gradient norm. +discriminator_optimizer_params: + epsilon: 1.0e-6 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 0.00005 # Discriminator's learning rate. + step_size: 200000 # Discriminator's scheduler step size. + gamma: 0.5 # Discriminator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +discriminator_grad_norm: 1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. +train_max_steps: 400000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/opencpop/voc1/local/PTQ_static.sh b/examples/opencpop/voc1/local/PTQ_static.sh new file mode 120000 index 000000000..247ce5c74 --- /dev/null +++ b/examples/opencpop/voc1/local/PTQ_static.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/PTQ_static.sh \ No newline at end of file diff --git a/examples/opencpop/voc1/local/dygraph_to_static.sh b/examples/opencpop/voc1/local/dygraph_to_static.sh new file mode 100755 index 000000000..40a2c51ba --- /dev/null +++ b/examples/opencpop/voc1/local/dygraph_to_static.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../../dygraph_to_static.py \ + --type=voc \ + --voc=pwgan_opencpop \ + --voc_config=${config_path} \ + --voc_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --voc_stat=dump/train/feats_stats.npy \ + --inference_dir=exp/default/inference/ diff --git a/examples/opencpop/voc1/local/preprocess.sh b/examples/opencpop/voc1/local/preprocess.sh new file mode 100755 index 000000000..edab4d0d5 --- /dev/null +++ b/examples/opencpop/voc1/local/preprocess.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." 
+ python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/Opencpop/segments/ \ + --dataset=opencpop \ + --dumpdir=dump \ + --dur-file=~/datasets/Opencpop/segments/transcriptions.txt \ + --config=${config_path} \ + --cut-sil=False \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/opencpop/voc1/local/synthesize.sh b/examples/opencpop/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/opencpop/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/opencpop/voc1/local/train.sh b/examples/opencpop/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/opencpop/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/opencpop/voc1/path.sh b/examples/opencpop/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/opencpop/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/opencpop/voc1/run.sh b/examples/opencpop/voc1/run.sh new file mode 100755 index 000000000..1f87425f4 --- /dev/null +++ b/examples/opencpop/voc1/run.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_100000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +# dygraph to static +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/dygraph_to_static.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +# PTQ_static +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_opencpop || exit -1 +fi diff --git a/examples/opencpop/voc5/conf/default.yaml b/examples/opencpop/voc5/conf/default.yaml new file mode 100644 index 000000000..10449f860 --- /dev/null +++ b/examples/opencpop/voc5/conf/default.yaml @@ -0,0 +1,167 @@ +# This is the configuration file for CSMSC dataset. +# This configuration is based on HiFiGAN V1, which is an official configuration. +# But I found that the optimizer setting does not work well with my implementation. +# So I changed optimizer settings as follows: +# - AdamW -> Adam +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] +# - Scheduler: ExponentialLR -> MultiStepLR +# To match the shift size difference, the upsample scales is also modified from the original 256 shift setting. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 512 # FFT size (samples). +n_shift: 128 # Hop size (samples). 12.5ms +win_length: 512 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 12000 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 1 # Number of output channels. + channels: 512 # Number of initial channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + upsample_scales: [8, 4, 2, 2] # Upsampling scales. + upsample_kernel_sizes: [16, 8, 4, 4] # Kernel size for upsampling layers. + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. + resblock_dilations: # Dilations for residual blocks. + - [1, 3, 5] + - [1, 3, 5] + - [1, 3, 5] + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. + nonlinear_activation: "leakyrelu" # Nonlinear activation type. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + scales: 3 # Number of multi-scale discriminator. 
+ scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. + scale_downsample_pooling_params: + kernel_size: 4 # Pooling kernel size. + stride: 2 # Pooling stride. + padding: 2 # Padding size. + scale_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. + channels: 128 # Initial number of channels. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + max_groups: 16 # Maximum number of groups in downsampling conv layers. + bias: True + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: True # Whether to follow the official norm setting. + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. + period_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [5, 3] # List of kernel sizes. + channels: 32 # Initial number of channels. + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + bias: True # Whether to use bias parameter in conv layer." + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. +mel_loss_params: + fs: 24000 + fft_size: 512 + hop_size: 128 + win_length: 512 + window: "hann" + num_mels: 80 + fmin: 30 + fmax: 12000 + log_base: null +generator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +discriminator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True +feat_match_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 # Batch size. +batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 1 # Number of workers in DataLoader. 
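The three `lambda_*` coefficients above weight the auxiliary (mel) reconstruction loss, the adversarial loss, and the feature-matching loss when they are summed into the generator objective. The snippet below is a schematic of that usual GAN-vocoder combination, not the exact PaddleSpeech trainer code, and the loss values are placeholders:

```python
# Schematic combination of the generator losses weighted by the coefficients above.
lambda_aux = 45.0        # mel reconstruction loss weight
lambda_adv = 1.0         # adversarial loss weight
lambda_feat_match = 2.0  # feature-matching loss weight

def generator_loss(mel_loss: float, adv_loss: float, fm_loss: float) -> float:
    return (lambda_aux * mel_loss
            + lambda_adv * adv_loss
            + lambda_feat_match * fm_loss)

print(generator_loss(mel_loss=0.35, adv_loss=1.2, fm_loss=0.8))  # 18.55
```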
+ +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 2.0e-4 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +generator_grad_norm: -1 # Generator's gradient norm. +discriminator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 2.0e-4 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +discriminator_grad_norm: -1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train discriminator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 4 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/opencpop/voc5/conf/finetune.yaml b/examples/opencpop/voc5/conf/finetune.yaml new file mode 100644 index 000000000..0022a67aa --- /dev/null +++ b/examples/opencpop/voc5/conf/finetune.yaml @@ -0,0 +1,168 @@ +# This is the configuration file for CSMSC dataset. +# This configuration is based on HiFiGAN V1, which is an official configuration. +# But I found that the optimizer setting does not work well with my implementation. +# So I changed optimizer settings as follows: +# - AdamW -> Adam +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] +# - Scheduler: ExponentialLR -> MultiStepLR +# To match the shift size difference, the upsample scales is also modified from the original 256 shift setting. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 512 # FFT size (samples). +n_shift: 128 # Hop size (samples). 12.5ms +win_length: 512 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 12000 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 1 # Number of output channels. + channels: 512 # Number of initial channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + upsample_scales: [8, 4, 2, 2] # Upsampling scales. 
+ upsample_kernel_sizes: [16, 8, 4, 4] # Kernel size for upsampling layers. + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. + resblock_dilations: # Dilations for residual blocks. + - [1, 3, 5] + - [1, 3, 5] + - [1, 3, 5] + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. + nonlinear_activation: "leakyrelu" # Nonlinear activation type. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + scales: 3 # Number of multi-scale discriminator. + scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. + scale_downsample_pooling_params: + kernel_size: 4 # Pooling kernel size. + stride: 2 # Pooling stride. + padding: 2 # Padding size. + scale_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. + channels: 128 # Initial number of channels. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + max_groups: 16 # Maximum number of groups in downsampling conv layers. + bias: True + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: True # Whether to follow the official norm setting. + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. + period_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [5, 3] # List of kernel sizes. + channels: 32 # Initial number of channels. + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + bias: True # Whether to use bias parameter in conv layer." + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. +mel_loss_params: + fs: 24000 + fft_size: 512 + hop_size: 128 + win_length: 512 + window: "hann" + num_mels: 80 + fmin: 30 + fmax: 12000 + log_base: null +generator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +discriminator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True +feat_match_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. 
+ +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. + +########################################################### +# DATA LOADER SETTING # +########################################################### +#batch_size: 16 # Batch size. +batch_size: 1 # Batch size. +batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 1 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 2.0e-4 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +generator_grad_norm: -1 # Generator's gradient norm. +discriminator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 2.0e-4 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +discriminator_grad_norm: -1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train discriminator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2600000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. 
+ +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 4 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/opencpop/voc5/finetune.sh b/examples/opencpop/voc5/finetune.sh new file mode 100755 index 000000000..76f363295 --- /dev/null +++ b/examples/opencpop/voc5/finetune.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py \ + --diffsinger-config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \ + --diffsinger-checkpoint=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \ + --diffsinger-stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \ + --diffsinger-stretch=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy \ + --dur-file=~/datasets/Opencpop/segments/transcriptions.txt \ + --output-dir=dump_finetune \ + --phones-dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \ + --dataset=opencpop \ + --rootdir=~/datasets/Opencpop/segments/ +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${MAIN_ROOT}/utils/link_wav.py \ + --old-dump-dir=dump \ + --dump-dir=dump_finetune +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + cp dump/train/feats_stats.npy dump_finetune/train/ +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/train/raw/metadata.jsonl \ + --dumpdir=dump_finetune/train/norm \ + --stats=dump_finetune/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/dev/raw/metadata.jsonl \ + --dumpdir=dump_finetune/dev/norm \ + --stats=dump_finetune/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/test/raw/metadata.jsonl \ + --dumpdir=dump_finetune/test/norm \ + --stats=dump_finetune/train/feats_stats.npy +fi + +# create finetune env +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "create finetune env" + python3 local/prepare_env.py \ + --pretrained_model_dir=exp/default/checkpoints/ \ + --output_dir=exp/finetune/ +fi + +# finetune +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + CUDA_VISIBLE_DEVICES=${gpus} \ + FLAGS_cudnn_exhaustive_search=true \ + FLAGS_conv_workspace_size_limit=4000 \ + python ${BIN_DIR}/train.py \ + --train-metadata=dump_finetune/train/norm/metadata.jsonl \ + --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ + --config=conf/finetune.yaml \ + --output-dir=exp/finetune \ + --ngpu=1 +fi diff --git a/examples/opencpop/voc5/local/PTQ_static.sh b/examples/opencpop/voc5/local/PTQ_static.sh new file mode 120000 index 000000000..247ce5c74 --- /dev/null +++ b/examples/opencpop/voc5/local/PTQ_static.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/PTQ_static.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/local/dygraph_to_static.sh b/examples/opencpop/voc5/local/dygraph_to_static.sh new file mode 100755 index 000000000..65077661a --- /dev/null +++ b/examples/opencpop/voc5/local/dygraph_to_static.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + 
+FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../../dygraph_to_static.py \ + --type=voc \ + --voc=hifigan_opencpop \ + --voc_config=${config_path} \ + --voc_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --voc_stat=dump/train/feats_stats.npy \ + --inference_dir=exp/default/inference/ diff --git a/examples/opencpop/voc5/local/prepare_env.py b/examples/opencpop/voc5/local/prepare_env.py new file mode 120000 index 000000000..be03c86b3 --- /dev/null +++ b/examples/opencpop/voc5/local/prepare_env.py @@ -0,0 +1 @@ +../../../other/tts_finetune/tts3/local/prepare_env.py \ No newline at end of file diff --git a/examples/opencpop/voc5/local/preprocess.sh b/examples/opencpop/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/opencpop/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/local/synthesize.sh b/examples/opencpop/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/opencpop/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/local/train.sh b/examples/opencpop/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/opencpop/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/path.sh b/examples/opencpop/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/opencpop/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/run.sh b/examples/opencpop/voc5/run.sh new file mode 100755 index 000000000..290c90d25 --- /dev/null +++ b/examples/opencpop/voc5/run.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_2500000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +# dygraph to static +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/dygraph_to_static.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +# PTQ_static +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} hifigan_opencpop || exit -1 +fi diff --git a/examples/other/tn/data/textnorm_test_cases.txt b/examples/other/tn/data/textnorm_test_cases.txt index 17e90d0b6..ba9e6529a 100644 --- a/examples/other/tn/data/textnorm_test_cases.txt +++ b/examples/other/tn/data/textnorm_test_cases.txt @@ -32,7 +32,7 @@ iPad Pro的秒控键盘这次也推出白色版本。|iPad Pro的秒控键盘这 明天有62%的概率降雨|明天有百分之六十二的概率降雨 这是固话0421-33441122|这是固话零四二一三三四四一一二二 这是手机+86 18544139121|这是手机八六一八五四四一三九一二一 -小王的身高是153.5cm,梦想是打篮球!我觉得有0.1%的可能性。|小王的身高是一百五十三点五cm,梦想是打篮球!我觉得有百分之零点一的可能性。 +小王的身高是153.5cm,梦想是打篮球!我觉得有0.1%的可能性。|小王的身高是一百五十三点五厘米,梦想是打篮球!我觉得有百分之零点一的可能性。 不管三七二十一|不管三七二十一 九九八十一难|九九八十一难 2018年5月23号上午10点10分|二零一八年五月二十三号上午十点十分 @@ -124,4 +124,4 @@ iPad Pro的秒控键盘这次也推出白色版本。|iPad Pro的秒控键盘这 12~23|十二到二十三 12-23|十二到二十三 25cm²|二十五平方厘米 -25m|米 \ No newline at end of file +25m|米 diff --git a/examples/vctk/tts3/run.sh b/examples/vctk/tts3/run.sh index a112b94b7..76307bd5f 100755 --- a/examples/vctk/tts3/run.sh +++ b/examples/vctk/tts3/run.sh @@ -43,10 +43,7 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # install paddle2onnx - version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') - if [[ -z "$version" || ${version} != '1.0.0' ]]; then - pip install paddle2onnx==1.0.0 - fi + pip install paddle2onnx --upgrade ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_vctk # considering the balance between speed and quality, we recommend that you use hifigan as vocoder ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_vctk diff --git a/examples/zh_en_tts/tts3/run.sh b/examples/zh_en_tts/tts3/run.sh index 12f99081a..a4d86480b 100755 --- a/examples/zh_en_tts/tts3/run.sh +++ b/examples/zh_en_tts/tts3/run.sh @@ -46,10 +46,7 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # install paddle2onnx - version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') - if [[ -z "$version" || ${version} != '1.0.0' ]]; then - pip install paddle2onnx==1.0.0 - fi + pip install paddle2onnx --upgrade ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_mix # considering the balance between speed and quality, we recommend that you use hifigan as vocoder ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3 diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index dd5f08b0b..3c5db64bb 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -102,6 +102,20 @@ ssl_dynamic_pretrained_models = { 'params': 
'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', }, + '1.4': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz', + 'md5': + '150e51b8ea5d255ccce6b395de8d916a', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/wav2vec2ASR/checkpoints/avg_1', + 'model': + 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', + 'params': + 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', + }, }, } @@ -1644,8 +1658,8 @@ tts_static_pretrained_models["pwgan_male-en"] = tts_static_pretrained_models[ "pwgan_male-mix"] = tts_static_pretrained_models["pwgan_male-zh"] tts_static_pretrained_models["hifigan_male-en"] = tts_static_pretrained_models[ "hifigan_male-mix"] = tts_static_pretrained_models["hifigan_male-zh"] -tts_static_pretrained_models["pwgan_aishell3-canton"] = tts_static_pretrained_models[ - "pwgan_aishell3-zh"] +tts_static_pretrained_models[ + "pwgan_aishell3-canton"] = tts_static_pretrained_models["pwgan_aishell3-zh"] tts_onnx_pretrained_models = { # speedyspeech @@ -1979,8 +1993,9 @@ tts_onnx_pretrained_models["pwgan_male_onnx-en"] = tts_onnx_pretrained_models[ tts_onnx_pretrained_models["hifigan_male_onnx-en"] = tts_onnx_pretrained_models[ "hifigan_male_onnx-mix"] = tts_onnx_pretrained_models[ "hifigan_male_onnx-zh"] -tts_onnx_pretrained_models["pwgan_aishell3_onnx-canton"] = tts_onnx_pretrained_models[ - "pwgan_aishell3_onnx-zh"] +tts_onnx_pretrained_models[ + "pwgan_aishell3_onnx-canton"] = tts_onnx_pretrained_models[ + "pwgan_aishell3_onnx-zh"] # --------------------------------- # ------------ Vector ------------- @@ -2058,10 +2073,10 @@ rhy_frontend_models = { # --------------------------------- StarGANv2VC_source = { - '1.0' :{ - 'url': 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/starganv2vc/StarGANv2VC_source.zip', - 'md5': '195e169419163f5648030ba84c71f866', - + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/starganv2vc/StarGANv2VC_source.zip', + 'md5': + '195e169419163f5648030ba84c71f866', } } - diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py index 0d66ac410..0295713ff 100644 --- a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +from paddlenlp.transformers import AutoTokenizer from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer @@ -33,9 +34,15 @@ class Wav2vec2Infer(): self.args = args self.config = config self.audio_file = args.audio_file + self.tokenizer = config.get("tokenizer", None) + + if self.tokenizer: + self.text_feature = AutoTokenizer.from_pretrained( + self.config.tokenizer) + else: + self.text_feature = TextFeaturizer( + unit_type=config.unit_type, vocab=config.vocab_filepath) - self.text_feature = TextFeaturizer( - unit_type=config.unit_type, vocab=config.vocab_filepath) paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') # model @@ -59,14 +66,14 @@ class Wav2vec2Infer(): audio, _ = soundfile.read( self.audio_file, dtype="int16", always_2d=True) logger.info(f"audio shape: {audio.shape}") - xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) decode_config = self.config.decode result_transcripts, result_tokenids = self.model.decode( xs, text_feature=self.text_feature, decoding_method=decode_config.decoding_method, - beam_size=decode_config.beam_size) + beam_size=decode_config.beam_size, + tokenizer=self.tokenizer, ) 
rsl = result_transcripts[0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {rsl}") diff --git a/paddlespeech/s2t/exps/wav2vec2/model.py b/paddlespeech/s2t/exps/wav2vec2/model.py index 86b56b876..6c90f99e1 100644 --- a/paddlespeech/s2t/exps/wav2vec2/model.py +++ b/paddlespeech/s2t/exps/wav2vec2/model.py @@ -591,7 +591,7 @@ class Wav2Vec2ASRTrainer(Trainer): def setup_dataloader(self): config = self.config.clone() self.use_streamdata = config.get("use_stream_data", False) - self.use_sb = config.use_sb_pipeline + self.use_sb = config.get("use_sb_pipeline", False) if self.use_sb: hparams_file = config.sb_pipeline_conf with open(hparams_file, 'r', encoding='utf8') as fin: diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 544c1e836..6494b5304 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -43,6 +43,7 @@ from paddlespeech.s2t.modules.ctc import CTCDecoderBase from paddlespeech.s2t.modules.decoder import BiTransformerDecoder from paddlespeech.s2t.modules.decoder import TransformerDecoder from paddlespeech.s2t.modules.encoder import ConformerEncoder +from paddlespeech.s2t.modules.encoder import SqueezeformerEncoder from paddlespeech.s2t.modules.encoder import TransformerEncoder from paddlespeech.s2t.modules.initializer import DefaultInitializerContext from paddlespeech.s2t.modules.loss import LabelSmoothingLoss @@ -905,6 +906,9 @@ class U2Model(U2DecodeModel): elif encoder_type == 'conformer': encoder = ConformerEncoder( input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) + elif encoder_type == 'squeezeformer': + encoder = SqueezeformerEncoder( + input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) else: raise ValueError(f"not support encoder type:{encoder_type}") diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index d9568dcc9..14336c03d 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -200,7 +200,12 @@ class MultiHeadedAttention(nn.Layer): class RelPositionMultiHeadedAttention(MultiHeadedAttention): """Multi-Head Attention layer with relative position encoding.""" - def __init__(self, n_head, n_feat, dropout_rate): + def __init__(self, + n_head, + n_feat, + dropout_rate, + adaptive_scale=False, + init_weights=False): """Construct an RelPositionMultiHeadedAttention object. 
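Editor's note: the `model.py` and `u2.py` hunks above share one pattern — optional settings are read with `config.get(..., default)` so older configs keep working, and the encoder class is chosen from the `encoder` string, now including `squeezeformer`. A small sketch of that selection with placeholder classes (the real ones are `TransformerEncoder`, `ConformerEncoder`, and `SqueezeformerEncoder`):

```python
# Placeholder encoder classes standing in for the real PaddleSpeech encoders.
class TransformerEnc: ...
class ConformerEnc: ...
class SqueezeformerEnc: ...

ENCODERS = {
    "transformer": TransformerEnc,
    "conformer": ConformerEnc,
    "squeezeformer": SqueezeformerEnc,   # newly supported encoder type
}

def build_encoder(configs: dict):
    encoder_type = configs.get("encoder", "transformer")  # hypothetical default
    try:
        cls = ENCODERS[encoder_type]
    except KeyError:
        raise ValueError(f"not support encoder type:{encoder_type}")
    return cls()

print(type(build_encoder({"encoder": "squeezeformer"})).__name__)
```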
Paper: https://arxiv.org/abs/1901.02860 Args: @@ -223,6 +228,39 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): pos_bias_v = self.create_parameter( (self.h, self.d_k), default_initializer=I.XavierUniform()) self.add_parameter('pos_bias_v', pos_bias_v) + self.adaptive_scale = adaptive_scale + if self.adaptive_scale: + ada_scale = self.create_parameter( + [1, 1, n_feat], default_initializer=I.Constant(1.0)) + self.add_parameter('ada_scale', ada_scale) + ada_bias = self.create_parameter( + [1, 1, n_feat], default_initializer=I.Constant(0.0)) + self.add_parameter('ada_bias', ada_bias) + if init_weights: + self.init_weights() + + def init_weights(self): + input_max = (self.h * self.d_k)**-0.5 + self.linear_q._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_q._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_k._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_k._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_v._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_v._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_pos._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_pos._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_out._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_out._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) def rel_shift(self, x, zero_triu: bool=False): """Compute relative positinal encoding. @@ -273,6 +311,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): where `cache_t == chunk_size * num_decoding_left_chunks` and `head * d_k == size` """ + if self.adaptive_scale: + query = self.ada_scale * query + self.ada_bias + key = self.ada_scale * key + self.ada_bias + value = self.ada_scale * value + self.ada_bias + q, k, v = self.forward_qkv(query, key, value) # q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index 09d903eee..7a0c72f3b 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -18,6 +18,7 @@ from typing import Tuple import paddle from paddle import nn +from paddle.nn import initializer as I from typeguard import check_argument_types from paddlespeech.s2t.modules.align import BatchNorm1D @@ -39,7 +40,9 @@ class ConvolutionModule(nn.Layer): activation: nn.Layer=nn.ReLU(), norm: str="batch_norm", causal: bool=False, - bias: bool=True): + bias: bool=True, + adaptive_scale: bool=False, + init_weights: bool=False): """Construct an ConvolutionModule object. Args: channels (int): The number of channels of conv layers. 
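Editor's note: the `init_weights` added to the relative-position attention above draws every projection weight and bias from a uniform distribution whose half-width is `(h * d_k) ** -0.5`, i.e. 1/sqrt(fan_in) of the attention feature size. A NumPy sketch of just that bound (illustrative only; the real code assigns Paddle initializers to the linear layers):

```python
import numpy as np

def uniform_init(fan_in: int, shape, rng=np.random.default_rng(0)):
    # half-width of the uniform range is 1 / sqrt(fan_in)
    bound = fan_in ** -0.5
    return rng.uniform(low=-bound, high=bound, size=shape)

n_head, d_k = 4, 64                       # so n_feat = 256
w = uniform_init(n_head * d_k, (256, 256))
print(w.min() >= -(256 ** -0.5), w.max() <= 256 ** -0.5)   # True True
```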
@@ -51,6 +54,18 @@ class ConvolutionModule(nn.Layer): """ assert check_argument_types() super().__init__() + self.bias = bias + self.channels = channels + self.kernel_size = kernel_size + self.adaptive_scale = adaptive_scale + if self.adaptive_scale: + ada_scale = self.create_parameter( + [1, 1, channels], default_initializer=I.Constant(1.0)) + self.add_parameter('ada_scale', ada_scale) + ada_bias = self.create_parameter( + [1, 1, channels], default_initializer=I.Constant(0.0)) + self.add_parameter('ada_bias', ada_bias) + self.pointwise_conv1 = Conv1D( channels, 2 * channels, @@ -105,6 +120,28 @@ class ConvolutionModule(nn.Layer): ) self.activation = activation + if init_weights: + self.init_weights() + + def init_weights(self): + pw_max = self.channels**-0.5 + dw_max = self.kernel_size**-0.5 + self.pointwise_conv1._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + if self.bias: + self.pointwise_conv1._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + self.depthwise_conv._param_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + if self.bias: + self.depthwise_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.pointwise_conv2._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + if self.bias: + self.pointwise_conv2._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + def forward( self, x: paddle.Tensor, @@ -123,6 +160,9 @@ class ConvolutionModule(nn.Layer): paddle.Tensor: Output tensor (#batch, time, channels). paddle.Tensor: Output cache tensor (#batch, channels, time') """ + if self.adaptive_scale: + x = self.ada_scale * x + self.ada_bias + # exchange the temporal dimension and the feature dimension x = x.transpose([0, 2, 1]) # [B, C, T] diff --git a/paddlespeech/s2t/modules/conv2d.py b/paddlespeech/s2t/modules/conv2d.py new file mode 100644 index 000000000..ca6e136ad --- /dev/null +++ b/paddlespeech/s2t/modules/conv2d.py @@ -0,0 +1,62 @@ +from typing import Optional +from typing import Union + +import paddle +import paddle.nn.functional as F +from paddle.nn.layer.conv import _ConvNd + +__all__ = ['Conv2DValid'] + + +class Conv2DValid(_ConvNd): + """ + Conv2d operator for VALID mode padding. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int=1, + padding: Union[str, int]=0, + dilation: int=1, + groups: int=1, + padding_mode: str='zeros', + weight_attr=None, + bias_attr=None, + data_format="NCHW", + valid_trigx: bool=False, + valid_trigy: bool=False) -> None: + super(Conv2DValid, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 2, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + self.valid_trigx = valid_trigx + self.valid_trigy = valid_trigy + + def _conv_forward(self, + input: paddle.Tensor, + weight: paddle.Tensor, + bias: Optional[paddle.Tensor]): + validx, validy = 0, 0 + if self.valid_trigx: + validx = (input.shape[-2] * + (self._stride[-2] - 1) - 1 + self._kernel_size[-2]) // 2 + if self.valid_trigy: + validy = (input.shape[-1] * + (self._stride[-1] - 1) - 1 + self._kernel_size[-1]) // 2 + return F.conv2d(input, weight, bias, self._stride, (validx, validy), + self._dilation, self._groups) + + def forward(self, input: paddle.Tensor) -> paddle.Tensor: + return self._conv_forward(input, self.weight, self.bias) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index fd7bd7b9a..d90d69d77 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -14,7 +14,10 @@ # limitations under the License. # Modified from wenet(https://github.com/wenet-e2e/wenet) """Encoder definition.""" +from typing import List +from typing import Optional from typing import Tuple +from typing import Union import paddle from paddle import nn @@ -22,6 +25,7 @@ from typeguard import check_argument_types from paddlespeech.s2t.modules.activation import get_activation from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.attention import MultiHeadedAttention from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule @@ -29,6 +33,7 @@ from paddlespeech.s2t.modules.embedding import NoPositionalEncoding from paddlespeech.s2t.modules.embedding import PositionalEncoding from paddlespeech.s2t.modules.embedding import RelPositionalEncoding from paddlespeech.s2t.modules.encoder_layer import ConformerEncoderLayer +from paddlespeech.s2t.modules.encoder_layer import SqueezeformerEncoderLayer from paddlespeech.s2t.modules.encoder_layer import TransformerEncoderLayer from paddlespeech.s2t.modules.mask import add_optional_chunk_mask from paddlespeech.s2t.modules.mask import make_non_pad_mask @@ -36,12 +41,19 @@ from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedF from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4 from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling6 from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling8 +from paddlespeech.s2t.modules.subsampling import DepthwiseConv2DSubsampling4 from paddlespeech.s2t.modules.subsampling import LinearNoSubsampling +from paddlespeech.s2t.modules.time_reduction import TimeReductionLayer1D +from paddlespeech.s2t.modules.time_reduction import TimeReductionLayer2D +from paddlespeech.s2t.modules.time_reduction import TimeReductionLayerStream from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() -__all__ = ["BaseEncoder", 
'TransformerEncoder', "ConformerEncoder"] +__all__ = [ + "BaseEncoder", 'TransformerEncoder', "ConformerEncoder", + "SqueezeformerEncoder" +] class BaseEncoder(nn.Layer): @@ -487,3 +499,366 @@ class ConformerEncoder(BaseEncoder): normalize_before=normalize_before, concat_after=concat_after) for _ in range(num_blocks) ]) + + +class SqueezeformerEncoder(nn.Layer): + def __init__(self, + input_size: int, + encoder_dim: int=256, + output_size: int=256, + attention_heads: int=4, + num_blocks: int=12, + reduce_idx: Optional[Union[int, List[int]]]=5, + recover_idx: Optional[Union[int, List[int]]]=11, + feed_forward_expansion_factor: int=4, + dw_stride: bool=False, + input_dropout_rate: float=0.1, + pos_enc_layer_type: str="rel_pos", + time_reduction_layer_type: str="conv1d", + feed_forward_dropout_rate: float=0.1, + attention_dropout_rate: float=0.1, + cnn_module_kernel: int=31, + cnn_norm_type: str="layer_norm", + dropout: float=0.1, + causal: bool=False, + adaptive_scale: bool=True, + activation_type: str="swish", + init_weights: bool=True, + global_cmvn: paddle.nn.Layer=None, + normalize_before: bool=False, + use_dynamic_chunk: bool=False, + concat_after: bool=False, + static_chunk_size: int=0, + use_dynamic_left_chunk: bool=False): + """Construct SqueezeformerEncoder + + Args: + input_size to use_dynamic_chunk, see in Transformer BaseEncoder. + encoder_dim (int): The hidden dimension of encoder layer. + output_size (int): The output dimension of final projection layer. + attention_heads (int): Num of attention head in attention module. + num_blocks (int): Num of encoder layers. + reduce_idx Optional[Union[int, List[int]]]: + reduce layer index, from 40ms to 80ms per frame. + recover_idx Optional[Union[int, List[int]]]: + recover layer index, from 80ms to 40ms per frame. + feed_forward_expansion_factor (int): Enlarge coefficient of FFN. + dw_stride (bool): Whether do depthwise convolution + on subsampling module. + input_dropout_rate (float): Dropout rate of input projection layer. + pos_enc_layer_type (str): Self attention type. + time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. + cnn_module_kernel (int): Kernel size of CNN module. + activation_type (str): Encoder activation function type. + cnn_module_kernel (int): Kernel size of convolution module. + adaptive_scale (bool): Whether to use adaptive scale. + init_weights (bool): Whether to initialize weights. + causal (bool): whether to use causal convolution or not. 
+ """ + assert check_argument_types() + super().__init__() + self.global_cmvn = global_cmvn + self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ + if type(reduce_idx) == int else reduce_idx + self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ + if type(recover_idx) == int else recover_idx + self.check_ascending_list() + if reduce_idx is None: + self.time_reduce = None + else: + if recover_idx is None: + self.time_reduce = 'normal' # no recovery at the end + else: + self.time_reduce = 'recover' # recovery at the end + assert len(self.reduce_idx) == len(self.recover_idx) + self.reduce_stride = 2 + self._output_size = output_size + self.normalize_before = normalize_before + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk + activation = get_activation(activation_type) + + # self-attention module definition + if pos_enc_layer_type != "rel_pos": + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, output_size, + attention_dropout_rate) + else: + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate, + adaptive_scale, init_weights) + + # feed-forward module definition + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + encoder_dim, encoder_dim * feed_forward_expansion_factor, + feed_forward_dropout_rate, activation, adaptive_scale, init_weights) + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (encoder_dim, cnn_module_kernel, activation, + cnn_norm_type, causal, True, adaptive_scale, + init_weights) + + self.embed = DepthwiseConv2DSubsampling4( + 1, encoder_dim, + RelPositionalEncoding(encoder_dim, dropout_rate=0.1), dw_stride, + input_size, input_dropout_rate, init_weights) + + self.preln = LayerNorm(encoder_dim) + self.encoders = paddle.nn.LayerList([ + SqueezeformerEncoderLayer( + encoder_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + convolution_layer(*convolution_layer_args), + positionwise_layer(*positionwise_layer_args), normalize_before, + dropout, concat_after) for _ in range(num_blocks) + ]) + if time_reduction_layer_type == 'conv1d': + time_reduction_layer = TimeReductionLayer1D + time_reduction_layer_args = { + 'channel': encoder_dim, + 'out_dim': encoder_dim, + } + elif time_reduction_layer_type == 'stream': + time_reduction_layer = TimeReductionLayerStream + time_reduction_layer_args = { + 'channel': encoder_dim, + 'out_dim': encoder_dim, + } + else: + time_reduction_layer = TimeReductionLayer2D + time_reduction_layer_args = {'encoder_dim': encoder_dim} + + self.time_reduction_layer = time_reduction_layer( + **time_reduction_layer_args) + self.time_recover_layer = Linear(encoder_dim, encoder_dim) + self.final_proj = None + if output_size != encoder_dim: + self.final_proj = Linear(encoder_dim, output_size) + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs: paddle.Tensor, + xs_lens: paddle.Tensor, + decoding_chunk_size: int=0, + num_decoding_left_chunks: int=-1, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Embed positions in tensor. + Args: + xs: padded input tensor (B, L, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. 
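Editor's note: in the constructor above, `reduce_idx` and `recover_idx` may be given as a single int or as a list, and whether the encoder runs in "reduce only" or "reduce then recover" mode is derived from which of them is set. A standalone sketch of that bookkeeping, mirroring the logic outside the class:

```python
from typing import List, Optional, Union

def plan_time_reduce(reduce_idx: Optional[Union[int, List[int]]],
                     recover_idx: Optional[Union[int, List[int]]]):
    # normalise "int or list" arguments to lists
    reduce_idx = [reduce_idx] if isinstance(reduce_idx, int) else reduce_idx
    recover_idx = [recover_idx] if isinstance(recover_idx, int) else recover_idx
    if reduce_idx is None:
        mode = None              # never downsample in time
    elif recover_idx is None:
        mode = "normal"          # downsample and stay at the lower frame rate
    else:
        mode = "recover"         # downsample, then restore the rate later
        assert len(reduce_idx) == len(recover_idx)
    return reduce_idx, recover_idx, mode

print(plan_time_reduce(5, 11))    # ([5], [11], 'recover')
print(plan_time_reduce(5, None))  # ([5], None, 'normal')
```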
+ <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + encoder output tensor, lens and mask + """ + masks = make_non_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L) + + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) + mask_pad = masks + chunk_masks = add_optional_chunk_mask( + xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, + decoding_chunk_size, self.static_chunk_size, + num_decoding_left_chunks) + xs_lens = chunk_masks.squeeze(1).sum(1) + xs = self.preln(xs) + recover_activations: \ + List[Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]] = [] + index = 0 + for i, layer in enumerate(self.encoders): + if self.reduce_idx is not None: + if self.time_reduce is not None and i in self.reduce_idx: + recover_activations.append( + (xs, chunk_masks, pos_emb, mask_pad)) + xs, xs_lens, chunk_masks, mask_pad = self.time_reduction_layer( + xs, xs_lens, chunk_masks, mask_pad) + pos_emb = pos_emb[:, ::2, :] + index += 1 + + if self.recover_idx is not None: + if self.time_reduce == 'recover' and i in self.recover_idx: + index -= 1 + recover_tensor, recover_chunk_masks, recover_pos_emb, recover_mask_pad = recover_activations[ + index] + # recover output length for ctc decode + xs = paddle.repeat_interleave(xs, repeats=2, axis=1) + xs = self.time_recover_layer(xs) + recoverd_t = recover_tensor.shape[1] + xs = recover_tensor + xs[:, :recoverd_t, :] + chunk_masks = recover_chunk_masks + pos_emb = recover_pos_emb + mask_pad = recover_mask_pad + + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + + if self.final_proj is not None: + xs = self.final_proj(xs) + return xs, masks + + def check_ascending_list(self): + if self.reduce_idx is not None: + assert self.reduce_idx == sorted(self.reduce_idx), \ + "reduce_idx should be int or ascending list" + if self.recover_idx is not None: + assert self.recover_idx == sorted(self.recover_idx), \ + "recover_idx should be int or ascending list" + + def calculate_downsampling_factor(self, i: int) -> int: + if self.reduce_idx is None: + return 1 + else: + reduce_exp, recover_exp = 0, 0 + for exp, rd_idx in enumerate(self.reduce_idx): + if i >= rd_idx: + reduce_exp = exp + 1 + if self.recover_idx is not None: + for exp, rc_idx in enumerate(self.recover_idx): + if i >= rc_idx: + recover_exp = exp + 1 + return int(2**(reduce_exp - recover_exp)) + + def forward_chunk( + self, + xs: paddle.Tensor, + offset: int, + required_cache_size: int, + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + att_mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """ Forward just one chunk + + Args: + xs (paddle.Tensor): chunk input, with shape (b=1, time, mel-dim), + where `time == (chunk_size - 1) * subsample_rate + \ + subsample.right_context + 1` + offset (int): current offset in encoder output time stamp + required_cache_size (int): cache size required for next chunk + compuation + >=0: actual cache size + <0: means all history cache is required + att_cache (paddle.Tensor): cache tensor for KEY & VALUE in + transformer/conformer attention, with shape + (elayers, head, cache_t1, d_k * 2), where + `head * d_k == hidden-dim` and + `cache_t1 == 
chunk_size * num_decoding_left_chunks`. + cnn_cache (paddle.Tensor): cache tensor for cnn_module in conformer, + (elayers, b=1, hidden-dim, cache_t2), where + `cache_t2 == cnn.lorder - 1` + + Returns: + paddle.Tensor: output of current input xs, + with shape (b=1, chunk_size, hidden-dim). + paddle.Tensor: new attention cache required for next chunk, with + dynamic shape (elayers, head, ?, d_k * 2) + depending on required_cache_size. + paddle.Tensor: new conformer cnn cache required for next chunk, with + same shape as the original cnn_cache. + """ + assert xs.shape[0] == 1 # batch size must be one + + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + + # tmp_masks is just for interface compatibility, [B=1, C=1, T] + tmp_masks = paddle.ones([1, 1, xs.shape[1]], dtype=paddle.bool) + # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) + + # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) + elayers, cache_t1 = att_cache.shape[0], att_cache.shape[2] + chunk_size = xs.shape[1] + attention_key_size = cache_t1 + chunk_size + pos_emb = self.embed.position_encoding( + offset=offset - cache_t1, size=attention_key_size) + if required_cache_size < 0: + next_cache_start = 0 + elif required_cache_size == 0: + next_cache_start = attention_key_size + else: + next_cache_start = max(attention_key_size - required_cache_size, 0) + + r_att_cache = [] + r_cnn_cache = [] + + mask_pad = paddle.ones([1, xs.shape[1]], dtype=paddle.bool) + mask_pad = mask_pad.unsqueeze(1) + max_att_len: int = 0 + recover_activations: \ + List[Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]] = [] + index = 0 + xs_lens = paddle.to_tensor([xs.shape[1]], dtype=paddle.int32) + xs = self.preln(xs) + for i, layer in enumerate(self.encoders): + # NOTE(xcsong): Before layer.forward + # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), + # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) + if self.reduce_idx is not None: + if self.time_reduce is not None and i in self.reduce_idx: + recover_activations.append( + (xs, att_mask, pos_emb, mask_pad)) + xs, xs_lens, att_mask, mask_pad = self.time_reduction_layer( + xs, xs_lens, att_mask, mask_pad) + pos_emb = pos_emb[:, ::2, :] + index += 1 + + if self.recover_idx is not None: + if self.time_reduce == 'recover' and i in self.recover_idx: + index -= 1 + recover_tensor, recover_att_mask, recover_pos_emb, recover_mask_pad = recover_activations[ + index] + # recover output length for ctc decode + xs = paddle.repeat_interleave(xs, repeats=2, axis=1) + xs = self.time_recover_layer(xs) + recoverd_t = recover_tensor.shape[1] + xs = recover_tensor + xs[:, :recoverd_t, :] + att_mask = recover_att_mask + pos_emb = recover_pos_emb + mask_pad = recover_mask_pad + + factor = self.calculate_downsampling_factor(i) + att_cache1 = att_cache[ + i:i + 1][:, :, ::factor, :][:, :, :pos_emb.shape[1] - xs.shape[ + 1], :] + cnn_cache1 = cnn_cache[i] if cnn_cache.shape[0] > 0 else cnn_cache + xs, _, new_att_cache, new_cnn_cache = layer( + xs, + att_mask, + pos_emb, + att_cache=att_cache1, + cnn_cache=cnn_cache1) + # NOTE(xcsong): After layer.forward + # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), + # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) + cached_att = new_att_cache[:, :, next_cache_start // factor:, :] + cached_cnn = new_cnn_cache.unsqueeze(0) + cached_att = cached_att.repeat_interleave(repeats=factor, axis=2) + if i == 0: + # record length for the first block as max 
length + max_att_len = cached_att.shape[2] + r_att_cache.append(cached_att[:, :, :max_att_len, :]) + r_cnn_cache.append(cached_cnn) + # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), + # ? may be larger than cache_t1, it depends on required_cache_size + r_att_cache = paddle.concat(r_att_cache, axis=0) + # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) + r_cnn_cache = paddle.concat(r_cnn_cache, axis=0) + + if self.final_proj is not None: + xs = self.final_proj(xs) + return xs, r_att_cache, r_cnn_cache diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index dac62bce3..ecba95e85 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -26,7 +26,10 @@ from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() -__all__ = ["TransformerEncoderLayer", "ConformerEncoderLayer"] +__all__ = [ + "TransformerEncoderLayer", "ConformerEncoderLayer", + "SqueezeformerEncoderLayer" +] class TransformerEncoderLayer(nn.Layer): @@ -276,3 +279,125 @@ class ConformerEncoderLayer(nn.Layer): x = self.norm_final(x) return x, mask, new_att_cache, new_cnn_cache + + +class SqueezeformerEncoderLayer(nn.Layer): + """Encoder layer module.""" + + def __init__(self, + size: int, + self_attn: paddle.nn.Layer, + feed_forward1: Optional[nn.Layer]=None, + conv_module: Optional[nn.Layer]=None, + feed_forward2: Optional[nn.Layer]=None, + normalize_before: bool=False, + dropout_rate: float=0.1, + concat_after: bool=False): + """Construct an EncoderLayer object. + + Args: + size (int): Input dimension. + self_attn (paddle.nn.Layer): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + instance can be used as the argument. + feed_forward1 (paddle.nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward` instance can be used as the argument. + conv_module (paddle.nn.Layer): Convolution module instance. + `ConvlutionLayer` instance can be used as the argument. + feed_forward2 (paddle.nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: use layer_norm after each sub-block. + """ + super().__init__() + self.size = size + self.self_attn = self_attn + self.layer_norm1 = LayerNorm(size) + self.ffn1 = feed_forward1 + self.layer_norm2 = LayerNorm(size) + self.conv_module = conv_module + self.layer_norm3 = LayerNorm(size) + self.ffn2 = feed_forward2 + self.layer_norm4 = LayerNorm(size) + self.normalize_before = normalize_before + self.dropout = nn.Dropout(dropout_rate) + self.concat_after = concat_after + if concat_after: + self.concat_linear = Linear(size + size, size) + else: + self.concat_linear = nn.Identity() + + def forward( + self, + x: paddle.Tensor, + mask: paddle.Tensor, + pos_emb: paddle.Tensor, + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Compute encoded features. + Args: + x (paddle.Tensor): Input tensor (#batch, time, size). + mask (paddle.Tensor): Mask tensor for the input (#batch, time, time). + (0,0,0) means fake mask. 
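Editor's note: `calculate_downsampling_factor(i)` above returns how much coarser layer `i`'s frame rate is than the encoder input — each reduce layer passed doubles it, each recover layer passed halves it again — which is why the streaming attention cache is sliced with `[:, :, ::factor, :]` and later re-expanded with `repeat_interleave`. The same arithmetic as a free function:

```python
def downsampling_factor(i: int, reduce_idx=None, recover_idx=None) -> int:
    """Frame-rate factor of encoder layer i relative to the encoder input."""
    if reduce_idx is None:
        return 1
    reduce_exp = sum(1 for rd in reduce_idx if i >= rd)
    recover_exp = sum(1 for rc in (recover_idx or []) if i >= rc)
    return 2 ** (reduce_exp - recover_exp)

# e.g. reduce at layer 5, recover at layer 11 (the defaults above):
print([downsampling_factor(i, [5], [11]) for i in range(12)])
# [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1]
```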
+ pos_emb (paddle.Tensor): postional encoding, must not be None + for ConformerEncoderLayer + mask_pad (paddle.Tensor): batch padding mask used for conv module. + (#batch, 1,time), (0, 0, 0) means fake mask. + att_cache (paddle.Tensor): Cache tensor of the KEY & VALUE + (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. + cnn_cache (paddle.Tensor): Convolution cache in conformer layer + (1, #batch=1, size, cache_t2). First dim will not be used, just + for dy2st. + Returns: + paddle.Tensor: Output tensor (#batch, time, size). + paddle.Tensor: Mask tensor (#batch, time, time). + paddle.Tensor: att_cache tensor, + (#batch=1, head, cache_t1 + time, d_k * 2). + paddle.Tensor: cnn_cahce tensor (#batch, size, cache_t2). + """ + # self attention module + residual = x + if self.normalize_before: + x = self.layer_norm1(x) + x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) + if self.concat_after: + x_concat = paddle.concat((x, x_att), axis=-1) + x = residual + self.concat_linear(x_concat) + else: + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.layer_norm1(x) + + # ffn module + residual = x + if self.normalize_before: + x = self.layer_norm2(x) + x = self.ffn1(x) + x = residual + self.dropout(x) + if not self.normalize_before: + x = self.layer_norm2(x) + + # conv module + residual = x + if self.normalize_before: + x = self.layer_norm3(x) + x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) + x = residual + self.dropout(x) + if not self.normalize_before: + x = self.layer_norm3(x) + + # ffn module + residual = x + if self.normalize_before: + x = self.layer_norm4(x) + x = self.ffn2(x) + # we do not use dropout here since it is inside feed forward function + x = residual + self.dropout(x) + if not self.normalize_before: + x = self.layer_norm4(x) + + return x, mask, new_att_cache, new_cnn_cache diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py index c2725dc5c..9ebd5d638 100644 --- a/paddlespeech/s2t/modules/positionwise_feed_forward.py +++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py @@ -16,6 +16,7 @@ """Positionwise feed forward layer definition.""" import paddle from paddle import nn +from paddle.nn import initializer as I from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log @@ -32,7 +33,9 @@ class PositionwiseFeedForward(nn.Layer): idim: int, hidden_units: int, dropout_rate: float, - activation: nn.Layer=nn.ReLU()): + activation: nn.Layer=nn.ReLU(), + adaptive_scale: bool=False, + init_weights: bool=False): """Construct a PositionwiseFeedForward object. FeedForward are appied on each position of the sequence. 
@@ -45,10 +48,35 @@ class PositionwiseFeedForward(nn.Layer): activation (paddle.nn.Layer): Activation function """ super().__init__() + self.idim = idim + self.hidden_units = hidden_units self.w_1 = Linear(idim, hidden_units) self.activation = activation self.dropout = nn.Dropout(dropout_rate) self.w_2 = Linear(hidden_units, idim) + self.adaptive_scale = adaptive_scale + if self.adaptive_scale: + ada_scale = self.create_parameter( + [1, 1, idim], default_initializer=I.XavierUniform()) + self.add_parameter('ada_scale', ada_scale) + ada_bias = self.create_parameter( + [1, 1, idim], default_initializer=I.XavierUniform()) + self.add_parameter('ada_bias', ada_bias) + + if init_weights: + self.init_weights() + + def init_weights(self): + ffn1_max = self.idim**-0.5 + ffn2_max = self.hidden_units**-0.5 + self.w_1._param_attr = paddle.nn.initializer.Uniform( + low=-ffn1_max, high=ffn1_max) + self.w_1._bias_attr = paddle.nn.initializer.Uniform( + low=-ffn1_max, high=ffn1_max) + self.w_2._param_attr = paddle.nn.initializer.Uniform( + low=-ffn2_max, high=ffn2_max) + self.w_2._bias_attr = paddle.nn.initializer.Uniform( + low=-ffn2_max, high=ffn2_max) def forward(self, xs: paddle.Tensor) -> paddle.Tensor: """Forward function. @@ -57,4 +85,6 @@ class PositionwiseFeedForward(nn.Layer): Returns: output tensor, (B, Lmax, D) """ + if self.adaptive_scale: + xs = self.ada_scale * xs + self.ada_bias return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 782a437ee..ef60bdf0a 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -29,7 +29,7 @@ logger = Log(__name__).getlog() __all__ = [ "LinearNoSubsampling", "Conv2dSubsampling4", "Conv2dSubsampling6", - "Conv2dSubsampling8" + "Conv2dSubsampling8", "DepthwiseConv2DSubsampling4" ] @@ -249,3 +249,67 @@ class Conv2dSubsampling8(Conv2dSubsampling): x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2] + + +class DepthwiseConv2DSubsampling4(BaseSubsampling): + """Depthwise Convolutional 2D subsampling (to 1/4 length). + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + pos_enc_class (nn.Layer): position encoding class. + dw_stride (int): Whether do depthwise convolution. + input_size (int): filter bank dimension. 
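Editor's note: the attention, convolution, and feed-forward hunks above all add the same "adaptive scale" preamble — a learnable per-channel scale and bias of shape `[1, 1, n_feat]` applied to the sub-module input, which Squeezeformer uses in place of the usual pre-layer-norm scaling. A NumPy sketch of the broadcast it relies on (illustrative; the real parameters are Paddle tensors created with `create_parameter`):

```python
import numpy as np

batch, time, n_feat = 2, 5, 8
x = np.random.randn(batch, time, n_feat).astype("float32")

# learnable per-channel affine, initialised to the identity (scale=1, bias=0)
ada_scale = np.ones((1, 1, n_feat), dtype="float32")
ada_bias = np.zeros((1, 1, n_feat), dtype="float32")

y = ada_scale * x + ada_bias     # broadcasts over batch and time
assert y.shape == (batch, time, n_feat)
print(np.allclose(x, y))         # True at initialisation
```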
+ + """ + + def __init__(self, + idim: int, + odim: int, + pos_enc_class: nn.Layer, + dw_stride: bool=False, + input_size: int=80, + input_dropout_rate: float=0.1, + init_weights: bool=True): + super(DepthwiseConv2DSubsampling4, self).__init__() + self.idim = idim + self.odim = odim + self.pw_conv = Conv2D( + in_channels=idim, out_channels=odim, kernel_size=3, stride=2) + self.act1 = nn.ReLU() + self.dw_conv = Conv2D( + in_channels=odim, + out_channels=odim, + kernel_size=3, + stride=2, + groups=odim if dw_stride else 1) + self.act2 = nn.ReLU() + self.pos_enc = pos_enc_class + self.input_proj = nn.Sequential( + Linear(odim * (((input_size - 1) // 2 - 1) // 2), odim), + nn.Dropout(p=input_dropout_rate)) + if init_weights: + linear_max = (odim * input_size / 4)**-0.5 + self.input_proj.state_dict()[ + '0.weight'] = paddle.nn.initializer.Uniform( + low=-linear_max, high=linear_max) + self.input_proj.state_dict()[ + '0.bias'] = paddle.nn.initializer.Uniform( + low=-linear_max, high=linear_max) + + self.subsampling_rate = 4 + # 6 = (3 - 1) * 1 + (3 - 1) * 2 + self.right_context = 6 + + def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0 + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + x = x.unsqueeze(1) # (b, c=1, t, f) + x = self.pw_conv(x) + x = self.act1(x) + x = self.dw_conv(x) + x = self.act2(x) + b, c, t, f = x.shape + x = x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f]) + x, pos_emb = self.pos_enc(x, offset) + x = self.input_proj(x) + return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] diff --git a/paddlespeech/s2t/modules/time_reduction.py b/paddlespeech/s2t/modules/time_reduction.py new file mode 100644 index 000000000..d3393f108 --- /dev/null +++ b/paddlespeech/s2t/modules/time_reduction.py @@ -0,0 +1,263 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""Subsampling layer definition.""" +from typing import Tuple + +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.s2t import masked_fill +from paddlespeech.s2t.modules.align import Conv1D +from paddlespeech.s2t.modules.conv2d import Conv2DValid +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = [ + "TimeReductionLayerStream", "TimeReductionLayer1D", "TimeReductionLayer2D" +] + + +class TimeReductionLayer1D(nn.Layer): + """ + Modified NeMo, + Squeezeformer Time Reduction procedure. + Downsamples the audio by `stride` in the time dimension. + Args: + channel (int): input dimension of + MultiheadAttentionMechanism and PositionwiseFeedForward + out_dim (int): Output dimension of the module. + kernel_size (int): Conv kernel size for + depthwise convolution in convolution module + stride (int): Downsampling factor in time dimension. 
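Editor's note: `DepthwiseConv2DSubsampling4` above stacks two stride-2, kernel-3 convolutions without padding, so both the time axis and the feature axis shrink roughly by `(n - 1) // 2` per conv — which is where the `odim * (((input_size - 1) // 2 - 1) // 2)` in-features of the projection and the `right_context = 6` lookahead come from. A small check of that arithmetic:

```python
def conv_out_len(n: int, kernel: int = 3, stride: int = 2) -> int:
    # output length of a valid (unpadded) 1-D convolution
    return (n - kernel) // stride + 1

input_size, odim = 80, 256        # fbank dimension and encoder dimension
freq = conv_out_len(conv_out_len(input_size))
assert freq == ((input_size - 1) // 2 - 1) // 2 == 19
print("projection in-features:", odim * freq)        # 256 * 19 = 4864

# receptive-field lookahead of the two convs: (3 - 1) * 1 + (3 - 1) * 2
print("right context:", (3 - 1) * 1 + (3 - 1) * 2)   # 6
```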
+ """ + + def __init__(self, + channel: int, + out_dim: int, + kernel_size: int=5, + stride: int=2): + super(TimeReductionLayer1D, self).__init__() + + self.channel = channel + self.out_dim = out_dim + self.kernel_size = kernel_size + self.stride = stride + self.padding = max(0, self.kernel_size - self.stride) + + self.dw_conv = Conv1D( + in_channels=channel, + out_channels=channel, + kernel_size=kernel_size, + stride=stride, + padding=self.padding, + groups=channel, ) + + self.pw_conv = Conv1D( + in_channels=channel, + out_channels=out_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1, ) + + self.init_weights() + + def init_weights(self): + dw_max = self.kernel_size**-0.5 + pw_max = self.channel**-0.5 + self.dw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.dw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.pw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + self.pw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + + def forward( + self, + xs, + xs_lens: paddle.Tensor, + mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool), + mask_pad: paddle.Tensor=paddle.ones((0, 0, 0), + dtype=paddle.bool), ): + xs = xs.transpose([0, 2, 1]) # [B, C, T] + xs = masked_fill(xs, mask_pad.equal(0), 0.0) + + xs = self.dw_conv(xs) + xs = self.pw_conv(xs) + + xs = xs.transpose([0, 2, 1]) # [B, T, C] + + B, T, D = xs.shape + mask = mask[:, ::self.stride, ::self.stride] + mask_pad = mask_pad[:, :, ::self.stride] + L = mask_pad.shape[-1] + # For JIT exporting, we remove F.pad operator. + if L - T < 0: + xs = xs[:, :L - T, :] + else: + dummy_pad = paddle.zeros([B, L - T, D], dtype=paddle.float32) + xs = paddle.concat([xs, dummy_pad], axis=1) + + xs_lens = (xs_lens + 1) // 2 + return xs, xs_lens, mask, mask_pad + + +class TimeReductionLayer2D(nn.Layer): + def __init__(self, kernel_size: int=5, stride: int=2, encoder_dim: int=256): + super(TimeReductionLayer2D, self).__init__() + self.encoder_dim = encoder_dim + self.kernel_size = kernel_size + self.dw_conv = Conv2DValid( + in_channels=encoder_dim, + out_channels=encoder_dim, + kernel_size=(kernel_size, 1), + stride=stride, + valid_trigy=True) + self.pw_conv = Conv2DValid( + in_channels=encoder_dim, + out_channels=encoder_dim, + kernel_size=1, + stride=1, + valid_trigx=False, + valid_trigy=False) + + self.kernel_size = kernel_size + self.stride = stride + self.init_weights() + + def init_weights(self): + dw_max = self.kernel_size**-0.5 + pw_max = self.encoder_dim**-0.5 + self.dw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.dw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.pw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + self.pw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + + def forward( + self, + xs: paddle.Tensor, + xs_lens: paddle.Tensor, + mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool), + mask_pad: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool), + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: + xs = masked_fill(xs, mask_pad.transpose([0, 2, 1]).equal(0), 0.0) + xs = xs.unsqueeze(1) + padding1 = self.kernel_size - self.stride + xs = F.pad( + xs, (0, 0, 0, 0, 0, padding1, 0, 0), mode='constant', value=0.) 
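Editor's note: all three time-reduction layers share the same bookkeeping around their stride-2 convolutions — lengths become `(len + 1) // 2`, the attention mask and padding mask are subsampled with `[::stride]`, and the feature tensor is padded or truncated to the subsampled mask length so that `F.pad` is not needed at export time. A NumPy sketch of that bookkeeping, decoupled from the conv layers:

```python
import numpy as np

def reduce_time_meta(xs_lens, mask_pad):
    """Stride-2 downsampling of lengths and the (B, 1, T) padding mask."""
    new_lens = (xs_lens + 1) // 2          # ceil division, as in the code above
    new_mask_pad = mask_pad[:, :, ::2]
    return new_lens, new_mask_pad

xs_lens = np.array([7, 4])
mask_pad = np.arange(7)[None, None, :] < xs_lens[:, None, None]   # (2, 1, 7)
new_lens, new_mask = reduce_time_meta(xs_lens, mask_pad)
print(new_lens)        # [4 2]
print(new_mask.shape)  # (2, 1, 4)
```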
+ xs = self.dw_conv(xs.transpose([0, 3, 2, 1])) + xs = self.pw_conv(xs).transpose([0, 3, 2, 1]).squeeze(1) + tmp_length = xs.shape[1] + xs_lens = (xs_lens + 1) // 2 + padding2 = max(0, (xs_lens.max() - tmp_length).item()) + batch_size, hidden = xs.shape[0], xs.shape[-1] + dummy_pad = paddle.zeros( + [batch_size, padding2, hidden], dtype=paddle.float32) + xs = paddle.concat([xs, dummy_pad], axis=1) + mask = mask[:, ::2, ::2] + mask_pad = mask_pad[:, :, ::2] + return xs, xs_lens, mask, mask_pad + + +class TimeReductionLayerStream(nn.Layer): + """ + Squeezeformer Time Reduction procedure. + Downsamples the audio by `stride` in the time dimension. + Args: + channel (int): input dimension of + MultiheadAttentionMechanism and PositionwiseFeedForward + out_dim (int): Output dimension of the module. + kernel_size (int): Conv kernel size for + depthwise convolution in convolution module + stride (int): Downsampling factor in time dimension. + """ + + def __init__(self, + channel: int, + out_dim: int, + kernel_size: int=1, + stride: int=2): + super(TimeReductionLayerStream, self).__init__() + + self.channel = channel + self.out_dim = out_dim + self.kernel_size = kernel_size + self.stride = stride + + self.dw_conv = Conv1D( + in_channels=channel, + out_channels=channel, + kernel_size=kernel_size, + stride=stride, + padding=0, + groups=channel) + + self.pw_conv = Conv1D( + in_channels=channel, + out_channels=out_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1) + self.init_weights() + + def init_weights(self): + dw_max = self.kernel_size**-0.5 + pw_max = self.channel**-0.5 + self.dw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.dw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.pw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + self.pw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + + def forward( + self, + xs, + xs_lens: paddle.Tensor, + mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool)): + xs = xs.transpose([0, 2, 1]) # [B, C, T] + xs = masked_fill(xs, mask_pad.equal(0), 0.0) + + xs = self.dw_conv(xs) + xs = self.pw_conv(xs) + + xs = xs.transpose([0, 2, 1]) # [B, T, C] + + B, T, D = xs.shape + mask = mask[:, ::self.stride, ::self.stride] + mask_pad = mask_pad[:, :, ::self.stride] + L = mask_pad.shape[-1] + # For JIT exporting, we remove F.pad operator. + if L - T < 0: + xs = xs[:, :L - T, :] + else: + dummy_pad = paddle.zeros([B, L - T, D], dtype=paddle.float32) + xs = paddle.concat([xs, dummy_pad], axis=1) + + xs_lens = (xs_lens + 1) // 2 + return xs, xs_lens, mask, mask_pad diff --git a/paddlespeech/s2t/training/scheduler.py b/paddlespeech/s2t/training/scheduler.py index 53c756ce3..a5e7a08f1 100644 --- a/paddlespeech/s2t/training/scheduler.py +++ b/paddlespeech/s2t/training/scheduler.py @@ -220,7 +220,6 @@ class NewBobScheduler(LRScheduler): def load(self, data): """Loads the needed information.""" - data = paddle.load(data) self.last_epoch = data["current_epoch_index"] self.hyperparam_value = data["hyperparam_value"] self.metric_values = data["metric_values"] diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py index fdd8c0292..d7e7c6ca2 100644 --- a/paddlespeech/s2t/utils/utility.py +++ b/paddlespeech/s2t/utils/utility.py @@ -130,8 +130,11 @@ def get_subsample(config): Returns: int: subsample rate. 
""" - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] + if config['encoder'] == 'squeezeformer': + return 4 + else: + input_layer = config["encoder_conf"]["input_layer"] + assert input_layer in ["conv2d", "conv2d6", "conv2d8"] if input_layer == "conv2d": return 4 elif input_layer == "conv2d6": diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index c95d908dc..9ae791b48 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -414,6 +414,129 @@ def fastspeech2_multi_spk_batch_fn(examples): return batch +def diffsinger_single_spk_batch_fn(examples): + # fields = ["text", "note", "note_dur", "is_slur", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + note = [np.array(item["note"], dtype=np.int64) for item in examples] + note_dur = [np.array(item["note_dur"], dtype=np.float32) for item in examples] + is_slur = [np.array(item["is_slur"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] + energy = [np.array(item["energy"], dtype=np.float32) for item in examples] + durations = [ + np.array(item["durations"], dtype=np.int64) for item in examples + ] + + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + note = batch_sequences(note) + note_dur = batch_sequences(note_dur) + is_slur = batch_sequences(is_slur) + pitch = batch_sequences(pitch) + speech = batch_sequences(speech) + durations = batch_sequences(durations) + energy = batch_sequences(energy) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + note = paddle.to_tensor(note) + note_dur = paddle.to_tensor(note_dur) + is_slur = paddle.to_tensor(is_slur) + pitch = paddle.to_tensor(pitch) + speech = paddle.to_tensor(speech) + durations = paddle.to_tensor(durations) + energy = paddle.to_tensor(energy) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "note": note, + "note_dur": note_dur, + "is_slur": is_slur, + "text_lengths": text_lengths, + "durations": durations, + "speech": speech, + "speech_lengths": speech_lengths, + "pitch": pitch, + "energy": energy + } + return batch + + +def diffsinger_multi_spk_batch_fn(examples): + # fields = ["text", "note", "note_dur", "is_slur", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + note = [np.array(item["note"], dtype=np.int64) for item in examples] + note_dur = [np.array(item["note_dur"], dtype=np.float32) for item in examples] + is_slur = [np.array(item["is_slur"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] + energy = [np.array(item["energy"], dtype=np.float32) for item in examples] + durations = [ + np.array(item["durations"], dtype=np.int64) for item in examples + ] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + 
np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + note = batch_sequences(note) + note_dur = batch_sequences(note_dur) + is_slur = batch_sequences(is_slur) + pitch = batch_sequences(pitch) + speech = batch_sequences(speech) + durations = batch_sequences(durations) + energy = batch_sequences(energy) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + note = paddle.to_tensor(note) + note_dur = paddle.to_tensor(note_dur) + is_slur = paddle.to_tensor(is_slur) + pitch = paddle.to_tensor(pitch) + speech = paddle.to_tensor(speech) + durations = paddle.to_tensor(durations) + energy = paddle.to_tensor(energy) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "note": note, + "note_dur": note_dur, + "is_slur": is_slur, + "text_lengths": text_lengths, + "durations": durations, + "speech": speech, + "speech_lengths": speech_lengths, + "pitch": pitch, + "energy": energy + } + # spk_emb has a higher priority than spk_id + if "spk_emb" in examples[0]: + spk_emb = [ + np.array(item["spk_emb"], dtype=np.float32) for item in examples + ] + spk_emb = batch_sequences(spk_emb) + spk_emb = paddle.to_tensor(spk_emb) + batch["spk_emb"] = spk_emb + elif "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id + return batch + + def transformer_single_spk_batch_fn(examples): # fields = ["text", "text_lengths", "speech", "speech_lengths"] text = [np.array(item["text"], dtype=np.int64) for item in examples] diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py index 5ec97b810..ea273e245 100644 --- a/paddlespeech/t2s/datasets/get_feats.py +++ b/paddlespeech/t2s/datasets/get_feats.py @@ -12,17 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
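Editor's note: the DiffSinger batch functions above collate per-utterance fields (`text`, `note`, `note_dur`, `is_slur`, `pitch`, ...) by padding each list of variable-length arrays to the longest item before converting to tensors, with `spk_emb` taking priority over `spk_id` when both exist. A minimal NumPy stand-in for that padding step (the real helper is `batch_sequences`; this only sketches the idea):

```python
import numpy as np

def pad_batch(seqs, pad_value=0):
    """Pad a list of 1-D arrays to the longest one -> (B, T_max) array."""
    max_len = max(len(s) for s in seqs)
    out = np.full((len(seqs), max_len), pad_value, dtype=seqs[0].dtype)
    for i, s in enumerate(seqs):
        out[i, :len(s)] = s
    return out

notes = [np.array([60, 62, 64]), np.array([55, 57])]
print(pad_batch(notes))
# [[60 62 64]
#  [55 57  0]]
```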
# Modified from espnet(https://github.com/espnet/espnet) +from typing import List +from typing import Optional +from typing import Union + import librosa import numpy as np import pyworld from scipy.interpolate import interp1d - -from typing import Optional -from typing import Union from typing_extensions import Literal - class LogMelFBank(): def __init__(self, sr: int=24000, @@ -79,7 +79,7 @@ class LogMelFBank(): def _spectrogram(self, wav: np.ndarray): D = self._stft(wav) - return np.abs(D) ** self.power + return np.abs(D)**self.power def _mel_spectrogram(self, wav: np.ndarray): S = self._spectrogram(wav) @@ -117,7 +117,6 @@ class Pitch(): if (f0 == 0).all(): print("All frames seems to be unvoiced, this utt will be removed.") return f0 - # padding start and end of f0 sequence start_f0 = f0[f0 != 0][0] end_f0 = f0[f0 != 0][-1] @@ -179,6 +178,8 @@ class Pitch(): f0 = self._calculate_f0(wav, use_continuous_f0, use_log_f0) if use_token_averaged_f0 and duration is not None: f0 = self._average_by_duration(f0, duration) + else: + f0 = np.expand_dims(np.array(f0), 0).T return f0 @@ -237,6 +238,8 @@ class Energy(): energy = self._calculate_energy(wav) if use_token_averaged_energy and duration is not None: energy = self._average_by_duration(energy, duration) + else: + energy = np.expand_dims(np.array(energy), 0).T return energy diff --git a/paddlespeech/t2s/datasets/preprocess_utils.py b/paddlespeech/t2s/datasets/preprocess_utils.py index 445b69bda..bf813b22a 100644 --- a/paddlespeech/t2s/datasets/preprocess_utils.py +++ b/paddlespeech/t2s/datasets/preprocess_utils.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import re +from typing import List + +import librosa +import numpy as np # speaker|utt_id|phn dur phn dur ... @@ -41,6 +45,90 @@ def get_phn_dur(file_name): return sentence, speaker_set +def note2midi(notes: List[str]) -> List[str]: + """Covert note string to note id, for example: ["C1"] -> [24] + + Args: + notes (List[str]): the list of note string + + Returns: + List[str]: the list of note id + """ + midis = [] + for note in notes: + if note == 'rest': + midi = 0 + else: + midi = librosa.note_to_midi(note.split("/")[0]) + midis.append(midi) + + return midis + + +def time2frame( + times: List[float], + sample_rate: int=24000, + n_shift: int=128, ) -> List[int]: + """Convert the phoneme duration of time(s) into frames + + Args: + times (List[float]): phoneme duration of time(s) + sample_rate (int, optional): sample rate. Defaults to 24000. + n_shift (int, optional): frame shift. Defaults to 128. 
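Editor's note: `note2midi` above maps note names such as "C1" to MIDI numbers via `librosa.note_to_midi`, keeping only the first alternative of slash-separated spellings and mapping "rest" to 0. The mapping itself is just MIDI = 12 * (octave + 1) + pitch class; a hedged, librosa-free sketch for plain note names (single sharps/flats only):

```python
# Pitch classes of the natural notes; '#' raises and 'b' lowers by one semitone.
_PITCH = {"C": 0, "D": 2, "E": 4, "F": 5, "G": 7, "A": 9, "B": 11}

def note_to_midi_simple(note: str) -> int:
    if note == "rest":
        return 0
    note = note.split("/")[0]              # e.g. "A#4/Bb4" -> "A#4"
    name, rest = note[0], note[1:]
    acc = rest.count("#") - rest.count("b")
    octave = int(rest.lstrip("#b"))
    return 12 * (octave + 1) + _PITCH[name] + acc

print(note_to_midi_simple("C1"))         # 24, same as librosa.note_to_midi("C1")
print(note_to_midi_simple("A#4/Bb4"))    # 70
```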
+ + Returns: + List[int]: phoneme duration of frame + """ + end = 0.0 + ends = [] + for t in times: + end += t + ends.append(end) + frame_pos = librosa.time_to_frames(ends, sr=sample_rate, hop_length=n_shift) + durations = np.diff(frame_pos, prepend=0) + return durations + + +def get_sentences_svs( + file_name, + dataset: str='opencpop', + sample_rate: int=24000, + n_shift: int=128, ): + ''' + read label file + Args: + file_name (str or Path): path of gen_duration_from_textgrid.py's result + dataset (str): dataset name + Returns: + Dict: the information of sentence, include [phone id (int)], [the frame of phone (int)], [note id (int)], [note duration (float)], [is slur (int)], text(str), speaker name (str) + tuple: speaker name + ''' + f = open(file_name, 'r') + sentence = {} + speaker_set = set() + if dataset == 'opencpop': + speaker_set.add("opencpop") + for line in f: + line_list = line.strip().split('|') + utt = line_list[0] + text = line_list[1] + ph = line_list[2].split() + midi = note2midi(line_list[3].split()) + midi_dur = line_list[4].split() + ph_dur = time2frame([float(t) for t in line_list[5].split()], sample_rate=sample_rate, n_shift=n_shift) + is_slur = line_list[6].split() + assert len(ph) == len(midi) == len(midi_dur) == len(is_slur) + sentence[utt] = (ph, [int(i) for i in ph_dur], + [int(i) for i in midi], + [float(i) for i in midi_dur], + [int(i) for i in is_slur], text, "opencpop") + else: + print("dataset should in {opencpop} now!") + + f.close() + return sentence, speaker_set + + def merge_silence(sentence): ''' merge silences @@ -88,6 +176,9 @@ def get_input_token(sentence, output_path, dataset="baker"): phn_token = ["", ""] + phn_token if dataset in {"baker", "aishell3"}: phn_token += [",", "。", "?", "!"] + # svs dataset + elif dataset in {"opencpop"}: + pass else: phn_token += [",", ".", "?", "!"] phn_token += [""] diff --git a/paddlespeech/t2s/exps/PTQ_static.py b/paddlespeech/t2s/exps/PTQ_static.py index 16b3ae983..a95786450 100644 --- a/paddlespeech/t2s/exps/PTQ_static.py +++ b/paddlespeech/t2s/exps/PTQ_static.py @@ -42,6 +42,8 @@ def parse_args(): 'hifigan_aishell3', 'hifigan_ljspeech', 'hifigan_vctk', + 'pwgan_opencpop', + 'hifigan_opencpop', ], help='Choose model type of tts task.') diff --git a/paddlespeech/t2s/exps/diffsinger/__init__.py b/paddlespeech/t2s/exps/diffsinger/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/paddlespeech/t2s/exps/diffsinger/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py b/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py new file mode 100644 index 000000000..519808f2a --- /dev/null +++ b/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py @@ -0,0 +1,240 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
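Editor's note: `time2frame` above turns per-phone durations in seconds into per-phone durations in frames by accumulating end times, converting them to frame indices at the given hop size, and differencing. A NumPy sketch of the same computation, assuming the floor rounding that librosa's `time_to_frames` uses by default:

```python
import numpy as np

def durations_to_frames(times, sample_rate=24000, n_shift=128):
    """Per-phone durations in seconds -> per-phone durations in frames."""
    ends = np.cumsum(np.asarray(times, dtype=np.float64))   # cumulative end times
    frame_pos = np.floor(ends * sample_rate / n_shift).astype(np.int64)
    return np.diff(frame_pos, prepend=0)

print(durations_to_frames([0.12, 0.30, 0.08]))
# [22 56 15] -- sums to 93 frames, i.e. floor(0.5 * 24000 / 128)
```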
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# generate mels using durations.txt +# for mb melgan finetune +import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import yaml +from tqdm import tqdm +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs +from paddlespeech.t2s.models.diffsinger import DiffSinger +from paddlespeech.t2s.models.diffsinger import DiffSingerInference +from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.utils import str2bool + + +def evaluate(args, diffsinger_config): + rootdir = Path(args.rootdir).expanduser() + assert rootdir.is_dir() + + # construct dataset for evaluation + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + phone_dict = {} + for phn, id in phn_id: + phone_dict[phn] = int(id) + + if args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id_list = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id_list) + else: + spk_num = None + + with open(args.diffsinger_stretch, "r") as f: + spec_min = np.load(args.diffsinger_stretch)[0] + spec_max = np.load(args.diffsinger_stretch)[1] + spec_min = paddle.to_tensor(spec_min) + spec_max = paddle.to_tensor(spec_max) + print("min and max spec done!") + + odim = diffsinger_config.n_mels + diffsinger_config["model"]["fastspeech2_params"]["spk_num"] = spk_num + model = DiffSinger( + spec_min=spec_min, + spec_max=spec_max, + idim=vocab_size, + odim=odim, + **diffsinger_config["model"], ) + + model.set_state_dict(paddle.load(args.diffsinger_checkpoint)["main_params"]) + model.eval() + + stat = np.load(args.diffsinger_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + diffsinger_normalizer = ZScore(mu, std) + + diffsinger_inference = DiffSingerInference(diffsinger_normalizer, model) + diffsinger_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences, speaker_set = get_sentences_svs( + args.dur_file, + dataset=args.dataset, + sample_rate=diffsinger_config.fs, + n_shift=diffsinger_config.n_shift, ) + + if args.dataset == "opencpop": + wavdir = rootdir / "wavs" + # split data into 3 sections + train_file = rootdir / "train.txt" + train_wav_files = [] + with open(train_file, "r") as f_train: + for line in f_train.readlines(): + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + train_wav_files.append(wav_path) + + test_file = rootdir / "test.txt" + dev_wav_files = [] + test_wav_files = [] + num_dev = 106 + count = 0 + with open(test_file, "r") as f_test: + for line in f_test.readlines(): + count += 1 + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + if count > num_dev: + test_wav_files.append(wav_path) + else: + dev_wav_files.append(wav_path) + else: + print("dataset should in {opencpop} now!") + + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + 
os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] + + for i, utt_id in enumerate(tqdm(sentences)): + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + note = sentences[utt_id][2] + note_dur = sentences[utt_id][3] + is_slur = sentences[utt_id][4] + speaker = sentences[utt_id][-1] + + phone_ids = [phone_dict[phn] for phn in phones] + phone_ids = paddle.to_tensor(np.array(phone_ids)) + + if args.speaker_dict: + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = paddle.to_tensor(speaker_id) + else: + speaker_id = None + + durations = paddle.to_tensor(np.array(durations)) + note = paddle.to_tensor(np.array(note)) + note_dur = paddle.to_tensor(np.array(note_dur)) + is_slur = paddle.to_tensor(np.array(is_slur)) + # 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复 + # split data into 3 sections + + wav_path = utt_id + ".wav" + + if wav_path in train_wav_files: + sub_output_dir = output_dir / ("train/raw") + elif wav_path in dev_wav_files: + sub_output_dir = output_dir / ("dev/raw") + elif wav_path in test_wav_files: + sub_output_dir = output_dir / ("test/raw") + + sub_output_dir.mkdir(parents=True, exist_ok=True) + + with paddle.no_grad(): + mel = diffsinger_inference( + text=phone_ids, + note=note, + note_dur=note_dur, + is_slur=is_slur, + get_mel_fs2=False) + np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Generate mel with diffsinger.") + parser.add_argument( + "--dataset", + default="opencpop", + type=str, + help="name of dataset, should in {opencpop} now") + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + parser.add_argument( + "--diffsinger-config", type=str, help="diffsinger config file.") + parser.add_argument( + "--diffsinger-checkpoint", + type=str, + help="diffsinger checkpoint to load.") + parser.add_argument( + "--diffsinger-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training diffsinger." + ) + parser.add_argument( + "--diffsinger-stretch", + type=str, + help="min and max mel used to stretch before training diffusion.") + + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.diffsinger_config) as f: + diffsinger_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(diffsinger_config) + + evaluate(args, diffsinger_config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/diffsinger/get_minmax.py b/paddlespeech/t2s/exps/diffsinger/get_minmax.py new file mode 100644 index 000000000..5457f1e24 --- /dev/null +++ b/paddlespeech/t2s/exps/diffsinger/get_minmax.py @@ -0,0 +1,82 @@ +# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging + +import jsonlines +import numpy as np +from tqdm import tqdm + +from paddlespeech.t2s.datasets.data_table import DataTable + + +def get_minmax(spec, min_spec, max_spec): + # spec: [T, 80] + for i in range(spec.shape[1]): + min_value = np.min(spec[:, i]) + max_value = np.max(spec[:, i]) + min_spec[i] = min(min_value, min_spec[i]) + max_spec[i] = max(max_value, max_spec[i]) + + return min_spec, max_spec + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)." + ) + parser.add_argument( + "--metadata", + type=str, + required=True, + help="directory including feature files to be normalized. " + "you need to specify either *-scp or rootdir.") + + parser.add_argument( + "--speech-stretchs", + type=str, + required=True, + help="min max spec file. only computer on train data") + + args = parser.parse_args() + + # get dataset + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) + dataset = DataTable( + metadata, converters={ + "speech": np.load, + }) + logging.info(f"The number of files = {len(dataset)}.") + + n_mel = 80 + min_spec = 100.0 * np.ones(shape=(n_mel), dtype=np.float32) + max_spec = -100.0 * np.ones(shape=(n_mel), dtype=np.float32) + + for item in tqdm(dataset): + spec = item['speech'] + min_spec, max_spec = get_minmax(spec, min_spec, max_spec) + + # Using min_spec=-6.0 training effect is better so far + min_spec = -6.0 * np.ones(shape=(n_mel), dtype=np.float32) + min_max_spec = np.stack([min_spec, max_spec], axis=0) + np.save( + str(args.speech_stretchs), + min_max_spec.astype(np.float32), + allow_pickle=False) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/diffsinger/normalize.py b/paddlespeech/t2s/exps/diffsinger/normalize.py new file mode 100644 index 000000000..d3e611621 --- /dev/null +++ b/paddlespeech/t2s/exps/diffsinger/normalize.py @@ -0,0 +1,189 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
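For context on the `speech_stretchs` file written by `get_minmax.py` above and read back by `gen_gta_mel.py` and `train.py` in this patch: row 0 holds the per-bin minimum and row 1 the per-bin maximum of the training mels. A minimal sketch of that layout, with random stand-ins for real `[T, 80]` log-mel features and an illustrative file name:

```python
# Minimal sketch: compute per-bin min/max over a set of mels, save them stacked as
# [2, n_mel], and read the two rows back the way the training scripts do.
import numpy as np
import paddle

n_mel = 80
mels = [
    np.random.randn(137, n_mel).astype(np.float32),  # toy mel, shape [T, 80]
    np.random.randn(210, n_mel).astype(np.float32),
]

min_spec = np.full((n_mel, ), 100.0, dtype=np.float32)
max_spec = np.full((n_mel, ), -100.0, dtype=np.float32)
for spec in mels:
    min_spec = np.minimum(min_spec, spec.min(axis=0))
    max_spec = np.maximum(max_spec, spec.max(axis=0))

np.save("speech_stretchs.npy", np.stack([min_spec, max_spec], axis=0))

# consumers load the two rows and wrap them as tensors before building DiffSinger
stretch = np.load("speech_stretchs.npy")
spec_min = paddle.to_tensor(stretch[0])
spec_max = paddle.to_tensor(stretch[1])
```

Note that the script itself additionally overrides the computed minimum with a constant `-6.0` before saving (per its own comment, this trains better); the sketch omits that step.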
+"""Normalize feature files and dump them.""" +import argparse +import logging +from operator import itemgetter +from pathlib import Path + +import jsonlines +import numpy as np +from sklearn.preprocessing import StandardScaler +from tqdm import tqdm + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.utils import str2bool + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)." + ) + parser.add_argument( + "--metadata", + type=str, + required=True, + help="directory including feature files to be normalized. " + "you need to specify either *-scp or rootdir.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump normalized feature files.") + parser.add_argument( + "--speech-stats", + type=str, + required=True, + help="speech statistics file.") + parser.add_argument( + "--pitch-stats", type=str, required=True, help="pitch statistics file.") + parser.add_argument( + "--energy-stats", + type=str, + required=True, + help="energy statistics file.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + "--norm-feats", + type=str2bool, + default=False, + help="whether to norm features") + + args = parser.parse_args() + + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + + # get dataset + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) + dataset = DataTable( + metadata, + converters={ + "speech": np.load, + "pitch": np.load, + "energy": np.load, + }) + logging.info(f"The number of files = {len(dataset)}.") + + # restore scaler + speech_scaler = StandardScaler() + if args.norm_feats: + speech_scaler.mean_ = np.load(args.speech_stats)[0] + speech_scaler.scale_ = np.load(args.speech_stats)[1] + else: + speech_scaler.mean_ = np.zeros( + np.load(args.speech_stats)[0].shape, dtype="float32") + speech_scaler.scale_ = np.ones( + np.load(args.speech_stats)[1].shape, dtype="float32") + speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0] + + pitch_scaler = StandardScaler() + if args.norm_feats: + pitch_scaler.mean_ = np.load(args.pitch_stats)[0] + pitch_scaler.scale_ = np.load(args.pitch_stats)[1] + else: + pitch_scaler.mean_ = np.zeros( + np.load(args.pitch_stats)[0].shape, dtype="float32") + pitch_scaler.scale_ = np.ones( + np.load(args.pitch_stats)[1].shape, dtype="float32") + pitch_scaler.n_features_in_ = pitch_scaler.mean_.shape[0] + + energy_scaler = StandardScaler() + if args.norm_feats: + energy_scaler.mean_ = np.load(args.energy_stats)[0] + energy_scaler.scale_ = np.load(args.energy_stats)[1] + else: + energy_scaler.mean_ = np.zeros( + np.load(args.energy_stats)[0].shape, dtype="float32") + energy_scaler.scale_ = np.ones( + np.load(args.energy_stats)[1].shape, dtype="float32") + energy_scaler.n_features_in_ = energy_scaler.mean_.shape[0] + + vocab_phones = {} + with open(args.phones_dict, 'rt') as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + vocab_phones[phn] = int(id) + + vocab_speaker = {} + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + for spk, id in spk_id: + vocab_speaker[spk] = int(id) + + # process each file 
+ output_metadata = [] + + for item in tqdm(dataset): + utt_id = item['utt_id'] + speech = item['speech'] + pitch = item['pitch'] + energy = item['energy'] + # normalize + speech = speech_scaler.transform(speech) + speech_dir = dumpdir / "data_speech" + speech_dir.mkdir(parents=True, exist_ok=True) + speech_path = speech_dir / f"{utt_id}_speech.npy" + np.save(speech_path, speech.astype(np.float32), allow_pickle=False) + + pitch = pitch_scaler.transform(pitch) + pitch_dir = dumpdir / "data_pitch" + pitch_dir.mkdir(parents=True, exist_ok=True) + pitch_path = pitch_dir / f"{utt_id}_pitch.npy" + np.save(pitch_path, pitch.astype(np.float32), allow_pickle=False) + + energy = energy_scaler.transform(energy) + energy_dir = dumpdir / "data_energy" + energy_dir.mkdir(parents=True, exist_ok=True) + energy_path = energy_dir / f"{utt_id}_energy.npy" + np.save(energy_path, energy.astype(np.float32), allow_pickle=False) + phone_ids = [vocab_phones[p] for p in item['phones']] + spk_id = vocab_speaker[item["speaker"]] + record = { + "utt_id": item['utt_id'], + "spk_id": spk_id, + "text": phone_ids, + "text_lengths": item['text_lengths'], + "speech_lengths": item['speech_lengths'], + "durations": item['durations'], + "speech": str(speech_path), + "pitch": str(pitch_path), + "energy": str(energy_path), + "note": item['note'], + "note_dur": item['note_dur'], + "is_slur": item['is_slur'], + } + # add spk_emb for voice cloning + if "spk_emb" in item: + record["spk_emb"] = str(item["spk_emb"]) + + output_metadata.append(record) + output_metadata.sort(key=itemgetter('utt_id')) + output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" + with jsonlines.open(output_metadata_path, 'w') as writer: + for item in output_metadata: + writer.write(item) + logging.info(f"metadata dumped into {output_metadata_path}") + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/diffsinger/preprocess.py b/paddlespeech/t2s/exps/diffsinger/preprocess.py new file mode 100644 index 000000000..be526eff1 --- /dev/null +++ b/paddlespeech/t2s/exps/diffsinger/preprocess.py @@ -0,0 +1,376 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
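`normalize.py` above restores three `StandardScaler` instances (speech, pitch, energy) from dumped statistics instead of refitting them, and when `--norm-feats` is false it substitutes zero means and unit scales so the transform becomes a no-op. A minimal sketch of that convention, with a stand-in stats array (row 0 = mean, row 1 = scale) and a toy feature:

```python
# Minimal sketch, mirroring normalize.py above: restore a StandardScaler from
# dumped statistics and z-score a feature with it, without refitting.
import numpy as np
from sklearn.preprocessing import StandardScaler

# stand-in for np.load(args.speech_stats): row 0 = mean, row 1 = scale
stats = np.stack([np.zeros(80, dtype=np.float32), np.ones(80, dtype=np.float32)])

scaler = StandardScaler()
scaler.mean_ = stats[0]
scaler.scale_ = stats[1]
scaler.n_features_in_ = scaler.mean_.shape[0]

mel = np.random.randn(100, 80).astype(np.float32)  # toy [T, n_mels] feature
mel_norm = scaler.transform(mel)                    # per-bin z-score, dumped as *_speech.npy
```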
+import argparse +import os +from concurrent.futures import ThreadPoolExecutor +from operator import itemgetter +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines +import librosa +import numpy as np +import tqdm +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.get_feats import Energy +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import Pitch +from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length +from paddlespeech.t2s.datasets.preprocess_utils import get_input_token +from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs +from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map +from paddlespeech.t2s.utils import str2bool + +ALL_INITIALS = [ + 'zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', + 'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w' +] +ALL_FINALS = [ + 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', + 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', + 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', + 'vn' +] + + +def process_sentence( + config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None, + pitch_extractor=None, + energy_extractor=None, + cut_sil: bool=True, + spk_emb_dir: Path=None, ): + utt_id = fp.stem + record = None + if utt_id in sentences: + # reading, resampling may occur + wav, _ = librosa.load(str(fp), sr=config.fs) + if len(wav.shape) != 1: + return record + max_value = np.abs(wav).max() + if max_value > 1.0: + wav = wav / max_value + assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(wav).max( + ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + note = sentences[utt_id][2] + note_dur = sentences[utt_id][3] + is_slur = sentences[utt_id][4] + speaker = sentences[utt_id][-1] + + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(wav) + # change duration according to mel_length + compare_duration_and_mel_length(sentences, utt_id, logmel) + # utt_id may be popped in compare_duration_and_mel_length + if utt_id not in sentences: + return None + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + num_frames = logmel.shape[0] + + assert sum( + durations + ) == num_frames, "the sum of durations doesn't equal to the num of mel frames. 
" + speech_dir = output_dir / "data_speech" + speech_dir.mkdir(parents=True, exist_ok=True) + speech_path = speech_dir / (utt_id + "_speech.npy") + np.save(speech_path, logmel) + # extract pitch and energy + pitch = pitch_extractor.get_pitch(wav) + assert pitch.shape[0] == num_frames + pitch_dir = output_dir / "data_pitch" + pitch_dir.mkdir(parents=True, exist_ok=True) + pitch_path = pitch_dir / (utt_id + "_pitch.npy") + np.save(pitch_path, pitch) + energy = energy_extractor.get_energy(wav) + assert energy.shape[0] == num_frames + energy_dir = output_dir / "data_energy" + energy_dir.mkdir(parents=True, exist_ok=True) + energy_path = energy_dir / (utt_id + "_energy.npy") + np.save(energy_path, energy) + + record = { + "utt_id": utt_id, + "phones": phones, + "text_lengths": len(phones), + "speech_lengths": num_frames, + "durations": durations, + "speech": str(speech_path), + "pitch": str(pitch_path), + "energy": str(energy_path), + "speaker": speaker, + "note": note, + "note_dur": note_dur, + "is_slur": is_slur, + } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None + return record + + +def process_sentences( + config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + pitch_extractor=None, + energy_extractor=None, + nprocs: int=1, + cut_sil: bool=True, + spk_emb_dir: Path=None, + write_metadata_method: str='w', ): + if nprocs == 1: + results = [] + for fp in tqdm.tqdm(fps, total=len(fps)): + record = process_sentence( + config=config, + fp=fp, + sentences=sentences, + output_dir=output_dir, + mel_extractor=mel_extractor, + pitch_extractor=pitch_extractor, + energy_extractor=energy_extractor, + cut_sil=cut_sil, + spk_emb_dir=spk_emb_dir, ) + if record: + results.append(record) + else: + with ThreadPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp in fps: + future = pool.submit( + process_sentence, + config, + fp, + sentences, + output_dir, + mel_extractor, + pitch_extractor, + energy_extractor, + cut_sil, + spk_emb_dir, ) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + record = ft.result() + if record: + results.append(record) + + results.sort(key=itemgetter("utt_id")) + with jsonlines.open(output_dir / "metadata.jsonl", + write_metadata_method) as writer: + for item in results: + writer.write(item) + print("Done") + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features.") + + parser.add_argument( + "--dataset", + default="opencpop", + type=str, + help="name of dataset, should in {opencpop} now") + + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + + parser.add_argument( + "--label-file", default=None, type=str, help="path to label file.") + + parser.add_argument("--config", type=str, help="diffsinger config file.") + + parser.add_argument( + "--num-cpu", type=int, default=1, help="number of process.") + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding 
files.") + + parser.add_argument( + "--write_metadata_method", + default="w", + type=str, + choices=["w", "a"], + help="How the metadata.jsonl file is written.") + args = parser.parse_args() + + rootdir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + label_file = Path(args.label_file).expanduser() + + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + + assert rootdir.is_dir() + assert label_file.is_file() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + sentences, speaker_set = get_sentences_svs( + label_file, + dataset=args.dataset, + sample_rate=config.fs, + n_shift=config.n_shift, ) + + phone_id_map_path = dumpdir / "phone_id_map.txt" + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + get_input_token(sentences, phone_id_map_path, args.dataset) + get_spk_id_map(speaker_set, speaker_id_map_path) + + if args.dataset == "opencpop": + wavdir = rootdir / "wavs" + # split data into 3 sections + train_file = rootdir / "train.txt" + train_wav_files = [] + with open(train_file, "r") as f_train: + for line in f_train.readlines(): + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + train_wav_files.append(wav_path) + + test_file = rootdir / "test.txt" + dev_wav_files = [] + test_wav_files = [] + num_dev = 106 + count = 0 + with open(test_file, "r") as f_test: + for line in f_test.readlines(): + count += 1 + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + if count > num_dev: + test_wav_files.append(wav_path) + else: + dev_wav_files.append(wav_path) + + else: + print("dataset should in {opencpop} now!") + + train_dump_dir = dumpdir / "train" / "raw" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" / "raw" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" / "raw" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + # Extractor + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + pitch_extractor = Pitch( + sr=config.fs, + hop_length=config.n_shift, + f0min=config.f0min, + f0max=config.f0max) + energy_extractor = Energy( + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window) + + # process for the 3 sections + if train_wav_files: + process_sentences( + config=config, + fps=train_wav_files, + sentences=sentences, + output_dir=train_dump_dir, + mel_extractor=mel_extractor, + pitch_extractor=pitch_extractor, + energy_extractor=energy_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir, + write_metadata_method=args.write_metadata_method) + if dev_wav_files: + process_sentences( + config=config, + fps=dev_wav_files, + sentences=sentences, + output_dir=dev_dump_dir, + mel_extractor=mel_extractor, + pitch_extractor=pitch_extractor, + energy_extractor=energy_extractor, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir, + write_metadata_method=args.write_metadata_method) + if test_wav_files: + process_sentences( + config=config, + fps=test_wav_files, + sentences=sentences, + output_dir=test_dump_dir, + mel_extractor=mel_extractor, + pitch_extractor=pitch_extractor, + energy_extractor=energy_extractor, 
+ nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir, + write_metadata_method=args.write_metadata_method) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/diffsinger/train.py b/paddlespeech/t2s/exps/diffsinger/train.py new file mode 100644 index 000000000..e79104c4a --- /dev/null +++ b/paddlespeech/t2s/exps/diffsinger/train.py @@ -0,0 +1,257 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle import nn +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.optimizer import AdamW +from paddle.optimizer.lr import StepDecay +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.am_batch_fn import diffsinger_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import diffsinger_single_spk_batch_fn +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.diffsinger import DiffSinger +from paddlespeech.t2s.models.diffsinger import DiffSingerEvaluator +from paddlespeech.t2s.models.diffsinger import DiffSingerUpdater +from paddlespeech.t2s.models.diffsinger import DiffusionLoss +from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDILoss +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.optimizer import build_optimizers +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + fields = [ + "text", "text_lengths", "speech", "speech_lengths", "durations", + "pitch", "energy", "note", "note_dur", "is_slur" + ] + converters = {"speech": np.load, "pitch": np.load, "energy": np.load} + spk_num = None + if args.speaker_dict is not None: + print("multiple speaker diffsinger!") + collate_fn = diffsinger_multi_spk_batch_fn + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + fields += ["spk_id"] + else: + collate_fn = diffsinger_single_spk_batch_fn + print("single speaker diffsinger!") + + print("spk_num:", spk_num) + + # dataloader has been too verbose + 
logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=fields, + converters=converters, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=fields, + converters=converters, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=collate_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + shuffle=False, + drop_last=False, + batch_size=config.batch_size, + collate_fn=collate_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + with open(args.speech_stretchs, "r") as f: + spec_min = np.load(args.speech_stretchs)[0] + spec_max = np.load(args.speech_stretchs)[1] + spec_min = paddle.to_tensor(spec_min) + spec_max = paddle.to_tensor(spec_max) + print("min and max spec done!") + + odim = config.n_mels + config["model"]["fastspeech2_params"]["spk_num"] = spk_num + model = DiffSinger( + spec_min=spec_min, + spec_max=spec_max, + idim=vocab_size, + odim=odim, + **config["model"], ) + model_fs2 = model.fs2 + model_ds = model.diffusion + if world_size > 1: + model = DataParallel(model) + model_fs2 = model._layers.fs2 + model_ds = model._layers.diffusion + print("models done!") + + criterion_fs2 = FastSpeech2MIDILoss(**config["fs2_updater"]) + criterion_ds = DiffusionLoss(**config["ds_updater"]) + print("criterions done!") + + optimizer_fs2 = build_optimizers(model_fs2, **config["fs2_optimizer"]) + lr_schedule_ds = StepDecay(**config["ds_scheduler_params"]) + gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"]) + optimizer_ds = AdamW( + learning_rate=lr_schedule_ds, + grad_clip=gradient_clip_ds, + parameters=model_ds.parameters(), + **config["ds_optimizer_params"]) + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = DiffSingerUpdater( + model=model, + optimizers={ + "fs2": optimizer_fs2, + "ds": optimizer_ds, + }, + criterions={ + "fs2": criterion_fs2, + "ds": criterion_ds, + }, + dataloader=train_dataloader, + ds_train_start_steps=config.ds_train_start_steps, + output_dir=output_dir, + only_train_diffusion=config["only_train_diffusion"]) + + evaluator = DiffSingerEvaluator( + model=model, + criterions={ + "fs2": criterion_fs2, + "ds": criterion_ds, + }, + dataloader=dev_dataloader, + output_dir=output_dir, ) + + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir, ) + + if dist.get_rank() == 0: + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) + + print("Trainer Done!") + 
trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a DiffSinger model.") + parser.add_argument("--config", type=str, help="diffsinger config file.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--speaker-dict", + type=str, + default=None, + help="speaker id map file for multiple speaker model.") + parser.add_argument( + "--speech-stretchs", + type=str, + help="The min and max values of the mel spectrum.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/dygraph_to_static.py b/paddlespeech/t2s/exps/dygraph_to_static.py new file mode 100644 index 000000000..5e15ca4ca --- /dev/null +++ b/paddlespeech/t2s/exps/dygraph_to_static.py @@ -0,0 +1,170 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
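A reduced sketch of the optimizer wiring used in `train.py` above: the FastSpeech2MIDI sub-network and the diffusion denoiser are optimized separately, with a step-decayed learning rate and global-norm gradient clipping on the diffusion side. The layers and hyper-parameter values here are placeholders; in the script the two parameter groups are `model.fs2` and `model.diffusion`, the fs2 optimizer comes from `build_optimizers`, and the numbers come from the yaml config (`fs2_optimizer`, `ds_scheduler_params`, `ds_grad_norm`, `ds_optimizer_params`):

```python
# Minimal sketch with placeholder modules and hyper-parameters.
import paddle
from paddle import nn
from paddle.optimizer import AdamW
from paddle.optimizer.lr import StepDecay

model_fs2 = nn.Linear(256, 80)   # stand-in for the FastSpeech2MIDI sub-model
model_ds = nn.Linear(256, 80)    # stand-in for the diffusion denoiser

# fs2 side: a plain optimizer (AdamW here is only a placeholder choice)
optimizer_fs2 = AdamW(learning_rate=1e-3, parameters=model_fs2.parameters())

# diffusion side: step-decayed lr + global-norm gradient clipping
lr_schedule_ds = StepDecay(learning_rate=1e-3, step_size=50000, gamma=0.5)
gradient_clip_ds = nn.ClipGradByGlobalNorm(1.0)
optimizer_ds = AdamW(
    learning_rate=lr_schedule_ds,
    grad_clip=gradient_clip_ds,
    parameters=model_ds.parameters())
```

In the script itself, `DiffSingerUpdater` receives both optimizers together with `ds_train_start_steps`, which (as the name suggests) delays training of the diffusion branch until that iteration.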
+import argparse + +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.exps.syn_utils import am_to_static +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import voc_to_static + + +def am_dygraph_to_static(args): + with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + am_inference = get_am_inference( + am=args.am, + am_config=am_config, + am_ckpt=args.am_ckpt, + am_stat=args.am_stat, + phones_dict=args.phones_dict, + tones_dict=args.tones_dict, + speaker_dict=args.speaker_dict) + print("acoustic model done!") + + # dygraph to static + am_inference = am_to_static( + am_inference=am_inference, + am=args.am, + inference_dir=args.inference_dir, + speaker_dict=args.speaker_dict) + print("finish to convert dygraph acoustic model to static!") + + +def voc_dygraph_to_static(args): + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + voc_inference = get_voc_inference( + voc=args.voc, + voc_config=voc_config, + voc_ckpt=args.voc_ckpt, + voc_stat=args.voc_stat) + print("voc done!") + + # dygraph to static + voc_inference = voc_to_static( + voc_inference=voc_inference, + voc=args.voc, + inference_dir=args.inference_dir) + print("finish to convert dygraph vocoder model to static!") + + +def parse_args(): + # parse args and config + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + parser.add_argument( + '--type', + type=str, + required=True, + choices=["am", "voc"], + help='Choose the model type of dynamic to static, am or voc') + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', + 'speedyspeech_aishell3', + 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', + 'fastspeech2_vctk', + 'tacotron2_csmsc', + 'tacotron2_ljspeech', + 'fastspeech2_mix', + 'fastspeech2_canton', + 'fastspeech2_male-zh', + 'fastspeech2_male-en', + 'fastspeech2_male-mix', + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_config', type=str, default=None, help='Config of acoustic model.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." 
+ ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'style_melgan_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'wavernn_csmsc', + 'pwgan_male', + 'hifigan_male', + 'pwgan_opencpop', + 'hifigan_opencpop', + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', type=str, default=None, help='Config of voc.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + # other + parser.add_argument( + "--inference_dir", + type=str, + default=None, + help="dir to save inference models") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + if args.type == "am": + am_dygraph_to_static(args) + elif args.type == "voc": + voc_dygraph_to_static(args) + else: + print("type should be in ['am', 'voc'] !") + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 05c657682..a2629a900 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -29,6 +29,7 @@ from yacs.config import CfgNode from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs from paddlespeech.t2s.datasets.preprocess_utils import merge_silence from paddlespeech.t2s.utils import str2bool @@ -192,8 +193,15 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - sentences, speaker_set = get_phn_dur(dur_file) - merge_silence(sentences) + if args.dataset == "opencpop": + sentences, speaker_set = get_sentences_svs( + dur_file, + dataset=args.dataset, + sample_rate=config.fs, + n_shift=config.n_shift, ) + else: + sentences, speaker_set = get_phn_dur(dur_file) + merge_silence(sentences) # split data into 3 sections if args.dataset == "baker": @@ -240,6 +248,33 @@ def main(): test_wav_files += wav_files[-sub_num_dev:] else: train_wav_files += wav_files + elif args.dataset == "opencpop": + wavdir = rootdir / "wavs" + # split data into 3 sections + train_file = rootdir / "train.txt" + train_wav_files = [] + with open(train_file, "r") as f_train: + for line in f_train.readlines(): + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + train_wav_files.append(wav_path) + + test_file = rootdir / "test.txt" + dev_wav_files = [] + test_wav_files = [] + num_dev = 106 + count = 0 + with open(test_file, "r") as f_test: + for line in f_test.readlines(): + count += 1 + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + if count > num_dev: + test_wav_files.append(wav_path) + else: + dev_wav_files.append(wav_path) else: print("dataset should in {baker, ljspeech, vctk, aishell3} now!") diff --git a/paddlespeech/t2s/exps/sentences_sing.txt 
b/paddlespeech/t2s/exps/sentences_sing.txt new file mode 100644 index 000000000..7b9c6272d --- /dev/null +++ b/paddlespeech/t2s/exps/sentences_sing.txt @@ -0,0 +1,2 @@ +{"utt_id": "2093003457", "input_type": "word", "text": "小酒窝长睫毛AP是你最美的记号", "notes": "C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4", "note_durs": "0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340"} +{"utt_id": "2093003458", "input_type": "phoneme", "phones": "w o m ei t ian sh ui ui b u u zh ao AP x iang n ian n i d e w ei x iao iao AP" , "notes": "C#4/Db4 C#4/Db4 D#4/Eb4 D#4/Eb4 F4 F4 F#4/Gb4 F#4/Gb4 D#4/Eb4 D#4/Eb4 D#4/Eb4 A#3/Bb3 A#3/Bb3 A#3/Bb3 rest F#4/Gb4 F#4/Gb4 F4 F4 F#4/Gb4 F#4/Gb4 F4 F4 G#4/Ab4 G#4/Ab4 D#4/Eb4 D#4/Eb4 C#4/Db4 rest", "note_durs": "0.221750 0.221750 0.414460 0.414460 0.223160 0.223160 0.430900 0.430900 0.335990 0.269270 0.269270 0.289060 0.522690 0.522690 0.355060 0.397130 0.397130 0.247690 0.247690 0.406720 0.406720 0.246830 0.246830 0.307540 0.307540 0.429910 0.429910 0.519130 0.342300", "is_slurs": "0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0"} \ No newline at end of file diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 12b75615e..2b958b567 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -20,6 +20,7 @@ from typing import Dict from typing import List from typing import Optional +import jsonlines import numpy as np import onnxruntime as ort import paddle @@ -35,6 +36,7 @@ from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend from paddlespeech.t2s.frontend.mix_frontend import MixFrontend +from paddlespeech.t2s.frontend.sing_frontend import SingFrontend from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.utils.dynamic_import import dynamic_import @@ -56,6 +58,11 @@ model_alias = { "paddlespeech.t2s.models.tacotron2:Tacotron2", "tacotron2_inference": "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", + "diffsinger": + "paddlespeech.t2s.models.diffsinger:DiffSinger", + "diffsinger_inference": + "paddlespeech.t2s.models.diffsinger:DiffSingerInference", + # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -122,6 +129,19 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): return sentences +# input for svs +def get_sentences_svs(text_file: Optional[os.PathLike]): + # construct dataset for evaluation + sentences = [] + with jsonlines.open(text_file, 'r') as reader: + svs_inputs = list(reader) + for svs_input in svs_inputs: + utt_id = svs_input['utt_id'] + sentence = svs_input + sentences.append((utt_id, sentence)) + return sentences + + # am only def get_test_dataset(test_metadata: List[Dict[str, Any]], am: str, @@ -142,6 +162,8 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]], fields += ["spk_emb"] else: print("single speaker fastspeech2!") + elif am_name == 'diffsinger': + fields = ["utt_id", "text", "note", "note_dur", "is_slur"] elif am_name == 'speedyspeech': fields = ["utt_id", "phones", "tones"] elif am_name == 'tacotron2': @@ -261,6 +283,7 @@ def get_dev_dataloader(dev_metadata: List[Dict[str, Any]], def get_frontend(lang: 
str='zh', phones_dict: Optional[os.PathLike]=None, tones_dict: Optional[os.PathLike]=None, + pinyin_phone: Optional[os.PathLike]=None, use_rhy=False): if lang == 'zh': frontend = Frontend( @@ -274,18 +297,23 @@ def get_frontend(lang: str='zh', elif lang == 'mix': frontend = MixFrontend( phone_vocab_path=phones_dict, tone_vocab_path=tones_dict) + elif lang == 'sing': + frontend = SingFrontend( + pinyin_phone_path=pinyin_phone, phone_vocab_path=phones_dict) else: print("wrong lang!") return frontend -def run_frontend(frontend: object, - text: str, - merge_sentences: bool=False, - get_tone_ids: bool=False, - lang: str='zh', - to_tensor: bool=True, - add_blank: bool=False): +def run_frontend( + frontend: object, + text: str, + merge_sentences: bool=False, + get_tone_ids: bool=False, + lang: str='zh', + to_tensor: bool=True, + add_blank: bool=False, + svs_input: Dict[str, str]=None, ): outs = dict() if lang == 'zh': input_ids = {} @@ -319,21 +347,33 @@ def run_frontend(frontend: object, input_ids = frontend.get_input_ids( text, merge_sentences=merge_sentences, to_tensor=to_tensor) phone_ids = input_ids["phone_ids"] + elif lang == 'sing': + input_ids = frontend.get_input_ids( + svs_input=svs_input, to_tensor=to_tensor) + phone_ids = input_ids["phone_ids"] + note_ids = input_ids["note_ids"] + note_durs = input_ids["note_durs"] + is_slurs = input_ids["is_slurs"] + outs.update({'note_ids': note_ids}) + outs.update({'note_durs': note_durs}) + outs.update({'is_slurs': is_slurs}) else: - print("lang should in {'zh', 'en', 'mix', 'canton'}!") + print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!") outs.update({'phone_ids': phone_ids}) return outs # dygraph -def get_am_inference(am: str='fastspeech2_csmsc', - am_config: CfgNode=None, - am_ckpt: Optional[os.PathLike]=None, - am_stat: Optional[os.PathLike]=None, - phones_dict: Optional[os.PathLike]=None, - tones_dict: Optional[os.PathLike]=None, - speaker_dict: Optional[os.PathLike]=None, - return_am: bool=False): +def get_am_inference( + am: str='fastspeech2_csmsc', + am_config: CfgNode=None, + am_ckpt: Optional[os.PathLike]=None, + am_stat: Optional[os.PathLike]=None, + phones_dict: Optional[os.PathLike]=None, + tones_dict: Optional[os.PathLike]=None, + speaker_dict: Optional[os.PathLike]=None, + return_am: bool=False, + speech_stretchs: Optional[os.PathLike]=None, ): with open(phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) @@ -356,6 +396,19 @@ def get_am_inference(am: str='fastspeech2_csmsc', if am_name == 'fastspeech2': am = am_class( idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) + elif am_name == 'diffsinger': + with open(speech_stretchs, "r") as f: + spec_min = np.load(speech_stretchs)[0] + spec_max = np.load(speech_stretchs)[1] + spec_min = paddle.to_tensor(spec_min) + spec_max = paddle.to_tensor(spec_max) + am_config["model"]["fastspeech2_params"]["spk_num"] = spk_num + am = am_class( + spec_min=spec_min, + spec_max=spec_max, + idim=vocab_size, + odim=odim, + **am_config["model"], ) elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, @@ -366,8 +419,6 @@ def get_am_inference(am: str='fastspeech2_csmsc', am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) elif am_name == 'erniesat': am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - else: - print("wrong am, please input right am!!!") am.set_state_dict(paddle.load(am_ckpt)["main_params"]) am.eval() @@ -454,6 +505,7 @@ def am_to_static(am_inference, 
elif am_name == 'tacotron2': am_inference = jit.to_static( am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + elif am_name == 'vits': if am_dataset in {"aishell3", "vctk"} and speaker_dict is not None: am_inference = jit.to_static( @@ -465,8 +517,20 @@ def am_to_static(am_inference, else: am_inference = jit.to_static( am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + + elif am_name == 'diffsinger': + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), # phone + InputSpec([-1], dtype=paddle.int64), # note + InputSpec([-1], dtype=paddle.float32), # note_dur + InputSpec([-1], dtype=paddle.int64), # is_slur + ]) + jit.save(am_inference, os.path.join(inference_dir, am)) am_inference = jit.load(os.path.join(inference_dir, am)) + return am_inference @@ -490,6 +554,7 @@ def get_predictor( device: str='cpu', # for gpu use_trt: bool=False, + device_id: int=0, # for trt use_dynamic_shape: bool=True, min_subgraph_size: int=5, @@ -505,6 +570,7 @@ def get_predictor( params_file (os.PathLike): name of params_file. device (str): Choose the device you want to run, it can be: cpu/gpu, default is cpu. use_trt (bool): whether to use TensorRT or not in GPU. + device_id (int): Choose your device id, only valid when the device is gpu, default 0. use_dynamic_shape (bool): use dynamic shape or not in TensorRT. use_mkldnn (bool): whether to use MKLDNN or not in CPU. cpu_threads (int): num of thread when use CPU. @@ -521,7 +587,7 @@ def get_predictor( config.enable_memory_optim() config.switch_ir_optim(True) if device == "gpu": - config.enable_use_gpu(100, 0) + config.enable_use_gpu(100, device_id) else: config.disable_gpu() config.set_cpu_math_library_num_threads(cpu_threads) diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 70e52244f..6189522db 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -60,7 +60,8 @@ def evaluate(args): am_stat=args.am_stat, phones_dict=args.phones_dict, tones_dict=args.tones_dict, - speaker_dict=args.speaker_dict) + speaker_dict=args.speaker_dict, + speech_stretchs=args.speech_stretchs, ) test_dataset = get_test_dataset( test_metadata=test_metadata, am=args.am, @@ -107,6 +108,20 @@ def evaluate(args): if args.voice_cloning and "spk_emb" in datum: spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) mel = am_inference(phone_ids, spk_emb=spk_emb) + elif am_name == 'diffsinger': + phone_ids = paddle.to_tensor(datum["text"]) + note = paddle.to_tensor(datum["note"]) + note_dur = paddle.to_tensor(datum["note_dur"]) + is_slur = paddle.to_tensor(datum["is_slur"]) + # get_mel_fs2 = False, means mel from diffusion, get_mel_fs2 = True, means mel from fastspeech2. 
+ get_mel_fs2 = False + # mel: [T, mel_bin] + mel = am_inference( + phone_ids, + note=note, + note_dur=note_dur, + is_slur=is_slur, + get_mel_fs2=get_mel_fs2) # vocoder wav = voc_inference(mel) @@ -134,10 +149,17 @@ def parse_args(): type=str, default='fastspeech2_csmsc', choices=[ - 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc', - 'tacotron2_ljspeech', 'tacotron2_aishell3', 'fastspeech2_mix', - 'fastspeech2_canton' + 'speedyspeech_csmsc', + 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', + 'fastspeech2_vctk', + 'tacotron2_csmsc', + 'tacotron2_ljspeech', + 'tacotron2_aishell3', + 'fastspeech2_mix', + 'fastspeech2_canton', + 'diffsinger_opencpop', ], help='Choose acoustic model type of tts task.') parser.add_argument( @@ -170,10 +192,19 @@ def parse_args(): type=str, default='pwgan_csmsc', choices=[ - 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc', - 'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk', - 'style_melgan_csmsc' + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'wavernn_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'style_melgan_csmsc', + "pwgan_opencpop", + "hifigan_opencpop", ], help='Choose vocoder type of tts task.') parser.add_argument( @@ -191,6 +222,11 @@ def parse_args(): "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--test_metadata", type=str, help="test metadata.") parser.add_argument("--output_dir", type=str, help="output dir.") + parser.add_argument( + "--speech_stretchs", + type=str, + default=None, + help="The min and max values of the mel spectrum.") args = parser.parse_args() return args diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index db94a6e53..0c7b34b09 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -24,6 +24,7 @@ from paddlespeech.t2s.exps.syn_utils import am_to_static from paddlespeech.t2s.exps.syn_utils import get_am_inference from paddlespeech.t2s.exps.syn_utils import get_frontend from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_sentences_svs from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.exps.syn_utils import run_frontend from paddlespeech.t2s.exps.syn_utils import voc_to_static @@ -44,20 +45,18 @@ def evaluate(args): print(am_config) print(voc_config) - sentences = get_sentences(text_file=args.text, lang=args.lang) - # frontend frontend = get_frontend( lang=args.lang, phones_dict=args.phones_dict, tones_dict=args.tones_dict, + pinyin_phone=args.pinyin_phone, use_rhy=args.use_rhy) print("frontend done!") # acoustic model am_name = args.am[:args.am.rindex('_')] am_dataset = args.am[args.am.rindex('_') + 1:] - am_inference = get_am_inference( am=args.am, am_config=am_config, @@ -65,8 +64,10 @@ def evaluate(args): am_stat=args.am_stat, phones_dict=args.phones_dict, tones_dict=args.tones_dict, - speaker_dict=args.speaker_dict) + speaker_dict=args.speaker_dict, + speech_stretchs=args.speech_stretchs, ) print("acoustic model done!") + # vocoder voc_inference = get_voc_inference( voc=args.voc, @@ -103,14 +104,25 @@ def evaluate(args): N = 0 T = 0 + if am_name == 'diffsinger': + sentences = get_sentences_svs(text_file=args.text) + else: + sentences = 
get_sentences(text_file=args.text, lang=args.lang) for utt_id, sentence in sentences: with timer() as t: + if am_name == "diffsinger": + text = "" + svs_input = sentence + else: + text = sentence + svs_input = None frontend_dict = run_frontend( frontend=frontend, - text=sentence, + text=text, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids, - lang=args.lang) + lang=args.lang, + svs_input=svs_input) phone_ids = frontend_dict['phone_ids'] with paddle.no_grad(): flags = 0 @@ -134,6 +146,15 @@ def evaluate(args): mel = am_inference(part_phone_ids, part_tone_ids) elif am_name == 'tacotron2': mel = am_inference(part_phone_ids) + elif am_name == 'diffsinger': + part_note_ids = frontend_dict['note_ids'][i] + part_note_durs = frontend_dict['note_durs'][i] + part_is_slurs = frontend_dict['is_slurs'][i] + mel = am_inference( + text=part_phone_ids, + note=part_note_ids, + note_dur=part_note_durs, + is_slur=part_is_slurs, ) # vocoder wav = voc_inference(mel) if flags == 0: @@ -178,6 +199,7 @@ def parse_args(): 'fastspeech2_male-zh', 'fastspeech2_male-en', 'fastspeech2_male-mix', + 'diffsinger_opencpop', ], help='Choose acoustic model type of tts task.') parser.add_argument( @@ -223,6 +245,8 @@ def parse_args(): 'wavernn_csmsc', 'pwgan_male', 'hifigan_male', + 'pwgan_opencpop', + 'hifigan_opencpop', ], help='Choose vocoder type of tts task.') parser.add_argument( @@ -240,6 +264,7 @@ def parse_args(): '--lang', type=str, default='zh', + choices=['zh', 'en', 'mix', 'canton', 'sing'], help='Choose model language. zh or en or mix') parser.add_argument( @@ -259,6 +284,17 @@ def parse_args(): type=str2bool, default=False, help="run rhythm frontend or not") + parser.add_argument( + "--pinyin_phone", + type=str, + default=None, + help="pinyin to phone map file, using on sing_frontend.") + parser.add_argument( + "--speech_stretchs", + type=str, + default=None, + help="The min and max values of the mel spectrum, using on diffusion of diffsinger." + ) args = parser.parse_args() return args diff --git a/paddlespeech/t2s/frontend/sing_frontend.py b/paddlespeech/t2s/frontend/sing_frontend.py new file mode 100644 index 000000000..c2aecf273 --- /dev/null +++ b/paddlespeech/t2s/frontend/sing_frontend.py @@ -0,0 +1,175 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +from typing import Dict +from typing import List + +import librosa +import numpy as np +import paddle +from pypinyin import lazy_pinyin + + +class SingFrontend(): + def __init__(self, pinyin_phone_path: str, phone_vocab_path: str): + """SVS Frontend + + Args: + pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line. + phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line. 
+ """ + self.punc = '[:,;。?!“”‘’\':,;.?!]' + + self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'} + if pinyin_phone_path: + with open(pinyin_phone_path, 'rt', encoding='utf-8') as f: + for line in f.readlines(): + pinyin_phn = [ + x.strip() for x in line.split('|') if x.strip() != '' + ] + self.pinyin_phones[pinyin_phn[0]] = pinyin_phn[1] + + self.vocab_phones = {} + if phone_vocab_path: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + self.vocab_phones[phn] = int(id) + + def get_phones(self, sentence: str) -> List[int]: + """get phone list + + Args: + sentence (str): sentence + + Returns: + List[int]: phones list + + Example: + sentence = "你好" + phones = ['n i', 'h ao'] + """ + # remove all punc + sentence = re.sub(self.punc, "", sentence) + + # Pypinyin can't solve polyphonic words + sentence = sentence.replace('最长', '最常').replace('长睫毛', '常睫毛') \ + .replace('那么长', '那么常').replace('多长', '多常') \ + .replace('很长', '很常') + + # lyric + pinyins = lazy_pinyin(sentence, strict=False) + # replace unk word with SP + pinyins = [ + pinyin if pinyin in self.pinyin_phones.keys() else "SP" + for pinyin in pinyins + ] + phones = [ + self.pinyin_phones[pinyin.strip()] for pinyin in pinyins + if pinyin.strip() in self.pinyin_phones + ] + + return phones + + def get_note_info(self, note_info: str) -> List[str]: + note_info = [x.strip() for x in note_info.split('|') if x.strip() != ''] + return note_info + + def process( + self, + phones: List[int], + notes: List[str], + note_durs: List[float], ) -> Dict[str, List[paddle.Tensor]]: + new_phones = [] + new_notes = [] + new_note_durs = [] + is_slurs = [] + assert len(phones) == len(notes) == len( + note_durs + ), "Please check the input, text, notes, note_durs should be the same length." + for i in range(len(phones)): + phone = phones[i].split() + note = notes[i].split() + note_dur = note_durs[i].split() + + for phn in phone: + new_phones.append(phn) + new_notes.append(note[0]) + new_note_durs.append(note_dur[0]) + is_slurs.append(0) + + if len(note) > 1: + for i in range(1, len(note)): + new_phones.append(phone[-1]) + new_notes.append(note[i]) + new_note_durs.append(note_dur[i]) + is_slurs.append(1) + + return new_phones, new_notes, new_note_durs, is_slurs + + def get_input_ids(self, svs_input: Dict[str, str], + to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + """convert input to int/float. + + Args: + svs_input (Dict[str, str]): include keys: if input_type is phones, phones, notes, note_durs and is_slurs are needed. + if input_type is word, text, notes, and note_durs sre needed. + to_tensor (bool, optional): whether to convert to Tensor. Defaults to True. + + Returns: + Dict[str, List[paddle.Tensor]]: result include phone_ids, note_ids, note_durs, is_slurs. + """ + result = {} + input_type = svs_input['input_type'] + if input_type == 'phoneme': + assert "phones" in svs_input.keys() and "notes" in svs_input.keys() and "note_durs" in svs_input.keys() and "is_slurs" in svs_input.keys(), \ + "When input_type is phoneme, phones, notes, note_durs, is_slurs should be in the svs_input." + phones = svs_input["phones"].split() + notes = svs_input["notes"].split() + note_durs = svs_input["note_durs"].split() + is_slurs = svs_input["is_slurs"].split() + assert len(phones) == len(notes) == len(note_durs) == len( + is_slurs + ), "Please check the input, phones, notes, note_durs is_slurs should be the same length." 
+ elif input_type == "word": + assert "text" in svs_input.keys() and "notes" in svs_input.keys() and "note_durs" in svs_input.keys(), \ + "When input_type is word, text, notes, note_durs, should be in the svs_input." + phones = self.get_phones(svs_input['text']) + notes = self.get_note_info(svs_input['notes']) + note_durs = self.get_note_info(svs_input['note_durs']) + phones, notes, note_durs, is_slurs = self.process( + phones=phones, notes=notes, note_durs=note_durs) + + phone_ids = [self.vocab_phones[phn] for phn in phones] + phone_ids = np.array(phone_ids, np.int64) + note_ids = [ + librosa.note_to_midi(note.split("/")[0]) if note != 'rest' else 0 + for note in notes + ] + note_ids = np.array(note_ids, np.int64) + note_durs = np.array(note_durs, np.float32) + is_slurs = np.array(is_slurs, np.int64) + + if to_tensor: + phone_ids = paddle.to_tensor(phone_ids) + note_ids = paddle.to_tensor(note_ids) + note_durs = paddle.to_tensor(note_durs) + is_slurs = paddle.to_tensor(is_slurs) + + result['phone_ids'] = [phone_ids] + result['note_ids'] = [note_ids] + result['note_durs'] = [note_durs] + result['is_slurs'] = [is_slurs] + + return result diff --git a/paddlespeech/t2s/models/diffsinger/__init__.py b/paddlespeech/t2s/models/diffsinger/__init__.py new file mode 100644 index 000000000..785293ee2 --- /dev/null +++ b/paddlespeech/t2s/models/diffsinger/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .diffsinger import * +from .diffsinger_updater import * diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger.py b/paddlespeech/t2s/models/diffsinger/diffsinger.py new file mode 100644 index 000000000..990cfc56a --- /dev/null +++ b/paddlespeech/t2s/models/diffsinger/diffsinger.py @@ -0,0 +1,399 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""DiffSinger related modules for paddle""" +from typing import Any +from typing import Dict +from typing import Tuple + +import numpy as np +import paddle +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDI +from paddlespeech.t2s.modules.diffnet import DiffNet +from paddlespeech.t2s.modules.diffusion import GaussianDiffusion + + +class DiffSinger(nn.Layer): + """DiffSinger module. 
+ + This is a module of DiffSinger described in `DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`._ + .. _`DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`: + https://arxiv.org/pdf/2105.02446.pdf + + Args: + + Returns: + + """ + + def __init__( + self, + # min and max spec for stretching before diffusion + spec_min: paddle.Tensor, + spec_max: paddle.Tensor, + # fastspeech2midi config + idim: int, + odim: int, + use_energy_pred: bool=False, + use_postnet: bool=False, + # music score related + note_num: int=300, + is_slur_num: int=2, + fastspeech2_params: Dict[str, Any]={ + "adim": 256, + "aheads": 2, + "elayers": 4, + "eunits": 1024, + "dlayers": 4, + "dunits": 1024, + "positionwise_layer_type": "conv1d", + "positionwise_conv_kernel_size": 1, + "use_scaled_pos_enc": True, + "use_batch_norm": True, + "encoder_normalize_before": True, + "decoder_normalize_before": True, + "encoder_concat_after": False, + "decoder_concat_after": False, + "reduction_factor": 1, + # for transformer + "transformer_enc_dropout_rate": 0.1, + "transformer_enc_positional_dropout_rate": 0.1, + "transformer_enc_attn_dropout_rate": 0.1, + "transformer_dec_dropout_rate": 0.1, + "transformer_dec_positional_dropout_rate": 0.1, + "transformer_dec_attn_dropout_rate": 0.1, + "transformer_activation_type": "gelu", + # duration predictor + "duration_predictor_layers": 2, + "duration_predictor_chans": 384, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_rate": 0.1, + # pitch predictor + "use_pitch_embed": True, + "pitch_predictor_layers": 2, + "pitch_predictor_chans": 384, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout": 0.5, + "pitch_embed_kernel_size": 9, + "pitch_embed_dropout": 0.5, + "stop_gradient_from_pitch_predictor": False, + # energy predictor + "use_energy_embed": False, + "energy_predictor_layers": 2, + "energy_predictor_chans": 384, + "energy_predictor_kernel_size": 3, + "energy_predictor_dropout": 0.5, + "energy_embed_kernel_size": 9, + "energy_embed_dropout": 0.5, + "stop_gradient_from_energy_predictor": False, + # postnet + "postnet_layers": 5, + "postnet_chans": 512, + "postnet_filts": 5, + "postnet_dropout_rate": 0.5, + # spk emb + "spk_num": None, + "spk_embed_dim": None, + "spk_embed_integration_type": "add", + # training related + "init_type": "xavier_uniform", + "init_enc_alpha": 1.0, + "init_dec_alpha": 1.0, + # speaker classifier + "enable_speaker_classifier": False, + "hidden_sc_dim": 256, + }, + # denoiser config + denoiser_params: Dict[str, Any]={ + "in_channels": 80, + "out_channels": 80, + "kernel_size": 3, + "layers": 20, + "stacks": 5, + "residual_channels": 256, + "gate_channels": 512, + "skip_channels": 256, + "aux_channels": 256, + "dropout": 0., + "bias": True, + "use_weight_norm": False, + "init_type": "kaiming_normal", + }, + # diffusion config + diffusion_params: Dict[str, Any]={ + "num_train_timesteps": 100, + "beta_start": 0.0001, + "beta_end": 0.06, + "beta_schedule": "squaredcos_cap_v2", + "num_max_timesteps": 60, + "stretch": True, + }, ): + """Initialize DiffSinger module. + + Args: + spec_min (paddle.Tensor): The minimum value of the feature(mel) to stretch before diffusion. + spec_max (paddle.Tensor): The maximum value of the feature(mel) to stretch before diffusion. + idim (int): Dimension of the inputs (Input vocabrary size.). + odim (int): Dimension of the outputs (Acoustic feature dimension.). + use_energy_pred (bool, optional): whether use energy predictor. Defaults False. 
+ use_postnet (bool, optional): whether use postnet. Defaults False. + note_num (int, optional): The number of note. Defaults to 300. + is_slur_num (int, optional): The number of slur. Defaults to 2. + fastspeech2_params (Dict[str, Any]): Parameter dict for fastspeech2 module. + denoiser_params (Dict[str, Any]): Parameter dict for dinoiser module. + diffusion_params (Dict[str, Any]): Parameter dict for diffusion module. + """ + assert check_argument_types() + super().__init__() + self.fs2 = FastSpeech2MIDI( + idim=idim, + odim=odim, + fastspeech2_params=fastspeech2_params, + note_num=note_num, + is_slur_num=is_slur_num, + use_energy_pred=use_energy_pred, + use_postnet=use_postnet, ) + denoiser = DiffNet(**denoiser_params) + self.diffusion = GaussianDiffusion( + denoiser, + **diffusion_params, + min_values=spec_min, + max_values=spec_max, ) + + def forward( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + text_lengths: paddle.Tensor, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + durations: paddle.Tensor, + pitch: paddle.Tensor, + energy: paddle.Tensor, + spk_emb: paddle.Tensor=None, + spk_id: paddle.Tensor=None, + only_train_fs2: bool=True, + ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: + """Calculate forward propagation. + + Args: + text(Tensor(int64)): + Batch of padded token (phone) ids (B, Tmax). + note(Tensor(int64)): + Batch of padded note (element in music score) ids (B, Tmax). + note_dur(Tensor(float32)): + Batch of padded note durations in seconds (element in music score) (B, Tmax). + is_slur(Tensor(int64)): + Batch of padded slur (element in music score) ids (B, Tmax). + text_lengths(Tensor(int64)): + Batch of phone lengths of each input (B,). + speech(Tensor[float32]): + Batch of padded target features (e.g. mel) (B, Lmax, odim). + speech_lengths(Tensor(int64)): + Batch of the lengths of each target features (B,). + durations(Tensor(int64)): + Batch of padded token durations in frame (B, Tmax). + pitch(Tensor[float32]): + Batch of padded frame-averaged pitch (B, Lmax, 1). + energy(Tensor[float32]): + Batch of padded frame-averaged energy (B, Lmax, 1). + spk_emb(Tensor[float32], optional): + Batch of speaker embeddings (B, spk_embed_dim). 
+ spk_id(Tnesor[int64], optional(int64)): + Batch of speaker ids (B,) + only_train_fs2(bool): + Whether to train only the fastspeech2 module + + Returns: + + """ + # only train fastspeech2 module firstly + before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.fs2( + text=text, + note=note, + note_dur=note_dur, + is_slur=is_slur, + text_lengths=text_lengths, + speech=speech, + speech_lengths=speech_lengths, + durations=durations, + pitch=pitch, + energy=energy, + spk_id=spk_id, + spk_emb=spk_emb) + if only_train_fs2: + return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits + + # get the encoder output from fastspeech2 as the condition of denoiser module + cond_fs2, mel_masks = self.fs2.encoder_infer_batch( + text=text, + note=note, + note_dur=note_dur, + is_slur=is_slur, + text_lengths=text_lengths, + speech_lengths=speech_lengths, + ds=durations, + ps=pitch, + es=energy) + cond_fs2 = cond_fs2.transpose((0, 2, 1)) + + # get the output(final mel) from diffusion module + noise_pred, noise_target = self.diffusion( + speech.transpose((0, 2, 1)), cond_fs2) + return noise_pred, noise_target, mel_masks + + def inference( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + get_mel_fs2: bool=False, ): + """Run inference + + Args: + text(Tensor(int64)): + Batch of padded token (phone) ids (B, Tmax). + note(Tensor(int64)): + Batch of padded note (element in music score) ids (B, Tmax). + note_dur(Tensor(float32)): + Batch of padded note durations in seconds (element in music score) (B, Tmax). + is_slur(Tensor(int64)): + Batch of padded slur (element in music score) ids (B, Tmax). + get_mel_fs2 (bool, optional): . Defaults to False. + Whether to get mel from fastspeech2 module. + + Returns: + + """ + mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur) + if get_mel_fs2: + return mel_fs2 + mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1)) + cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur) + cond_fs2 = cond_fs2.transpose((0, 2, 1)) + noise = paddle.randn(mel_fs2.shape) + mel = self.diffusion.inference( + noise=noise, + cond=cond_fs2, + ref_x=mel_fs2, + scheduler_type="ddpm", + num_inference_steps=60) + mel = mel.transpose((0, 2, 1)) + return mel[0] + + +class DiffSingerInference(nn.Layer): + def __init__(self, normalizer, model): + super().__init__() + self.normalizer = normalizer + self.acoustic_model = model + + def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False): + """Calculate forward propagation. + + Args: + text(Tensor(int64)): + Batch of padded token (phone) ids (B, Tmax). + note(Tensor(int64)): + Batch of padded note (element in music score) ids (B, Tmax). + note_dur(Tensor(float32)): + Batch of padded note durations in seconds (element in music score) (B, Tmax). + is_slur(Tensor(int64)): + Batch of padded slur (element in music score) ids (B, Tmax). + get_mel_fs2 (bool, optional): . Defaults to False. + Whether to get mel from fastspeech2 module. + + Returns: + logmel(Tensor(float32)): denorm logmel, [T, mel_bin] + """ + normalized_mel = self.acoustic_model.inference( + text=text, + note=note, + note_dur=note_dur, + is_slur=is_slur, + get_mel_fs2=get_mel_fs2) + logmel = normalized_mel + return logmel + + +class DiffusionLoss(nn.Layer): + """Loss function module for Diffusion module on DiffSinger.""" + + def __init__(self, use_masking: bool=True, + use_weighted_masking: bool=False): + """Initialize feed-forward Transformer loss module. 
+ Args: + use_masking (bool): + Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): + Whether to weighted masking in loss calculation. + """ + assert check_argument_types() + super().__init__() + + assert (use_masking != use_weighted_masking) or not use_masking + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + + # define criterions + reduction = "none" if self.use_weighted_masking else "mean" + self.l1_criterion = nn.L1Loss(reduction=reduction) + + def forward( + self, + noise_pred: paddle.Tensor, + noise_target: paddle.Tensor, + mel_masks: paddle.Tensor, ) -> paddle.Tensor: + """Calculate forward propagation. + + Args: + noise_pred(Tensor): + Batch of outputs predict noise (B, Lmax, odim). + noise_target(Tensor): + Batch of target noise (B, Lmax, odim). + mel_masks(Tensor): + Batch of mask of real mel (B, Lmax, 1). + Returns: + + """ + # apply mask to remove padded part + if self.use_masking: + noise_pred = noise_pred.masked_select( + mel_masks.broadcast_to(noise_pred.shape)) + noise_target = noise_target.masked_select( + mel_masks.broadcast_to(noise_target.shape)) + + # calculate loss + l1_loss = self.l1_criterion(noise_pred, noise_target) + + # make weighted mask and apply it + if self.use_weighted_masking: + mel_masks = mel_masks.unsqueeze(-1) + out_weights = mel_masks.cast(dtype=paddle.float32) / mel_masks.cast( + dtype=paddle.float32).sum( + axis=1, keepdim=True) + out_weights /= noise_target.shape[0] * noise_target.shape[2] + + # apply weight + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select( + mel_masks.broadcast_to(l1_loss.shape)).sum() + + return l1_loss diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py new file mode 100644 index 000000000..d89b09b2a --- /dev/null +++ b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py @@ -0,0 +1,302 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
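The pieces introduced above are wired together by `synthesize.py`: `SingFrontend.get_input_ids` turns a lyric plus a music score into `phone_ids` / `note_ids` / `note_durs` / `is_slurs`, and `DiffSingerInference` maps those onto a mel spectrogram for one of the new Opencpop vocoders. The sketch below is not part of the patch: the two dictionary files are placeholders (any `pinyin|phones` and `phone id` maps of the kind the docstring describes), the lyric and score values are made up, and `am_inference` / `voc_inference` are assumed to be an already-loaded `DiffSingerInference` model and vocoder, as constructed elsewhere in `synthesize.py`.

```python
import paddle

from paddlespeech.t2s.frontend.sing_frontend import SingFrontend

# Placeholder dictionary files (formats per the SingFrontend docstring).
frontend = SingFrontend(
    pinyin_phone_path="pinyin_to_phone.txt",  # lines like "ba|b a"
    phone_vocab_path="phone_id_map.txt")      # lines like "a 4"

# 'word' input: one '|'-separated note / note-duration entry per character;
# extra space-separated notes inside one entry become slurred continuations.
svs_input = {
    "input_type": "word",
    "text": "你好",                 # -> pinyin ['ni', 'hao'] -> phones ['n i', 'h ao']
    "notes": "C4 | D4 E4",          # the second character carries a slur up to E4
    "note_durs": "0.5 | 0.3 0.2",   # seconds, aligned with the notes
}

inputs = frontend.get_input_ids(svs_input, to_tensor=True)

# am_inference: an already-built DiffSingerInference; voc_inference: e.g. a PWGAN or
# HiFiGAN trained on Opencpop. Both are assumptions mirroring the synthesize.py code path.
with paddle.no_grad():
    mel = am_inference(
        text=inputs["phone_ids"][0],
        note=inputs["note_ids"][0],
        note_dur=inputs["note_durs"][0],
        is_slur=inputs["is_slurs"][0])
    wav = voc_inference(mel)
```

For the `phoneme` input type, the assertions in `get_input_ids` show that `phones`, `notes`, `note_durs`, and `is_slurs` are instead passed in directly as space-separated strings of equal length.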
+import logging +from pathlib import Path +from typing import Dict + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState + +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class DiffSingerUpdater(StandardUpdater): + def __init__(self, + model: Layer, + optimizers: Dict[str, Optimizer], + criterions: Dict[str, Layer], + dataloader: DataLoader, + ds_train_start_steps: int=160000, + output_dir: Path=None, + only_train_diffusion: bool=True): + super().__init__(model, optimizers, dataloader, init_state=None) + self.model = model._layers if isinstance(model, + paddle.DataParallel) else model + self.only_train_diffusion = only_train_diffusion + + self.optimizers = optimizers + self.optimizer_fs2: Optimizer = optimizers['fs2'] + self.optimizer_ds: Optimizer = optimizers['ds'] + + self.criterions = criterions + self.criterion_fs2 = criterions['fs2'] + self.criterion_ds = criterions['ds'] + + self.dataloader = dataloader + + self.ds_train_start_steps = ds_train_start_steps + + self.state = UpdaterState(iteration=0, epoch=0) + self.train_iterator = iter(self.dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # spk_id!=None in multiple spk diffsinger + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + # No explicit speaker identifier labels are used during voice cloning training. 
+ if spk_emb is not None: + spk_id = None + + # only train fastspeech2 module firstly + if self.state.iteration < self.ds_train_start_steps: + before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( + text=batch["text"], + note=batch["note"], + note_dur=batch["note_dur"], + is_slur=batch["is_slur"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb, + only_train_fs2=True, ) + + l1_loss_fs2, ssim_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2( + after_outs=after_outs, + before_outs=before_outs, + d_outs=d_outs, + p_outs=p_outs, + e_outs=e_outs, + ys=ys, + ds=batch["durations"], + ps=batch["pitch"], + es=batch["energy"], + ilens=batch["text_lengths"], + olens=olens, + spk_logits=spk_logits, + spk_ids=spk_id, ) + + loss_fs2 = l1_loss_fs2 + ssim_loss_fs2 + duration_loss + pitch_loss + energy_loss + speaker_loss + + self.optimizer_fs2.clear_grad() + loss_fs2.backward() + self.optimizer_fs2.step() + + report("train/loss_fs2", float(loss_fs2)) + report("train/l1_loss_fs2", float(l1_loss_fs2)) + report("train/ssim_loss_fs2", float(ssim_loss_fs2)) + report("train/duration_loss", float(duration_loss)) + report("train/pitch_loss", float(pitch_loss)) + + losses_dict["l1_loss_fs2"] = float(l1_loss_fs2) + losses_dict["ssim_loss_fs2"] = float(ssim_loss_fs2) + losses_dict["duration_loss"] = float(duration_loss) + losses_dict["pitch_loss"] = float(pitch_loss) + + if speaker_loss != 0.: + report("train/speaker_loss", float(speaker_loss)) + losses_dict["speaker_loss"] = float(speaker_loss) + if energy_loss != 0.: + report("train/energy_loss", float(energy_loss)) + losses_dict["energy_loss"] = float(energy_loss) + + losses_dict["loss_fs2"] = float(loss_fs2) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + # Then only train diffusion module, freeze fastspeech2 parameters. 
+ if self.state.iteration > self.ds_train_start_steps: + for param in self.model.fs2.parameters(): + param.trainable = False if self.only_train_diffusion else True + + noise_pred, noise_target, mel_masks = self.model( + text=batch["text"], + note=batch["note"], + note_dur=batch["note_dur"], + is_slur=batch["is_slur"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb, + only_train_fs2=False, ) + + noise_pred = noise_pred.transpose((0, 2, 1)) + noise_target = noise_target.transpose((0, 2, 1)) + mel_masks = mel_masks.transpose((0, 2, 1)) + l1_loss_ds = self.criterion_ds( + noise_pred=noise_pred, + noise_target=noise_target, + mel_masks=mel_masks, ) + + loss_ds = l1_loss_ds + + self.optimizer_ds.clear_grad() + loss_ds.backward() + self.optimizer_ds.step() + + report("train/loss_ds", float(loss_ds)) + report("train/l1_loss_ds", float(l1_loss_ds)) + losses_dict["l1_loss_ds"] = float(l1_loss_ds) + losses_dict["loss_ds"] = float(loss_ds) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + self.logger.info(self.msg) + + +class DiffSingerEvaluator(StandardEvaluator): + def __init__( + self, + model: Layer, + criterions: Dict[str, Layer], + dataloader: DataLoader, + output_dir: Path=None, ): + super().__init__(model, dataloader) + self.model = model._layers if isinstance(model, + paddle.DataParallel) else model + + self.criterions = criterions + self.criterion_fs2 = criterions['fs2'] + self.criterion_ds = criterions['ds'] + self.dataloader = dataloader + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # spk_id!=None in multiple spk diffsinger + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None + + # Here show fastspeech2 eval + before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( + text=batch["text"], + note=batch["note"], + note_dur=batch["note_dur"], + is_slur=batch["is_slur"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb, + only_train_fs2=True, ) + + l1_loss_fs2, ssim_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2( + after_outs=after_outs, + before_outs=before_outs, + d_outs=d_outs, + p_outs=p_outs, + e_outs=e_outs, + ys=ys, + ds=batch["durations"], + ps=batch["pitch"], + es=batch["energy"], + ilens=batch["text_lengths"], + olens=olens, + spk_logits=spk_logits, + spk_ids=spk_id, ) + + loss_fs2 = l1_loss_fs2 + ssim_loss_fs2 + duration_loss + pitch_loss + energy_loss + speaker_loss + + report("eval/loss_fs2", float(loss_fs2)) + report("eval/l1_loss_fs2", float(l1_loss_fs2)) + report("eval/ssim_loss_fs2", float(ssim_loss_fs2)) + report("eval/duration_loss", float(duration_loss)) + report("eval/pitch_loss", float(pitch_loss)) + + losses_dict["l1_loss_fs2"] = float(l1_loss_fs2) + losses_dict["ssim_loss_fs2"] = float(ssim_loss_fs2) + losses_dict["duration_loss"] = float(duration_loss) + losses_dict["pitch_loss"] = 
float(pitch_loss) + + if speaker_loss != 0.: + report("eval/speaker_loss", float(speaker_loss)) + losses_dict["speaker_loss"] = float(speaker_loss) + if energy_loss != 0.: + report("eval/energy_loss", float(energy_loss)) + losses_dict["energy_loss"] = float(energy_loss) + + losses_dict["loss_fs2"] = float(loss_fs2) + + # Here show diffusion eval + noise_pred, noise_target, mel_masks = self.model( + text=batch["text"], + note=batch["note"], + note_dur=batch["note_dur"], + is_slur=batch["is_slur"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb, + only_train_fs2=False, ) + + noise_pred = noise_pred.transpose((0, 2, 1)) + noise_target = noise_target.transpose((0, 2, 1)) + mel_masks = mel_masks.transpose((0, 2, 1)) + l1_loss_ds = self.criterion_ds( + noise_pred=noise_pred, + noise_target=noise_target, + mel_masks=mel_masks, ) + + loss_ds = l1_loss_ds + + report("eval/loss_ds", float(loss_ds)) + report("eval/l1_loss_ds", float(l1_loss_ds)) + losses_dict["l1_loss_ds"] = float(l1_loss_ds) + losses_dict["loss_ds"] = float(loss_ds) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + self.logger.info(self.msg) diff --git a/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py new file mode 100644 index 000000000..cce88d8a0 --- /dev/null +++ b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py @@ -0,0 +1,654 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +from typing import Any +from typing import Dict +from typing import Sequence +from typing import Tuple + +import paddle +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss +from paddlespeech.t2s.modules.losses import ssim +from paddlespeech.t2s.modules.masked_fill import masked_fill +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask +from paddlespeech.t2s.modules.nets_utils import make_pad_mask + + +class FastSpeech2MIDI(FastSpeech2): + """The Fastspeech2 module of DiffSinger. + """ + + def __init__( + self, + # fastspeech2 network structure related + idim: int, + odim: int, + fastspeech2_params: Dict[str, Any], + # note emb + note_num: int=300, + # is_slur emb + is_slur_num: int=2, + use_energy_pred: bool=False, + use_postnet: bool=False, ): + """Initialize FastSpeech2 module for svs. + Args: + fastspeech2_params (Dict): + The config of FastSpeech2 module on DiffSinger model + note_num (Optional[int]): + Number of note. If not None, assume that the + note_ids will be provided as the input and use note_embedding_table. + is_slur_num (Optional[int]): + Number of note. 
If not None, assume that the + is_slur_ids will be provided as the input + + """ + assert check_argument_types() + super().__init__(idim=idim, odim=odim, **fastspeech2_params) + self.use_energy_pred = use_energy_pred + self.use_postnet = use_postnet + if not self.use_postnet: + self.postnet = None + + self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_params[ + "adim"] + + # note_ embed + self.note_embedding_table = nn.Embedding( + num_embeddings=note_num, + embedding_dim=self.note_embed_dim, + padding_idx=self.padding_idx) + self.note_dur_layer = nn.Linear(1, self.note_embed_dim) + + # slur embed + self.is_slur_embedding_table = nn.Embedding( + num_embeddings=is_slur_num, + embedding_dim=self.is_slur_embed_dim, + padding_idx=self.padding_idx) + + def forward( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + text_lengths: paddle.Tensor, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + durations: paddle.Tensor, + pitch: paddle.Tensor, + energy: paddle.Tensor, + spk_emb: paddle.Tensor=None, + spk_id: paddle.Tensor=None, + ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: + """Calculate forward propagation. + + Args: + text(Tensor(int64)): + Batch of padded token (phone) ids (B, Tmax). + note(Tensor(int64)): + Batch of padded note (element in music score) ids (B, Tmax). + note_dur(Tensor(float32)): + Batch of padded note durations in seconds (element in music score) (B, Tmax). + is_slur(Tensor(int64)): + Batch of padded slur (element in music score) ids (B, Tmax). + text_lengths(Tensor(int64)): + Batch of phone lengths of each input (B,). + speech(Tensor[float32]): + Batch of padded target features (e.g. mel) (B, Lmax, odim). + speech_lengths(Tensor(int64)): + Batch of the lengths of each target features (B,). + durations(Tensor(int64)): + Batch of padded token durations in frame (B, Tmax). + pitch(Tensor[float32]): + Batch of padded frame-averaged pitch (B, Lmax, 1). + energy(Tensor[float32]): + Batch of padded frame-averaged energy (B, Lmax, 1). + spk_emb(Tensor[float32], optional): + Batch of speaker embeddings (B, spk_embed_dim). 
+ spk_id(Tnesor[int64], optional(int64)): + Batch of speaker ids (B,) + + Returns: + + """ + xs = paddle.cast(text, 'int64') + note = paddle.cast(note, 'int64') + note_dur = paddle.cast(note_dur, 'float32') + is_slur = paddle.cast(is_slur, 'int64') + ilens = paddle.cast(text_lengths, 'int64') + olens = paddle.cast(speech_lengths, 'int64') + ds = paddle.cast(durations, 'int64') + ps = pitch + es = energy + ys = speech + olens = speech_lengths + if spk_id is not None: + spk_id = paddle.cast(spk_id, 'int64') + # forward propagation + before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits = self._forward( + xs=xs, + note=note, + note_dur=note_dur, + is_slur=is_slur, + ilens=ilens, + olens=olens, + ds=ds, + ps=ps, + es=es, + is_inference=False, + spk_emb=spk_emb, + spk_id=spk_id, ) + # modify mod part of groundtruth + if self.reduction_factor > 1: + olens = olens - olens % self.reduction_factor + max_olen = max(olens) + ys = ys[:, :max_olen] + + return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits + + def _forward( + self, + xs: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + ilens: paddle.Tensor, + olens: paddle.Tensor=None, + ds: paddle.Tensor=None, + ps: paddle.Tensor=None, + es: paddle.Tensor=None, + is_inference: bool=False, + is_train_diffusion: bool=False, + return_after_enc=False, + alpha: float=1.0, + spk_emb=None, + spk_id=None, ) -> Sequence[paddle.Tensor]: + + before_outs = after_outs = d_outs = p_outs = e_outs = spk_logits = None + # forward encoder + masks = self._source_mask(ilens) + note_emb = self.note_embedding_table(note) + note_dur_emb = self.note_dur_layer(paddle.unsqueeze(note_dur, axis=-1)) + is_slur_emb = self.is_slur_embedding_table(is_slur) + + # (B, Tmax, adim) + hs, _ = self.encoder( + xs=xs, + masks=masks, + note_emb=note_emb, + note_dur_emb=note_dur_emb, + is_slur_emb=is_slur_emb, ) + + if self.spk_num and self.enable_speaker_classifier and not is_inference: + hs_for_spk_cls = self.grad_reverse(hs) + spk_logits = self.speaker_classifier(hs_for_spk_cls, ilens) + else: + spk_logits = None + + # integrate speaker embedding + if self.spk_embed_dim is not None: + # spk_emb has a higher priority than spk_id + if spk_emb is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) + elif spk_id is not None: + spk_emb = self.spk_embedding_table(spk_id) + hs = self._integrate_with_spk_embed(hs, spk_emb) + + # forward duration predictor (phone-level) and variance predictors (frame-level) + d_masks = make_pad_mask(ilens) + if olens is not None: + pitch_masks = make_pad_mask(olens).unsqueeze(-1) + else: + pitch_masks = None + + # inference for decoder input for diffusion + if is_train_diffusion: + hs = self.length_regulator(hs, ds, is_inference=False) + p_outs = self.pitch_predictor(hs.detach(), pitch_masks) + p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + hs += p_embs + if self.use_energy_pred: + e_outs = self.energy_predictor(hs.detach(), pitch_masks) + e_embs = self.energy_embed( + e_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + hs += e_embs + + elif is_inference: + # (B, Tmax) + if ds is not None: + d_outs = ds + else: + d_outs = self.duration_predictor.inference(hs, d_masks) + + # (B, Lmax, adim) + hs = self.length_regulator(hs, d_outs, alpha, is_inference=True) + + if ps is not None: + p_outs = ps + else: + if self.stop_gradient_from_pitch_predictor: + p_outs = self.pitch_predictor(hs.detach(), pitch_masks) + else: + p_outs = self.pitch_predictor(hs, 
pitch_masks) + p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + hs += p_embs + + if self.use_energy_pred: + if es is not None: + e_outs = es + else: + if self.stop_gradient_from_energy_predictor: + e_outs = self.energy_predictor(hs.detach(), pitch_masks) + else: + e_outs = self.energy_predictor(hs, pitch_masks) + e_embs = self.energy_embed( + e_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + hs += e_embs + + # training + else: + d_outs = self.duration_predictor(hs, d_masks) + # (B, Lmax, adim) + hs = self.length_regulator(hs, ds, is_inference=False) + if self.stop_gradient_from_pitch_predictor: + p_outs = self.pitch_predictor(hs.detach(), pitch_masks) + else: + p_outs = self.pitch_predictor(hs, pitch_masks) + p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + hs += p_embs + + if self.use_energy_pred: + if self.stop_gradient_from_energy_predictor: + e_outs = self.energy_predictor(hs.detach(), pitch_masks) + else: + e_outs = self.energy_predictor(hs, pitch_masks) + e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + hs += e_embs + + # forward decoder + if olens is not None and not is_inference: + if self.reduction_factor > 1: + olens_in = paddle.to_tensor( + [olen // self.reduction_factor for olen in olens.numpy()]) + else: + olens_in = olens + # (B, 1, T) + h_masks = self._source_mask(olens_in) + else: + h_masks = None + + if return_after_enc: + return hs, h_masks + + if self.decoder_type == 'cnndecoder': + # remove output masks for dygraph to static graph + zs = self.decoder(hs, h_masks) + before_outs = zs + else: + # (B, Lmax, adim) + zs, _ = self.decoder(hs, h_masks) + # (B, Lmax, odim) + before_outs = self.feat_out(zs).reshape( + (paddle.shape(zs)[0], -1, self.odim)) + + # postnet -> (B, Lmax//r * r, odim) + if self.postnet is None: + after_outs = before_outs + else: + after_outs = before_outs + self.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + + return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits + + def encoder_infer( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + alpha: float=1.0, + spk_emb=None, + spk_id=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + xs = paddle.cast(text, 'int64').unsqueeze(0) + note = paddle.cast(note, 'int64').unsqueeze(0) + note_dur = paddle.cast(note_dur, 'float32').unsqueeze(0) + is_slur = paddle.cast(is_slur, 'int64').unsqueeze(0) + # setup batch axis + ilens = paddle.shape(xs)[1] + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + # (1, L, odim) + # use *_ to avoid bug in dygraph to static graph + hs, _ = self._forward( + xs=xs, + note=note, + note_dur=note_dur, + is_slur=is_slur, + ilens=ilens, + is_inference=True, + return_after_enc=True, + alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, ) + return hs + + # get encoder output for diffusion training + def encoder_infer_batch( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + text_lengths: paddle.Tensor, + speech_lengths: paddle.Tensor, + ds: paddle.Tensor=None, + ps: paddle.Tensor=None, + es: paddle.Tensor=None, + alpha: float=1.0, + spk_emb=None, + spk_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]: + + xs = paddle.cast(text, 'int64') + note = paddle.cast(note, 'int64') + note_dur = paddle.cast(note_dur, 'float32') + is_slur = paddle.cast(is_slur, 'int64') + ilens = paddle.cast(text_lengths, 'int64') + olens = 
paddle.cast(speech_lengths, 'int64') + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + # (1, L, odim) + # use *_ to avoid bug in dygraph to static graph + hs, h_masks = self._forward( + xs=xs, + note=note, + note_dur=note_dur, + is_slur=is_slur, + ilens=ilens, + olens=olens, + ds=ds, + ps=ps, + es=es, + return_after_enc=True, + is_train_diffusion=True, + alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, ) + return hs, h_masks + + def inference( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + durations: paddle.Tensor=None, + pitch: paddle.Tensor=None, + energy: paddle.Tensor=None, + alpha: float=1.0, + use_teacher_forcing: bool=False, + spk_emb=None, + spk_id=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Generate the sequence of features given the sequences of characters. + + Args: + text(Tensor(int64)): + Input sequence of characters (T,). + note(Tensor(int64)): + Input note (element in music score) ids (T,). + note_dur(Tensor(float32)): + Input note durations in seconds (element in music score) (T,). + is_slur(Tensor(int64)): + Input slur (element in music score) ids (T,). + durations(Tensor, optional (int64)): + Groundtruth of duration (T,). + pitch(Tensor, optional): + Groundtruth of token-averaged pitch (T, 1). + energy(Tensor, optional): + Groundtruth of token-averaged energy (T, 1). + alpha(float, optional): + Alpha to control the speed. + use_teacher_forcing(bool, optional): + Whether to use teacher forcing. + If true, groundtruth of duration, pitch and energy will be used. + spk_emb(Tensor, optional, optional): + peaker embedding vector (spk_embed_dim,). (Default value = None) + spk_id(Tensor, optional(int64), optional): + spk ids (1,). (Default value = None) + + Returns: + + """ + xs = paddle.cast(text, 'int64').unsqueeze(0) + note = paddle.cast(note, 'int64').unsqueeze(0) + note_dur = paddle.cast(note_dur, 'float32').unsqueeze(0) + is_slur = paddle.cast(is_slur, 'int64').unsqueeze(0) + d, p, e = durations, pitch, energy + # setup batch axis + ilens = paddle.shape(xs)[1] + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + if use_teacher_forcing: + # use groundtruth of duration, pitch, and energy + ds = d.unsqueeze(0) if d is not None else None + ps = p.unsqueeze(0) if p is not None else None + es = e.unsqueeze(0) if e is not None else None + + # (1, L, odim) + _, outs, d_outs, p_outs, e_outs, _ = self._forward( + xs=xs, + note=note, + note_dur=note_dur, + is_slur=is_slur, + ilens=ilens, + ds=ds, + ps=ps, + es=es, + spk_emb=spk_emb, + spk_id=spk_id, + is_inference=True) + else: + # (1, L, odim) + _, outs, d_outs, p_outs, e_outs, _ = self._forward( + xs=xs, + note=note, + note_dur=note_dur, + is_slur=is_slur, + ilens=ilens, + is_inference=True, + alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, ) + + if e_outs is None: + e_outs = [None] + + return outs[0], d_outs[0], p_outs[0], e_outs[0] + + +class FastSpeech2MIDILoss(FastSpeech2Loss): + """Loss function module for DiffSinger.""" + + def __init__(self, use_masking: bool=True, + use_weighted_masking: bool=False): + """Initialize feed-forward Transformer loss module. + Args: + use_masking (bool): + Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): + Whether to weighted masking in loss calculation. 
+ """ + assert check_argument_types() + super().__init__(use_masking, use_weighted_masking) + + def forward( + self, + after_outs: paddle.Tensor, + before_outs: paddle.Tensor, + d_outs: paddle.Tensor, + p_outs: paddle.Tensor, + e_outs: paddle.Tensor, + ys: paddle.Tensor, + ds: paddle.Tensor, + ps: paddle.Tensor, + es: paddle.Tensor, + ilens: paddle.Tensor, + olens: paddle.Tensor, + spk_logits: paddle.Tensor=None, + spk_ids: paddle.Tensor=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, + paddle.Tensor, ]: + """Calculate forward propagation. + + Args: + after_outs(Tensor): + Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): + Batch of outputs before postnets (B, Lmax, odim). + d_outs(Tensor): + Batch of outputs of duration predictor (B, Tmax). + p_outs(Tensor): + Batch of outputs of pitch predictor (B, Lmax, 1). + e_outs(Tensor): + Batch of outputs of energy predictor (B, Lmax, 1). + ys(Tensor): + Batch of target features (B, Lmax, odim). + ds(Tensor): + Batch of durations (B, Tmax). + ps(Tensor): + Batch of target frame-averaged pitch (B, Lmax, 1). + es(Tensor): + Batch of target frame-averaged energy (B, Lmax, 1). + ilens(Tensor): + Batch of the lengths of each input (B,). + olens(Tensor): + Batch of the lengths of each target (B,). + spk_logits(Option[Tensor]): + Batch of outputs after speaker classifier (B, Lmax, num_spk) + spk_ids(Option[Tensor]): + Batch of target spk_id (B,) + + + Returns: + + + """ + l1_loss = duration_loss = pitch_loss = energy_loss = speaker_loss = ssim_loss = 0.0 + + # apply mask to remove padded part + if self.use_masking: + # make feature for ssim loss + out_pad_masks = make_pad_mask(olens).unsqueeze(-1) + before_outs_ssim = masked_fill(before_outs, out_pad_masks, 0.0) + if not paddle.equal_all(after_outs, before_outs): + after_outs_ssim = masked_fill(after_outs, out_pad_masks, 0.0) + ys_ssim = masked_fill(ys, out_pad_masks, 0.0) + + out_masks = make_non_pad_mask(olens).unsqueeze(-1) + before_outs = before_outs.masked_select( + out_masks.broadcast_to(before_outs.shape)) + if not paddle.equal_all(after_outs, before_outs): + after_outs = after_outs.masked_select( + out_masks.broadcast_to(after_outs.shape)) + ys = ys.masked_select(out_masks.broadcast_to(ys.shape)) + duration_masks = make_non_pad_mask(ilens) + d_outs = d_outs.masked_select( + duration_masks.broadcast_to(d_outs.shape)) + ds = ds.masked_select(duration_masks.broadcast_to(ds.shape)) + pitch_masks = out_masks + p_outs = p_outs.masked_select( + pitch_masks.broadcast_to(p_outs.shape)) + ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape)) + if e_outs is not None: + e_outs = e_outs.masked_select( + pitch_masks.broadcast_to(e_outs.shape)) + es = es.masked_select(pitch_masks.broadcast_to(es.shape)) + + if spk_logits is not None and spk_ids is not None: + batch_size = spk_ids.shape[0] + spk_ids = paddle.repeat_interleave(spk_ids, spk_logits.shape[1], + None) + spk_logits = paddle.reshape(spk_logits, + [-1, spk_logits.shape[-1]]) + mask_index = spk_logits.abs().sum(axis=1) != 0 + spk_ids = spk_ids[mask_index] + spk_logits = spk_logits[mask_index] + + # calculate loss + l1_loss = self.l1_criterion(before_outs, ys) + ssim_loss = 1.0 - ssim( + before_outs_ssim.unsqueeze(1), ys_ssim.unsqueeze(1)) + if not paddle.equal_all(after_outs, before_outs): + l1_loss += self.l1_criterion(after_outs, ys) + ssim_loss += ( + 1.0 - ssim(after_outs_ssim.unsqueeze(1), ys_ssim.unsqueeze(1))) + l1_loss = l1_loss * 0.5 + ssim_loss = ssim_loss * 0.5 + + duration_loss = 
self.duration_criterion(d_outs, ds) + pitch_loss = self.l1_criterion(p_outs, ps) + if e_outs is not None: + energy_loss = self.l1_criterion(e_outs, es) + + if spk_logits is not None and spk_ids is not None: + speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size + + # make weighted mask and apply it + if self.use_weighted_masking: + out_masks = make_non_pad_mask(olens).unsqueeze(-1) + out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast( + dtype=paddle.float32).sum( + axis=1, keepdim=True) + out_weights /= ys.shape[0] * ys.shape[2] + duration_masks = make_non_pad_mask(ilens) + duration_weights = (duration_masks.cast(dtype=paddle.float32) / + duration_masks.cast(dtype=paddle.float32).sum( + axis=1, keepdim=True)) + duration_weights /= ds.shape[0] + + # apply weight + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select( + out_masks.broadcast_to(l1_loss.shape)).sum() + ssim_loss = ssim_loss.multiply(out_weights) + ssim_loss = ssim_loss.masked_select( + out_masks.broadcast_to(ssim_loss.shape)).sum() + duration_loss = (duration_loss.multiply(duration_weights) + .masked_select(duration_masks).sum()) + pitch_masks = out_masks + pitch_weights = out_weights + pitch_loss = pitch_loss.multiply(pitch_weights) + pitch_loss = pitch_loss.masked_select( + pitch_masks.broadcast_to(pitch_loss.shape)).sum() + if e_outs is not None: + energy_loss = energy_loss.multiply(pitch_weights) + energy_loss = energy_loss.masked_select( + pitch_masks.broadcast_to(energy_loss.shape)).sum() + + return l1_loss, ssim_loss, duration_loss, pitch_loss, energy_loss, speaker_loss diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index c790c8cb2..8ce19795e 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -93,6 +93,7 @@ class FastSpeech2(nn.Layer): transformer_dec_dropout_rate: float=0.1, transformer_dec_positional_dropout_rate: float=0.1, transformer_dec_attn_dropout_rate: float=0.1, + transformer_activation_type: str="relu", # for conformer conformer_pos_enc_layer_type: str="rel_pos", conformer_self_attn_layer_type: str="rel_selfattn", @@ -200,6 +201,8 @@ class FastSpeech2(nn.Layer): Dropout rate after decoder positional encoding. transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module. + transformer_activation_type (str): + Activation function type in transformer. conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer. conformer_self_attn_layer_type (str): @@ -250,7 +253,7 @@ class FastSpeech2(nn.Layer): Kernel size of energy embedding. energy_embed_dropout_rate (float): Dropout rate for energy embedding. - stop_gradient_from_energy_predictor(bool): + stop_gradient_from_energy_predictor (bool): Whether to stop gradient from energy predictor to encoder. spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None, @@ -269,7 +272,7 @@ class FastSpeech2(nn.Layer): How to integrate tone embedding. init_type (str): How to initialize transformer parameters. - init_enc_alpha (float): + init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder. init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder. 
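Both `DiffusionLoss` and `FastSpeech2MIDILoss` above rely on the same masking idiom when `use_masking=True`: build a non-padding mask from the frame lengths, `masked_select` the valid positions from prediction and target, and only then apply the L1 criterion. A standalone sketch of that idiom follows; the tensors and lengths are toy values for illustration only.

```python
import paddle
from paddle import nn

from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask

# Toy shapes: B=2 utterances, Lmax=5 frames, odim=3 mel bins.
olens = paddle.to_tensor([5, 3], dtype='int64')   # true frame lengths per utterance
noise_pred = paddle.randn([2, 5, 3])              # (B, Lmax, odim)
noise_target = paddle.randn([2, 5, 3])

# (B, Lmax) -> (B, Lmax, 1), then broadcast over the feature dimension.
out_masks = make_non_pad_mask(olens).unsqueeze(-1)
pred = noise_pred.masked_select(out_masks.broadcast_to(noise_pred.shape))
target = noise_target.masked_select(out_masks.broadcast_to(noise_target.shape))

# 'mean' reduction over valid frames only, as in DiffusionLoss(use_masking=True).
l1 = nn.L1Loss(reduction='mean')(pred, target)
```

The `use_weighted_masking` branch instead keeps `reduction="none"` and multiplies per-frame weights normalized by each utterance's own length, so that short and long utterances contribute equally to the batch loss.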
@@ -344,7 +347,8 @@ class FastSpeech2(nn.Layer): normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + activation_type=transformer_activation_type) elif encoder_type == "conformer": self.encoder = ConformerEncoder( idim=idim, @@ -453,7 +457,8 @@ class FastSpeech2(nn.Layer): normalize_before=decoder_normalize_before, concat_after=decoder_concat_after, positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + activation_type=conformer_activation_type, ) elif decoder_type == "conformer": self.decoder = ConformerEncoder( idim=0, diff --git a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py index 71b9753c8..5901c805a 100644 --- a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py +++ b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py @@ -11,8 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import random - import paddle import paddle.nn.functional as F import paddleaudio.functional as audio_F @@ -46,7 +44,8 @@ class LinearNorm(nn.Layer): self.linear_layer.weight, gain=_calculate_gain(w_init_gain)) def forward(self, x: paddle.Tensor): - return self.linear_layer(x) + out = self.linear_layer(x) + return out class ConvNorm(nn.Layer): @@ -82,85 +81,6 @@ class ConvNorm(nn.Layer): return conv_signal -class CausualConv(nn.Layer): - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: int=1, - stride: int=1, - padding: int=1, - dilation: int=1, - bias: bool=True, - w_init_gain: str='linear', - param=None): - super().__init__() - if padding is None: - assert (kernel_size % 2 == 1) - padding = int(dilation * (kernel_size - 1) / 2) * 2 - else: - self.padding = padding * 2 - self.conv = nn.Conv1D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - dilation=dilation, - bias_attr=bias) - - xavier_uniform_( - self.conv.weight, gain=_calculate_gain(w_init_gain, param=param)) - - def forward(self, x: paddle.Tensor): - x = self.conv(x) - x = x[:, :, :-self.padding] - return x - - -class CausualBlock(nn.Layer): - def __init__(self, - hidden_dim: int, - n_conv: int=3, - dropout_p: float=0.2, - activ: str='lrelu'): - super().__init__() - self.blocks = nn.LayerList([ - self._get_conv( - hidden_dim=hidden_dim, - dilation=3**i, - activ=activ, - dropout_p=dropout_p) for i in range(n_conv) - ]) - - def forward(self, x): - for block in self.blocks: - res = x - x = block(x) - x += res - return x - - def _get_conv(self, - hidden_dim: int, - dilation: int, - activ: str='lrelu', - dropout_p: float=0.2): - layers = [ - CausualConv( - in_channels=hidden_dim, - out_channels=hidden_dim, - kernel_size=3, - padding=dilation, - dilation=dilation), _get_activation_fn(activ), - nn.BatchNorm1D(hidden_dim), nn.Dropout(p=dropout_p), CausualConv( - in_channels=hidden_dim, - out_channels=hidden_dim, - kernel_size=3, - padding=1, - dilation=1), _get_activation_fn(activ), nn.Dropout(p=dropout_p) - ] - return nn.Sequential(*layers) - - class ConvBlock(nn.Layer): def __init__(self, hidden_dim: int, @@ -264,13 +184,14 @@ class 
Attention(nn.Layer): """ Args: query: - decoder output (batch, n_mel_channels * n_frames_per_step) + decoder output (B, n_mel_channels * n_frames_per_step) processed_memory: processed encoder outputs (B, T_in, attention_dim) attention_weights_cat: cumulative and prev. att weights (B, 2, max_time) Returns: - Tensor: alignment (batch, max_time) + Tensor: + alignment (B, max_time) """ processed_query = self.query_layer(query.unsqueeze(1)) @@ -316,144 +237,6 @@ class Attention(nn.Layer): return attention_context, attention_weights -class ForwardAttentionV2(nn.Layer): - def __init__(self, - attention_rnn_dim: int, - embedding_dim: int, - attention_dim: int, - attention_location_n_filters: int, - attention_location_kernel_size: int): - super().__init__() - self.query_layer = LinearNorm( - in_dim=attention_rnn_dim, - out_dim=attention_dim, - bias=False, - w_init_gain='tanh') - self.memory_layer = LinearNorm( - in_dim=embedding_dim, - out_dim=attention_dim, - bias=False, - w_init_gain='tanh') - self.v = LinearNorm(in_dim=attention_dim, out_dim=1, bias=False) - self.location_layer = LocationLayer( - attention_n_filters=attention_location_n_filters, - attention_kernel_size=attention_location_kernel_size, - attention_dim=attention_dim) - self.score_mask_value = -float(1e20) - - def get_alignment_energies(self, - query: paddle.Tensor, - processed_memory: paddle.Tensor, - attention_weights_cat: paddle.Tensor): - """ - Args: - query: - decoder output (batch, n_mel_channels * n_frames_per_step) - processed_memory: - processed encoder outputs (B, T_in, attention_dim) - attention_weights_cat: - prev. and cumulative att weights (B, 2, max_time) - Returns: - Tensor: alignment (batch, max_time) - """ - - processed_query = self.query_layer(query.unsqueeze(1)) - processed_attention_weights = self.location_layer(attention_weights_cat) - energies = self.v( - paddle.tanh(processed_query + processed_attention_weights + - processed_memory)) - - energies = energies.squeeze(-1) - return energies - - def forward(self, - attention_hidden_state: paddle.Tensor, - memory: paddle.Tensor, - processed_memory: paddle.Tensor, - attention_weights_cat: paddle.Tensor, - mask: paddle.Tensor, - log_alpha: paddle.Tensor): - """ - Args: - attention_hidden_state: - attention rnn last output - memory: - encoder outputs - processed_memory: - processed encoder outputs - attention_weights_cat: - previous and cummulative attention weights - mask: - binary mask for padded data - """ - log_energy = self.get_alignment_energies( - query=attention_hidden_state, - processed_memory=processed_memory, - attention_weights_cat=attention_weights_cat) - - if mask is not None: - log_energy[:] = paddle.where( - mask, - paddle.full(log_energy.shape, self.score_mask_value, - log_energy.dtype), log_energy) - log_alpha_shift_padded = [] - max_time = log_energy.shape[1] - for sft in range(2): - shifted = log_alpha[:, :max_time - sft] - shift_padded = F.pad(shifted, (sft, 0), 'constant', - self.score_mask_value) - log_alpha_shift_padded.append(shift_padded.unsqueeze(2)) - - biased = paddle.logsumexp(paddle.conat(log_alpha_shift_padded, 2), 2) - log_alpha_new = biased + log_energy - attention_weights = F.softmax(log_alpha_new, axis=1) - attention_context = paddle.bmm(attention_weights.unsqueeze(1), memory) - attention_context = attention_context.squeeze(1) - - return attention_context, attention_weights, log_alpha_new - - -class PhaseShuffle2D(nn.Layer): - def __init__(self, n: int=2): - super().__init__() - self.n = n - self.random = random.Random(1) - - def 
forward(self, x: paddle.Tensor, move: int=None): - # x.size = (B, C, M, L) - if move is None: - move = self.random.randint(-self.n, self.n) - - if move == 0: - return x - else: - left = x[:, :, :, :move] - right = x[:, :, :, move:] - shuffled = paddle.concat([right, left], axis=3) - return shuffled - - -class PhaseShuffle1D(nn.Layer): - def __init__(self, n: int=2): - super().__init__() - self.n = n - self.random = random.Random(1) - - def forward(self, x: paddle.Tensor, move: int=None): - # x.size = (B, C, M, L) - if move is None: - move = self.random.randint(-self.n, self.n) - - if move == 0: - return x - else: - left = x[:, :, :move] - right = x[:, :, move:] - shuffled = paddle.concat([right, left], axis=2) - - return shuffled - - class MFCC(nn.Layer): def __init__(self, n_mfcc: int=40, n_mels: int=80): super().__init__() @@ -473,7 +256,6 @@ class MFCC(nn.Layer): # -> (channel, time, n_mfcc).tranpose(...) mfcc = paddle.matmul(mel_specgram.transpose([0, 2, 1]), self.dct_mat).transpose([0, 2, 1]) - # unpack batch if unsqueezed: mfcc = mfcc.squeeze(0) diff --git a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py index 48de8af1f..251974572 100644 --- a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py +++ b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py @@ -99,7 +99,7 @@ class ASRCNN(nn.Layer): unmask_futre_steps (int): unmasking future step size. Return: - mask (paddle.BoolTensor): + Tensor (paddle.Tensor(bool)): mask future timesteps mask[i, j] = True if i > j + unmask_future_steps else False """ index_tensor = paddle.arange(out_length).unsqueeze(0).expand( @@ -194,9 +194,8 @@ class ASRS2S(nn.Layer): logit_outputs += [logit] alignments += [attention_weights] - hidden_outputs, logit_outputs, alignments = \ - self.parse_decoder_outputs( - hidden_outputs, logit_outputs, alignments) + hidden_outputs, logit_outputs, alignments = self.parse_decoder_outputs( + hidden_outputs, logit_outputs, alignments) return hidden_outputs, logit_outputs, alignments diff --git a/paddlespeech/t2s/models/starganv2_vc/JDCNet/model.py b/paddlespeech/t2s/models/starganv2_vc/JDCNet/model.py index 118b8f0e2..5938e6a7c 100644 --- a/paddlespeech/t2s/models/starganv2_vc/JDCNet/model.py +++ b/paddlespeech/t2s/models/starganv2_vc/JDCNet/model.py @@ -33,10 +33,9 @@ class JDCNet(nn.Layer): super().__init__() self.seq_len = seq_len self.num_class = num_class - - # input = (b, 1, 31, 513), b = batch size + # input: (B, num_class, T, n_mels) self.conv_block = nn.Sequential( - # out: (b, 64, 31, 513) + # output: (B, out_channels, T, n_mels) nn.Conv2D( in_channels=1, out_channels=64, @@ -45,127 +44,99 @@ class JDCNet(nn.Layer): bias_attr=False), nn.BatchNorm2D(num_features=64), nn.LeakyReLU(leaky_relu_slope), - # (b, 64, 31, 513) + # out: (B, out_channels, T, n_mels) nn.Conv2D(64, 64, 3, padding=1, bias_attr=False), ) - - # res blocks - # (b, 128, 31, 128) + # output: (B, out_channels, T, n_mels // 2) self.res_block1 = ResBlock(in_channels=64, out_channels=128) - # (b, 192, 31, 32) + # output: (B, out_channels, T, n_mels // 4) self.res_block2 = ResBlock(in_channels=128, out_channels=192) - # (b, 256, 31, 8) + # output: (B, out_channels, T, n_mels // 8) self.res_block3 = ResBlock(in_channels=192, out_channels=256) - # pool block self.pool_block = nn.Sequential( nn.BatchNorm2D(num_features=256), nn.LeakyReLU(leaky_relu_slope), - # (b, 256, 31, 2) + # (B, num_features, T, 2) nn.MaxPool2D(kernel_size=(1, 4)), nn.Dropout(p=0.5), ) - - # maxpool 
layers (for auxiliary network inputs) - # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2) - self.maxpool1 = nn.MaxPool2D(kernel_size=(1, 40)) - # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2) - self.maxpool2 = nn.MaxPool2D(kernel_size=(1, 20)) - # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2) - self.maxpool3 = nn.MaxPool2D(kernel_size=(1, 10)) - - # in = (b, 640, 31, 2), out = (b, 256, 31, 2) - self.detector_conv = nn.Sequential( - nn.Conv2D( - in_channels=640, - out_channels=256, - kernel_size=1, - bias_attr=False), - nn.BatchNorm2D(256), - nn.LeakyReLU(leaky_relu_slope), - nn.Dropout(p=0.5), ) - - # input: (b, 31, 512) - resized from (b, 256, 31, 2) - # output: (b, 31, 512) + # input: (B, T, input_size), resized from (B, input_size // 2, T, 2) + # output: (B, T, input_size) self.bilstm_classifier = nn.LSTM( input_size=512, hidden_size=256, time_major=False, direction='bidirectional') - - # input: (b, 31, 512) - resized from (b, 256, 31, 2) - # output: (b, 31, 512) - self.bilstm_detector = nn.LSTM( - input_size=512, - hidden_size=256, - time_major=False, - direction='bidirectional') - - # input: (b * 31, 512) - # output: (b * 31, num_class) + # input: (B * T, in_features) + # output: (B * T, num_class) self.classifier = nn.Linear( in_features=512, out_features=self.num_class) - # input: (b * 31, 512) - # output: (b * 31, 2) - binary classifier - self.detector = nn.Linear(in_features=512, out_features=2) - # initialize weights self.apply(self.init_weights) def get_feature_GAN(self, x: paddle.Tensor): - seq_len = x.shape[-2] - x = x.astype(paddle.float32).transpose([0, 1, 3, 2] if len(x.shape) == 4 - else [0, 2, 1]) - + """Calculate feature_GAN. + Args: + x(Tensor(float32)): + Shape (B, num_class, n_mels, T). + Returns: + Tensor: + Shape (B, num_features, n_mels // 8, T). + """ + x = x.astype(paddle.float32) + x = x.transpose([0, 1, 3, 2] if len(x.shape) == 4 else [0, 2, 1]) convblock_out = self.conv_block(x) - resblock1_out = self.res_block1(convblock_out) resblock2_out = self.res_block2(resblock1_out) resblock3_out = self.res_block3(resblock2_out) poolblock_out = self.pool_block[0](resblock3_out) poolblock_out = self.pool_block[1](poolblock_out) - - return poolblock_out.transpose([0, 1, 3, 2] if len(poolblock_out.shape) - == 4 else [0, 2, 1]) + GAN_feature = poolblock_out.transpose([0, 1, 3, 2] if len( + poolblock_out.shape) == 4 else [0, 2, 1]) + return GAN_feature def forward(self, x: paddle.Tensor): - """ + """Calculate forward propagation. + Args: + x(Tensor(float32)): + Shape (B, num_class, n_mels, seq_len). Returns: - classification_prediction, detection_prediction - sizes: (b, 31, 722), (b, 31, 2) + Tensor: + classifier output consists of predicted pitch classes per frame. + Shape: (B, seq_len, num_class). + Tensor: + GAN_feature. Shape: (B, num_features, n_mels // 8, seq_len) + Tensor: + poolblock_out. 
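`get_feature_GAN` above swaps the last two axes with a permutation that depends on the tensor rank, so the same code path handles both batched spectrograms and 3-D inputs. A tiny sketch of that pattern; the helper name is made up for illustration.

```python
import paddle

def swap_last_two(x: paddle.Tensor) -> paddle.Tensor:
    # (B, C, H, W) -> (B, C, W, H) for 4-D input, (B, H, W) -> (B, W, H) for 3-D
    perm = [0, 1, 3, 2] if len(x.shape) == 4 else [0, 2, 1]
    return x.transpose(perm)

print(swap_last_two(paddle.zeros([2, 1, 80, 31])).shape)  # [2, 1, 31, 80]
print(swap_last_two(paddle.zeros([2, 80, 31])).shape)     # [2, 31, 80]
```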
Shape (B, seq_len, 512) """ ############################### # forward pass for classifier # ############################### + # (B, num_class, n_mels, T) -> (B, num_class, T, n_mels) x = x.transpose([0, 1, 3, 2] if len(x.shape) == 4 else [0, 2, 1]).astype(paddle.float32) convblock_out = self.conv_block(x) - resblock1_out = self.res_block1(convblock_out) resblock2_out = self.res_block2(resblock1_out) resblock3_out = self.res_block3(resblock2_out) - poolblock_out = self.pool_block[0](resblock3_out) poolblock_out = self.pool_block[1](poolblock_out) GAN_feature = poolblock_out.transpose([0, 1, 3, 2] if len( poolblock_out.shape) == 4 else [0, 2, 1]) poolblock_out = self.pool_block[2](poolblock_out) - - # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512) + # (B, 256, seq_len, 2) => (B, seq_len, 256, 2) => (B, seq_len, 512) classifier_out = poolblock_out.transpose([0, 2, 1, 3]).reshape( (-1, self.seq_len, 512)) self.bilstm_classifier.flatten_parameters() - classifier_out, _ = self.bilstm_classifier( - classifier_out) # ignore the hidden states - - classifier_out = classifier_out.reshape((-1, 512)) # (b * 31, 512) + # ignore the hidden states + classifier_out, _ = self.bilstm_classifier(classifier_out) + # (B * seq_len, 512) + classifier_out = classifier_out.reshape((-1, 512)) classifier_out = self.classifier(classifier_out) + # (B, seq_len, num_class) classifier_out = classifier_out.reshape( - (-1, self.seq_len, self.num_class)) # (b, 31, num_class) - - # sizes: (b, 31, 722), (b, 31, 2) - # classifier output consists of predicted pitch classes per frame - # detector output consists of: (isvoice, notvoice) estimates per frame + (-1, self.seq_len, self.num_class)) return paddle.abs(classifier_out.squeeze()), GAN_feature, poolblock_out @staticmethod @@ -188,10 +159,9 @@ class ResBlock(nn.Layer): def __init__(self, in_channels: int, out_channels: int, - leaky_relu_slope=0.01): + leaky_relu_slope: float=0.01): super().__init__() self.downsample = in_channels != out_channels - # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper self.pre_conv = nn.Sequential( nn.BatchNorm2D(num_features=in_channels), @@ -215,7 +185,6 @@ class ResBlock(nn.Layer): kernel_size=3, padding=1, bias_attr=False), ) - # 1 x 1 convolution layer to match the feature dimensions self.conv1by1 = None if self.downsample: @@ -226,6 +195,13 @@ class ResBlock(nn.Layer): bias_attr=False) def forward(self, x: paddle.Tensor): + """Calculate forward propagation. + Args: + x(Tensor(float32)): Shape (B, in_channels, T, n_mels). + Returns: + Tensor: + The residual output, Shape (B, out_channels, T, n_mels // 2). + """ x = self.pre_conv(x) if self.downsample: x = self.conv(x) + self.conv1by1(x) diff --git a/paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py b/paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py index 96e9eda81..2a96b30c6 100644 --- a/paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py +++ b/paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py @@ -19,31 +19,36 @@ This work is licensed under the Creative Commons Attribution-NonCommercial http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. 
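The classifier path above flattens the pooled feature map `(B, 256, seq_len, 2)` into `(B, seq_len, 512)` before the bidirectional LSTM and the per-frame linear classifier. A shape-check sketch under assumed sizes (722 pitch classes is taken from the original comments; batch size and weights are placeholders):

```python
import paddle
from paddle import nn

B, seq_len, num_class = 2, 31, 722
pool_out = paddle.randn([B, 256, seq_len, 2])                   # (B, 256, seq_len, 2)
lstm_in = pool_out.transpose([0, 2, 1, 3]).reshape((-1, seq_len, 512))

bilstm = nn.LSTM(input_size=512, hidden_size=256,
                 time_major=False, direction='bidirectional')
lstm_out, _ = bilstm(lstm_in)                                   # (B, seq_len, 512)

logits = nn.Linear(512, num_class)(lstm_out.reshape((-1, 512)))
logits = logits.reshape((-1, seq_len, num_class))               # (B, seq_len, num_class)
print(logits.shape)
```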
""" -# import copy import math import paddle import paddle.nn.functional as F from paddle import nn -from paddlespeech.utils.initialize import _calculate_gain -from paddlespeech.utils.initialize import xavier_uniform_ - -# from munch import Munch - class DownSample(nn.Layer): def __init__(self, layer_type: str): super().__init__() self.layer_type = layer_type - def forward(self, x): + def forward(self, x: paddle.Tensor): + """Calculate forward propagation. + Args: + x(Tensor(float32)): Shape (B, dim_in, n_mels, T). + Returns: + Tensor: + layer_type == 'none': Shape (B, dim_in, n_mels, T) + layer_type == 'timepreserve': Shape (B, dim_in, n_mels // 2, T) + layer_type == 'half': Shape (B, dim_in, n_mels // 2, T // 2) + """ if self.layer_type == 'none': return x elif self.layer_type == 'timepreserve': - return F.avg_pool2d(x, (2, 1)) + out = F.avg_pool2d(x, (2, 1)) + return out elif self.layer_type == 'half': - return F.avg_pool2d(x, 2) + out = F.avg_pool2d(x, 2) + return out else: raise RuntimeError( 'Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' @@ -55,13 +60,24 @@ class UpSample(nn.Layer): super().__init__() self.layer_type = layer_type - def forward(self, x): + def forward(self, x: paddle.Tensor): + """Calculate forward propagation. + Args: + x(Tensor(float32)): Shape (B, dim_in, n_mels, T). + Returns: + Tensor: + layer_type == 'none': Shape (B, dim_in, n_mels, T) + layer_type == 'timepreserve': Shape (B, dim_in, n_mels * 2, T) + layer_type == 'half': Shape (B, dim_in, n_mels * 2, T * 2) + """ if self.layer_type == 'none': return x elif self.layer_type == 'timepreserve': - return F.interpolate(x, scale_factor=(2, 1), mode='nearest') + out = F.interpolate(x, scale_factor=(2, 1), mode='nearest') + return out elif self.layer_type == 'half': - return F.interpolate(x, scale_factor=2, mode='nearest') + out = F.interpolate(x, scale_factor=2, mode='nearest') + return out else: raise RuntimeError( 'Got unexpected upsampletype %s, expected is [none, timepreserve, half]' @@ -127,9 +143,19 @@ class ResBlk(nn.Layer): return x def forward(self, x: paddle.Tensor): + """Calculate forward propagation. + Args: + x(Tensor(float32)): Shape (B, dim_in, n_mels, T). + Returns: + Tensor: + downsample == 'none': Shape (B, dim_in, n_mels, T). + downsample == 'timepreserve': Shape (B, dim_out, T, n_mels // 2, T). + downsample == 'half': Shape (B, dim_out, T, n_mels // 2, T // 2). + """ x = self._shortcut(x) + self._residual(x) # unit variance - return x / math.sqrt(2) + out = x / math.sqrt(2) + return out class AdaIN(nn.Layer): @@ -140,12 +166,21 @@ class AdaIN(nn.Layer): self.fc = nn.Linear(style_dim, num_features * 2) def forward(self, x: paddle.Tensor, s: paddle.Tensor): + """Calculate forward propagation. + Args: + x(Tensor(float32)): Shape (B, style_dim, n_mels, T). + s(Tensor(float32)): Shape (style_dim, ). + Returns: + Tensor: + Shape (B, style_dim, T, n_mels, T). 
+ """ if len(s.shape) == 1: s = s[None] h = self.fc(s) h = h.reshape((h.shape[0], h.shape[1], 1, 1)) gamma, beta = paddle.split(h, 2, axis=1) - return (1 + gamma) * self.norm(x) + beta + out = (1 + gamma) * self.norm(x) + beta + return out class AdainResBlk(nn.Layer): @@ -162,6 +197,7 @@ class AdainResBlk(nn.Layer): self.upsample = UpSample(layer_type=upsample) self.learned_sc = dim_in != dim_out self._build_weights(dim_in, dim_out, style_dim) + self.layer_type = upsample def _build_weights(self, dim_in: int, dim_out: int, style_dim: int=64): self.conv1 = nn.Conv2D( @@ -204,6 +240,18 @@ class AdainResBlk(nn.Layer): return x def forward(self, x: paddle.Tensor, s: paddle.Tensor): + """Calculate forward propagation. + Args: + x(Tensor(float32)): + Shape (B, dim_in, n_mels, T). + s(Tensor(float32)): + Shape (64,). + Returns: + Tensor: + upsample == 'none': Shape (B, dim_out, T, n_mels, T). + upsample == 'timepreserve': Shape (B, dim_out, T, n_mels * 2, T). + upsample == 'half': Shape (B, dim_out, T, n_mels * 2, T * 2). + """ out = self._residual(x, s) if self.w_hpf == 0: out = (out + self._shortcut(x)) / math.sqrt(2) @@ -219,7 +267,8 @@ class HighPass(nn.Layer): def forward(self, x: paddle.Tensor): filter = self.filter.unsqueeze(0).unsqueeze(1).tile( [x.shape[1], 1, 1, 1]) - return F.conv2d(x, filter, padding=1, groups=x.shape[1]) + out = F.conv2d(x, filter, padding=1, groups=x.shape[1]) + return out class Generator(nn.Layer): @@ -276,12 +325,10 @@ class Generator(nn.Layer): w_hpf=w_hpf, upsample=_downtype)) # stack-like dim_in = dim_out - # bottleneck blocks (encoder) for _ in range(2): self.encode.append( ResBlk(dim_in=dim_out, dim_out=dim_out, normalize=True)) - # F0 blocks if F0_channel != 0: self.decode.insert(0, @@ -290,7 +337,6 @@ class Generator(nn.Layer): dim_out=dim_out, style_dim=style_dim, w_hpf=w_hpf)) - # bottleneck blocks (decoder) for _ in range(2): self.decode.insert(0, @@ -299,7 +345,6 @@ class Generator(nn.Layer): dim_out=dim_out + int(F0_channel / 2), style_dim=style_dim, w_hpf=w_hpf)) - if F0_channel != 0: self.F0_conv = nn.Sequential( ResBlk( @@ -307,7 +352,6 @@ class Generator(nn.Layer): dim_out=int(F0_channel / 2), normalize=True, downsample="half"), ) - if w_hpf > 0: self.hpf = HighPass(w_hpf) @@ -316,26 +360,44 @@ class Generator(nn.Layer): s: paddle.Tensor, masks: paddle.Tensor=None, F0: paddle.Tensor=None): + """Calculate forward propagation. + Args: + x(Tensor(float32)): + Shape (B, 1, n_mels, T). + s(Tensor(float32)): + Shape (64,). + masks: + None. + F0: + Shape (B, num_features(256), n_mels // 8, T). + Returns: + Tensor: + output of generator. 
Shape (B, 1, n_mels, T // 4 * 4) + """ x = self.stem(x) cache = {} + # output: (B, max_conv_dim, n_mels // 16, T // 4) for block in self.encode: if (masks is not None) and (x.shape[2] in [32, 64, 128]): cache[x.shape[2]] = x x = block(x) - if F0 is not None: + # input: (B, num_features(256), n_mels // 8, T) + # output: (B, num_features(256) // 2, n_mels // 16, T // 2) F0 = self.F0_conv(F0) + # output: (B, num_features(256) // 2, n_mels // 16, T // 4) F0 = F.adaptive_avg_pool2d(F0, [x.shape[-2], x.shape[-1]]) x = paddle.concat([x, F0], axis=1) - + # input: (B, max_conv_dim+num_features(256) // 2, n_mels // 16, T // 4 * 4) + # output: (B, dim_in, n_mels, T // 4 * 4) for block in self.decode: x = block(x, s) if (masks is not None) and (x.shape[2] in [32, 64, 128]): mask = masks[0] if x.shape[2] in [32] else masks[1] mask = F.interpolate(mask, size=x.shape[2], mode='bilinear') x = x + self.hpf(mask * cache[x.shape[2]]) - - return self.to_out(x) + out = self.to_out(x) + return out class MappingNetwork(nn.Layer): @@ -366,14 +428,25 @@ class MappingNetwork(nn.Layer): ]) def forward(self, z: paddle.Tensor, y: paddle.Tensor): + """Calculate forward propagation. + Args: + z(Tensor(float32)): + Shape (B, 1, n_mels, T). + y(Tensor(float32)): + speaker label. Shape (B, ). + Returns: + Tensor: + Shape (style_dim, ) + """ + h = self.shared(z) out = [] for layer in self.unshared: out += [layer(h)] - # (batch, num_domains, style_dim) + # (B, num_domains, style_dim) out = paddle.stack(out, axis=1) idx = paddle.arange(y.shape[0]) - # (batch, style_dim) + # (style_dim, ) s = out[idx, y] return s @@ -419,15 +492,25 @@ class StyleEncoder(nn.Layer): self.unshared.append(nn.Linear(dim_out, style_dim)) def forward(self, x: paddle.Tensor, y: paddle.Tensor): + """Calculate forward propagation. + Args: + x(Tensor(float32)): + Shape (B, 1, n_mels, T). + y(Tensor(float32)): + speaker label. Shape (B, ). + Returns: + Tensor: + Shape (style_dim, ) + """ h = self.shared(x) h = h.reshape((h.shape[0], -1)) out = [] for layer in self.unshared: out += [layer(h)] - # (batch, num_domains, style_dim) + # (B, num_domains, style_dim) out = paddle.stack(out, axis=1) idx = paddle.arange(y.shape[0]) - # (batch, style_dim) + # (style_dim,) s = out[idx, y] return s @@ -454,25 +537,12 @@ class Discriminator(nn.Layer): self.num_domains = num_domains def forward(self, x: paddle.Tensor, y: paddle.Tensor): - return self.dis(x, y) + out = self.dis(x, y) + return out def classifier(self, x: paddle.Tensor): - return self.cls.get_feature(x) - - -class LinearNorm(nn.Layer): - def __init__(self, - in_dim: int, - out_dim: int, - bias: bool=True, - w_init_gain: str='linear'): - super().__init__() - self.linear_layer = nn.Linear(in_dim, out_dim, bias_attr=bias) - xavier_uniform_( - self.linear_layer.weight, gain=_calculate_gain(w_init_gain)) - - def forward(self, x): - return self.linear_layer(x) + out = self.cls.get_feature(x) + return out class Discriminator2D(nn.Layer): @@ -520,97 +590,13 @@ class Discriminator2D(nn.Layer): def get_feature(self, x: paddle.Tensor): out = self.main(x) - # (batch, num_domains) + # (B, num_domains) out = out.reshape((out.shape[0], -1)) return out def forward(self, x: paddle.Tensor, y: paddle.Tensor): out = self.get_feature(x) idx = paddle.arange(y.shape[0]) - # (batch) + # (B,) ? 
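Both `MappingNetwork` and `StyleEncoder` above produce one candidate style per domain and then use `out[idx, y]` to pick the row matching each sample's speaker label, yielding one style vector per sample. A minimal sketch of that selection:

```python
import paddle

B, num_domains, style_dim = 4, 20, 64
out = paddle.randn([B, num_domains, style_dim])   # one candidate style per domain
y = paddle.to_tensor([3, 0, 7, 3])                # speaker / domain label per sample

idx = paddle.arange(B)
s = out[idx, y]                                   # pick one style per sample
print(s.shape)                                    # [4, 64]
```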
out = out[idx, y] return out - - -''' -def build_model(args, F0_model: nn.Layer, ASR_model: nn.Layer): - generator = Generator( - dim_in=args.dim_in, - style_dim=args.style_dim, - max_conv_dim=args.max_conv_dim, - w_hpf=args.w_hpf, - F0_channel=args.F0_channel) - mapping_network = MappingNetwork( - latent_dim=args.latent_dim, - style_dim=args.style_dim, - num_domains=args.num_domains, - hidden_dim=args.max_conv_dim) - style_encoder = StyleEncoder( - dim_in=args.dim_in, - style_dim=args.style_dim, - num_domains=args.num_domains, - max_conv_dim=args.max_conv_dim) - discriminator = Discriminator( - dim_in=args.dim_in, - num_domains=args.num_domains, - max_conv_dim=args.max_conv_dim, - n_repeat=args.n_repeat) - generator_ema = copy.deepcopy(generator) - mapping_network_ema = copy.deepcopy(mapping_network) - style_encoder_ema = copy.deepcopy(style_encoder) - - nets = Munch( - generator=generator, - mapping_network=mapping_network, - style_encoder=style_encoder, - discriminator=discriminator, - f0_model=F0_model, - asr_model=ASR_model) - - nets_ema = Munch( - generator=generator_ema, - mapping_network=mapping_network_ema, - style_encoder=style_encoder_ema) - - return nets, nets_ema - - -class StarGANv2VC(nn.Layer): - def __init__( - self, - # spk_num - num_domains: int=20, - dim_in: int=64, - style_dim: int=64, - latent_dim: int=16, - max_conv_dim: int=512, - n_repeat: int=4, - w_hpf: int=0, - F0_channel: int=256): - super().__init__() - - self.generator = Generator( - dim_in=dim_in, - style_dim=style_dim, - max_conv_dim=max_conv_dim, - w_hpf=w_hpf, - F0_channel=F0_channel) - # MappingNetwork and StyleEncoder are used to generate reference_embeddings - self.mapping_network = MappingNetwork( - latent_dim=latent_dim, - style_dim=style_dim, - num_domains=num_domains, - hidden_dim=max_conv_dim) - - self.style_encoder = StyleEncoder( - dim_in=dim_in, - style_dim=style_dim, - num_domains=num_domains, - max_conv_dim=max_conv_dim) - - self.discriminator = Discriminator( - dim_in=dim_in, - num_domains=num_domains, - max_conv_dim=max_conv_dim, - repeat_num=n_repeat) -''' diff --git a/paddlespeech/t2s/models/vits/duration_predictor.py b/paddlespeech/t2s/models/vits/duration_predictor.py index b0bb68d0f..12177fbc2 100644 --- a/paddlespeech/t2s/models/vits/duration_predictor.py +++ b/paddlespeech/t2s/models/vits/duration_predictor.py @@ -155,12 +155,10 @@ class StochasticDurationPredictor(nn.Layer): z_u, z1 = paddle.split(z_q, [1, 1], 1) u = F.sigmoid(z_u) * x_mask z0 = (w - u) * x_mask - logdet_tot_q += paddle.sum( - (F.log_sigmoid(z_u) + F.log_sigmoid(-z_u)) * x_mask, [1, 2]) - logq = (paddle.sum(-0.5 * - (math.log(2 * math.pi) + - (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q) - + tmp1 = (F.log_sigmoid(z_u) + F.log_sigmoid(-z_u)) * x_mask + logdet_tot_q += paddle.sum(tmp1, [1, 2]) + tmp2 = -0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask + logq = (paddle.sum(tmp2, [1, 2]) - logdet_tot_q) logdet_tot = 0 z0, logdet = self.log_flow(z0, x_mask) logdet_tot += logdet @@ -168,8 +166,8 @@ class StochasticDurationPredictor(nn.Layer): for flow in self.flows: z, logdet = flow(z, x_mask, g=x, inverse=inverse) logdet_tot = logdet_tot + logdet - nll = (paddle.sum(0.5 * (math.log(2 * math.pi) + - (z**2)) * x_mask, [1, 2]) - logdet_tot) + tmp3 = 0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask + nll = (paddle.sum(tmp3, [1, 2]) - logdet_tot) # (B,) return nll + logq else: diff --git a/paddlespeech/t2s/models/vits/flow.py b/paddlespeech/t2s/models/vits/flow.py index 7593eb727..94df968a0 100644 --- 
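The `StochasticDurationPredictor` refactor above only splits the masked Gaussian negative log-likelihood into intermediate `tmp` tensors; the quantity itself is unchanged: `nll = sum(0.5 * (log(2π) + z²) * mask) - logdet`. A small numeric sketch with placeholder shapes:

```python
import math
import paddle

z = paddle.randn([2, 2, 10])       # latent after the flows, (B, C, T_text)
x_mask = paddle.ones([2, 1, 10])   # text mask, (B, 1, T_text)
logdet_tot = paddle.zeros([2])     # accumulated flow log-determinants, (B,)

tmp = 0.5 * (math.log(2 * math.pi) + z**2) * x_mask
nll = paddle.sum(tmp, [1, 2]) - logdet_tot   # (B,)
print(nll.shape)
```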
a/paddlespeech/t2s/models/vits/flow.py +++ b/paddlespeech/t2s/models/vits/flow.py @@ -334,11 +334,12 @@ class ConvFlow(nn.Layer): unnorm_widths = h[..., :self.bins] / denom unnorm_heights = h[..., self.bins:2 * self.bins] / denom unnorm_derivatives = h[..., 2 * self.bins:] + xb, logdet_abs = piecewise_rational_quadratic_transform( - xb, - unnorm_widths, - unnorm_heights, - unnorm_derivatives, + inputs=xb, + unnormalized_widths=unnorm_widths, + unnormalized_heights=unnorm_heights, + unnormalized_derivatives=unnorm_derivatives, inverse=inverse, tails="linear", tail_bound=self.tail_bound, ) diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py index 7ecc51619..427ae09ed 100644 --- a/paddlespeech/t2s/models/vits/generator.py +++ b/paddlespeech/t2s/models/vits/generator.py @@ -279,6 +279,10 @@ class VITSGenerator(nn.Layer): from paddlespeech.t2s.models.vits.monotonic_align import maximum_path self.maximum_path = maximum_path + self.pad1d = nn.Pad1D( + padding=[1, 0], + mode='constant', + data_format='NLC', ) def forward( self, @@ -367,8 +371,9 @@ class VITSGenerator(nn.Layer): # (B, H, T_text) s_p_sq_r = paddle.exp(-2 * logs_p) # (B, 1, T_text) + tmp1 = -0.5 * math.log(2 * math.pi) - logs_p neg_x_ent_1 = paddle.sum( - -0.5 * math.log(2 * math.pi) - logs_p, + tmp1, [1], keepdim=True, ) # (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text) @@ -380,8 +385,9 @@ class VITSGenerator(nn.Layer): z_p.transpose([0, 2, 1]), (m_p * s_p_sq_r), ) # (B, 1, T_text) + tmp2 = -0.5 * (m_p**2) * s_p_sq_r neg_x_ent_4 = paddle.sum( - -0.5 * (m_p**2) * s_p_sq_r, + tmp2, [1], keepdim=True, ) # (B, T_feats, T_text) @@ -399,7 +405,6 @@ class VITSGenerator(nn.Layer): w = attn.sum(2) dur_nll = self.duration_predictor(x, x_mask, w=w, g=g) dur_nll = dur_nll / paddle.sum(x_mask) - # expand the length to match with the feature sequence # (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats) m_p = paddle.matmul(attn.squeeze(1), @@ -507,8 +512,9 @@ class VITSGenerator(nn.Layer): # (B, H, T_text) s_p_sq_r = paddle.exp(-2 * logs_p) # (B, 1, T_text) + tmp3 = -0.5 * math.log(2 * math.pi) - logs_p neg_x_ent_1 = paddle.sum( - -0.5 * math.log(2 * math.pi) - logs_p, + tmp3, [1], keepdim=True, ) # (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text) @@ -520,8 +526,9 @@ class VITSGenerator(nn.Layer): z_p.transpose([0, 2, 1]), (m_p * s_p_sq_r), ) # (B, 1, T_text) + tmp4 = -0.5 * (m_p**2) * s_p_sq_r neg_x_ent_4 = paddle.sum( - -0.5 * (m_p**2) * s_p_sq_r, + tmp4, [1], keepdim=True, ) # (B, T_feats, T_text) @@ -552,8 +559,9 @@ class VITSGenerator(nn.Layer): y_lengths = paddle.cast( paddle.clip(paddle.sum(dur, [1, 2]), min=1), dtype='int64') y_mask = make_non_pad_mask(y_lengths).unsqueeze(1) - attn_mask = paddle.unsqueeze(x_mask, 2) * paddle.unsqueeze(y_mask, - -1) + tmp_a = paddle.cast(paddle.unsqueeze(x_mask, 2), dtype='int64') + tmp_b = paddle.cast(paddle.unsqueeze(y_mask, -1), dtype='int64') + attn_mask = tmp_a * tmp_b attn = self._generate_path(dur, attn_mask) # expand the length to match with the feature sequence @@ -685,5 +693,6 @@ class VITSGenerator(nn.Layer): ''' path = paddle.cast(path, dtype='float32') - path = path - F.pad(path, [0, 0, 1, 0, 0, 0])[:, :-1] + pad_tmp = self.pad1d(path)[:, :-1] + path = path - pad_tmp return path.unsqueeze(1).transpose([0, 1, 3, 2]) * mask diff --git a/paddlespeech/t2s/models/vits/transform.py b/paddlespeech/t2s/models/vits/transform.py index ea333dcff..917f28430 100644 --- a/paddlespeech/t2s/models/vits/transform.py +++ 
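`_generate_path` above now builds its one-step shift with `nn.Pad1D` (padding the length axis of an `NLC` tensor) instead of a multi-axis `F.pad` call, in line with the other dygraph-to-static adjustments in this file. The subtraction turns a cumulative coverage mask into a per-step one-hot alignment; a toy example:

```python
import paddle
from paddle import nn

pad1d = nn.Pad1D(padding=[1, 0], mode='constant', data_format='NLC')

path = paddle.to_tensor([[[0., 0., 1.],
                          [0., 1., 1.],
                          [1., 1., 1.]]])    # cumulative coverage, (N, L, C)
shifted = pad1d(path)[:, :-1]                # shift by one step along L
print((path - shifted).numpy())              # one-hot per step
# [[[0. 0. 1.]
#   [0. 1. 0.]
#   [1. 0. 0.]]]
```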
b/paddlespeech/t2s/models/vits/transform.py @@ -18,6 +18,7 @@ This code is based on https://github.com/bayesiains/nflows. """ import numpy as np import paddle +from paddle import nn from paddle.nn import functional as F from paddlespeech.t2s.modules.nets_utils import paddle_gather @@ -60,8 +61,12 @@ def piecewise_rational_quadratic_transform( def mask_preprocess(x, mask): + # bins.dtype = int32 B, C, T, bins = paddle.shape(x) - new_x = paddle.zeros([mask.sum(), bins]) + mask_int = paddle.cast(mask, dtype='int64') + # paddle.sum 输入是 int32 或 bool 的时候,输出是 int64 + # paddle.zeros (fill_constant) 的 shape 会被强制转成 int32 类型 + new_x = paddle.zeros([paddle.sum(mask_int), bins]) for i in range(bins): new_x[:, i] = x[:, :, :, i][mask] return new_x @@ -87,9 +92,9 @@ def unconstrained_rational_quadratic_spline( outputs = paddle.zeros(inputs.shape) logabsdet = paddle.zeros(inputs.shape) if tails == "linear": - unnormalized_derivatives = F.pad( - unnormalized_derivatives, - pad=[0] * (len(unnormalized_derivatives.shape) - 1) * 2 + [1, 1]) + # 注意 padding 的参数顺序 + pad2d = nn.Pad2D(padding=[1, 1, 0, 0], mode='constant') + unnormalized_derivatives = pad2d(unnormalized_derivatives) constant = np.log(np.exp(1 - min_derivative) - 1) unnormalized_derivatives[..., 0] = constant unnormalized_derivatives[..., -1] = constant @@ -142,6 +147,10 @@ def rational_quadratic_spline( # for dygraph to static # if paddle.min(inputs) < left or paddle.max(inputs) > right: # raise ValueError("Input to a transform is not within its domain") + pad1d = nn.Pad1D( + padding=[1, 0], + mode='constant', + data_format='NCL', ) num_bins = unnormalized_widths.shape[-1] # for dygraph to static @@ -153,11 +162,8 @@ def rational_quadratic_spline( widths = F.softmax(unnormalized_widths, axis=-1) widths = min_bin_width + (1 - min_bin_width * num_bins) * widths cumwidths = paddle.cumsum(widths, axis=-1) - cumwidths = F.pad( - cumwidths, - pad=[0] * (len(cumwidths.shape) - 1) * 2 + [1, 0], - mode="constant", - value=0.0) + + cumwidths = pad1d(cumwidths.unsqueeze(0)).squeeze() cumwidths = (right - left) * cumwidths + left cumwidths[..., 0] = left cumwidths[..., -1] = right @@ -168,11 +174,7 @@ def rational_quadratic_spline( heights = F.softmax(unnormalized_heights, axis=-1) heights = min_bin_height + (1 - min_bin_height * num_bins) * heights cumheights = paddle.cumsum(heights, axis=-1) - cumheights = F.pad( - cumheights, - pad=[0] * (len(cumheights.shape) - 1) * 2 + [1, 0], - mode="constant", - value=0.0) + cumheights = pad1d(cumheights.unsqueeze(0)).squeeze() cumheights = (top - bottom) * cumheights + bottom cumheights[..., 0] = bottom cumheights[..., -1] = top @@ -242,4 +244,7 @@ def rational_quadratic_spline( def _searchsorted(bin_locations, inputs, eps=1e-6): bin_locations[..., -1] += eps - return paddle.sum(inputs[..., None] >= bin_locations, axis=-1) - 1 + mask = inputs[..., None] >= bin_locations + mask_int = paddle.cast(mask, dtype='int64') + out = paddle.sum(mask_int, axis=-1) - 1 + return out diff --git a/paddlespeech/t2s/modules/activation.py b/paddlespeech/t2s/modules/activation.py index 8d8cd62ef..f1c099b76 100644 --- a/paddlespeech/t2s/modules/activation.py +++ b/paddlespeech/t2s/modules/activation.py @@ -37,7 +37,8 @@ def get_activation(act, **kwargs): "selu": paddle.nn.SELU, "leakyrelu": paddle.nn.LeakyReLU, "swish": paddle.nn.Swish, - "glu": GLU + "glu": GLU, + "gelu": paddle.nn.GELU, } return activation_funcs[act](**kwargs) diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py 
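The `_searchsorted` change above casts the boolean comparison to `int64` before summing; the bin index is still "how many bin edges are less than or equal to the input, minus one" (the real function also nudges the last edge by `eps`). For example:

```python
import paddle

bin_locations = paddle.to_tensor([0.0, 0.25, 0.5, 0.75, 1.0])
inputs = paddle.to_tensor([0.1, 0.6, 0.99])

mask = inputs[..., None] >= bin_locations            # (3, 5), bool
idx = paddle.sum(paddle.cast(mask, 'int64'), axis=-1) - 1
print(idx.numpy())                                   # [0 2 3]
```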
b/paddlespeech/t2s/modules/conformer/encoder_layer.py index 26a354565..6c416088b 100644 --- a/paddlespeech/t2s/modules/conformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -113,7 +113,6 @@ class EncoderLayer(nn.Layer): x, pos_emb = x_input[0], x_input[1] else: x, pos_emb = x_input, None - skip_layer = False # with stochastic depth, residual connection `x + f(x)` becomes # `x <- x + 1 / (1 - p) * f(x)` at training time. @@ -121,14 +120,12 @@ class EncoderLayer(nn.Layer): if self.training and self.stochastic_depth_rate > 0: skip_layer = paddle.rand(1).item() < self.stochastic_depth_rate stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) - if skip_layer: if cache is not None: x = paddle.concat([cache, x], axis=1) if pos_emb is not None: return (x, pos_emb), mask return x, mask - # whether to use macaron style if self.feed_forward_macaron is not None: residual = x @@ -138,7 +135,6 @@ class EncoderLayer(nn.Layer): self.feed_forward_macaron(x)) if not self.normalize_before: x = self.norm_ff_macaron(x) - # multi-headed self-attention module residual = x if self.normalize_before: diff --git a/paddlespeech/t2s/modules/diffnet.py b/paddlespeech/t2s/modules/diffnet.py new file mode 100644 index 000000000..25339daea --- /dev/null +++ b/paddlespeech/t2s/modules/diffnet.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
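The conformer `EncoderLayer` cleanup above keeps its stochastic-depth behaviour: during training the layer is skipped with probability `p`, and when it is kept its residual branches are scaled by `1 / (1 - p)` so the expectation matches inference. A minimal sketch with a hypothetical helper:

```python
import paddle

def stochastic_residual(layer_fn, x, p: float, training: bool = True):
    skip = training and p > 0 and paddle.rand([1]).item() < p
    if skip:
        return x
    coeff = 1.0 / (1 - p) if (training and p > 0) else 1.0
    return x + coeff * layer_fn(x)

x = paddle.randn([2, 10, 256])
y = stochastic_residual(lambda t: 0.1 * t, x, p=0.1)
print(y.shape)
```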
+import math + +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out +from paddlespeech.utils.initialize import kaiming_normal_ +from paddlespeech.utils.initialize import kaiming_uniform_ +from paddlespeech.utils.initialize import uniform_ +from paddlespeech.utils.initialize import zeros_ + + +def Conv1D(*args, **kwargs): + layer = nn.Conv1D(*args, **kwargs) + # Initialize the weight to be consistent with the official + kaiming_normal_(layer.weight) + + # Initialization is consistent with torch + if layer.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(layer.weight) + if fan_in != 0: + bound = 1 / math.sqrt(fan_in) + uniform_(layer.bias, -bound, bound) + return layer + + +# Initialization is consistent with torch +def Linear(*args, **kwargs): + layer = nn.Linear(*args, **kwargs) + kaiming_uniform_(layer.weight, a=math.sqrt(5)) + if layer.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(layer.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + uniform_(layer.bias, -bound, bound) + return layer + + +class ResidualBlock(nn.Layer): + """ResidualBlock + + Args: + encoder_hidden (int, optional): + Input feature size of the 1D convolution, by default 256 + residual_channels (int, optional): + Feature size of the residual output(and also the input), by default 256 + gate_channels (int, optional): + Output feature size of the 1D convolution, by default 512 + kernel_size (int, optional): + Kernel size of the 1D convolution, by default 3 + dilation (int, optional): + Dilation of the 1D convolution, by default 4 + """ + + def __init__(self, + encoder_hidden: int=256, + residual_channels: int=256, + gate_channels: int=512, + kernel_size: int=3, + dilation: int=4): + super().__init__() + self.dilated_conv = Conv1D( + residual_channels, + gate_channels, + kernel_size, + padding=dilation, + dilation=dilation) + self.diffusion_projection = Linear(residual_channels, residual_channels) + self.conditioner_projection = Conv1D(encoder_hidden, gate_channels, 1) + self.output_projection = Conv1D(residual_channels, gate_channels, 1) + + def forward( + self, + x: paddle.Tensor, + diffusion_step: paddle.Tensor, + cond: paddle.Tensor, ): + """Calculate forward propagation. + Args: + spec (Tensor(float32)): input feature. (B, residual_channels, T) + diffusion_step (Tensor(int64)): The timestep input (adding noise step). (B,) + cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2 encoder output). 
(B, residual_channels, T) + + Returns: + x (Tensor(float32)): output (B, residual_channels, T) + + """ + diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1) + cond = self.conditioner_projection(cond) + y = x + diffusion_step + + y = self.dilated_conv(y) + cond + + gate, filter = paddle.chunk(y, 2, axis=1) + y = F.sigmoid(gate) * paddle.tanh(filter) + + y = self.output_projection(y) + residual, skip = paddle.chunk(y, 2, axis=1) + return (x + residual) / math.sqrt(2.0), skip + + +class SinusoidalPosEmb(nn.Layer): + """Positional embedding + """ + + def __init__(self, dim: int=256): + super().__init__() + self.dim = dim + + def forward(self, x: paddle.Tensor): + x = paddle.cast(x, 'float32') + half_dim = self.dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = paddle.exp(paddle.arange(half_dim) * -emb) + emb = x[:, None] * emb[None, :] + emb = paddle.concat([emb.sin(), emb.cos()], axis=-1) + return emb + + +class DiffNet(nn.Layer): + """A Mel-Spectrogram Denoiser + + Args: + in_channels (int, optional): + Number of channels of the input mel-spectrogram, by default 80 + out_channels (int, optional): + Number of channels of the output mel-spectrogram, by default 80 + kernel_size (int, optional): + Kernel size of the residual blocks inside, by default 3 + layers (int, optional): + Number of residual blocks inside, by default 20 + stacks (int, optional): + The number of groups to split the residual blocks into, by default 5 + Within each group, the dilation of the residual block grows exponentially. + residual_channels (int, optional): + Residual channel of the residual blocks, by default 256 + gate_channels (int, optional): + Gate channel of the residual blocks, by default 512 + skip_channels (int, optional): + Skip channel of the residual blocks, by default 256 + aux_channels (int, optional): + Auxiliary channel of the residual blocks, by default 256 + dropout (float, optional): + Dropout of the residual blocks, by default 0. 
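`SinusoidalPosEmb` above turns the integer diffusion step into a fixed sinusoidal embedding, exactly as in transformer position encodings. A standalone version of the same computation, for illustration:

```python
import math
import paddle

def sinusoidal_emb(t: paddle.Tensor, dim: int = 256) -> paddle.Tensor:
    t = paddle.cast(t, 'float32')
    half_dim = dim // 2
    freq = math.log(10000) / (half_dim - 1)
    freq = paddle.exp(paddle.arange(half_dim, dtype='float32') * -freq)
    arg = t[:, None] * freq[None, :]
    return paddle.concat([arg.sin(), arg.cos()], axis=-1)   # (B, dim)

print(sinusoidal_emb(paddle.to_tensor([0, 10, 500])).shape)  # [3, 256]
```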
+ bias (bool, optional): + Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): + Whether to use weight norm in all convolutions, by default False + """ + + def __init__( + self, + in_channels: int=80, + out_channels: int=80, + kernel_size: int=3, + layers: int=20, + stacks: int=5, + residual_channels: int=256, + gate_channels: int=512, + skip_channels: int=256, + aux_channels: int=256, + dropout: float=0., + bias: bool=True, + use_weight_norm: bool=False, + init_type: str="kaiming_normal", ): + super().__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + self.layers = layers + self.aux_channels = aux_channels + self.residual_channels = residual_channels + self.gate_channels = gate_channels + self.kernel_size = kernel_size + self.dilation_cycle_length = layers // stacks + self.skip_channels = skip_channels + + self.input_projection = Conv1D(self.in_channels, self.residual_channels, + 1) + self.diffusion_embedding = SinusoidalPosEmb(self.residual_channels) + dim = self.residual_channels + self.mlp = nn.Sequential( + Linear(dim, dim * 4), nn.Mish(), Linear(dim * 4, dim)) + self.residual_layers = nn.LayerList([ + ResidualBlock( + encoder_hidden=self.aux_channels, + residual_channels=self.residual_channels, + gate_channels=self.gate_channels, + kernel_size=self.kernel_size, + dilation=2**(i % self.dilation_cycle_length)) + for i in range(self.layers) + ]) + self.skip_projection = Conv1D(self.residual_channels, + self.skip_channels, 1) + self.output_projection = Conv1D(self.residual_channels, + self.out_channels, 1) + zeros_(self.output_projection.weight) + + def forward( + self, + spec: paddle.Tensor, + diffusion_step: paddle.Tensor, + cond: paddle.Tensor, ): + """Calculate forward propagation. + Args: + spec (Tensor(float32)): The input mel-spectrogram. (B, n_mel, T) + diffusion_step (Tensor(int64)): The timestep input (adding noise step). (B,) + cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2 encoder output). 
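`DiffNet` above groups its residual blocks into `stacks` cycles whose dilation grows exponentially inside each cycle, via `dilation = 2 ** (i % dilation_cycle_length)`. With the defaults `layers=20, stacks=5` the pattern is:

```python
layers, stacks = 20, 5
cycle = layers // stacks            # dilation_cycle_length = 4
dilations = [2 ** (i % cycle) for i in range(layers)]
print(dilations)                    # [1, 2, 4, 8] repeated 5 times
```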
(B, D_enc_out, T) + + Returns: + x (Tensor(float32)): pred noise (B, n_mel, T) + + """ + x = spec + x = self.input_projection(x) # x [B, residual_channel, T] + + x = F.relu(x) + diffusion_step = self.diffusion_embedding(diffusion_step) + diffusion_step = self.mlp(diffusion_step) + skip = [] + for layer_id, layer in enumerate(self.residual_layers): + x, skip_connection = layer( + x=x, + diffusion_step=diffusion_step, + cond=cond, ) + skip.append(skip_connection) + x = paddle.sum( + paddle.stack(skip), axis=0) / math.sqrt(len(self.residual_layers)) + x = self.skip_projection(x) + x = F.relu(x) + x = self.output_projection(x) # [B, 80, T] + return x diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py index be684ce38..3222a8032 100644 --- a/paddlespeech/t2s/modules/diffusion.py +++ b/paddlespeech/t2s/modules/diffusion.py @@ -17,6 +17,7 @@ from typing import Callable from typing import Optional from typing import Tuple +import numpy as np import paddle import ppdiffusers from paddle import nn @@ -27,170 +28,6 @@ from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock -class WaveNetDenoiser(nn.Layer): - """A Mel-Spectrogram Denoiser modified from WaveNet - - Args: - in_channels (int, optional): - Number of channels of the input mel-spectrogram, by default 80 - out_channels (int, optional): - Number of channels of the output mel-spectrogram, by default 80 - kernel_size (int, optional): - Kernel size of the residual blocks inside, by default 3 - layers (int, optional): - Number of residual blocks inside, by default 20 - stacks (int, optional): - The number of groups to split the residual blocks into, by default 5 - Within each group, the dilation of the residual block grows exponentially. - residual_channels (int, optional): - Residual channel of the residual blocks, by default 256 - gate_channels (int, optional): - Gate channel of the residual blocks, by default 512 - skip_channels (int, optional): - Skip channel of the residual blocks, by default 256 - aux_channels (int, optional): - Auxiliary channel of the residual blocks, by default 256 - dropout (float, optional): - Dropout of the residual blocks, by default 0. 
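Assuming this branch is installed, the new denoiser can be smoke-tested end to end; the shapes below follow the `forward` docstring (`spec` and the predicted noise are `(B, n_mel, T)`, the condition is the encoder output), while the batch and time sizes are placeholders.

```python
import paddle
from paddlespeech.t2s.modules.diffnet import DiffNet

net = DiffNet(in_channels=80, out_channels=80, layers=20, stacks=5, aux_channels=256)
spec = paddle.randn([2, 80, 100])     # noisy mel, (B, n_mel, T)
step = paddle.to_tensor([10, 10])     # diffusion step per sample, (B,)
cond = paddle.randn([2, 256, 100])    # encoder output, (B, aux_channels, T)

noise_pred = net(spec, step, cond)    # predicted noise, (B, n_mel, T)
print(noise_pred.shape)               # [2, 80, 100]
```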
- bias (bool, optional): - Whether to use bias in residual blocks, by default True - use_weight_norm (bool, optional): - Whether to use weight norm in all convolutions, by default False - """ - - def __init__( - self, - in_channels: int=80, - out_channels: int=80, - kernel_size: int=3, - layers: int=20, - stacks: int=5, - residual_channels: int=256, - gate_channels: int=512, - skip_channels: int=256, - aux_channels: int=256, - dropout: float=0., - bias: bool=True, - use_weight_norm: bool=False, - init_type: str="kaiming_normal", ): - super().__init__() - - # initialize parameters - initialize(self, init_type) - - self.in_channels = in_channels - self.out_channels = out_channels - self.aux_channels = aux_channels - self.layers = layers - self.stacks = stacks - self.kernel_size = kernel_size - - assert layers % stacks == 0 - layers_per_stack = layers // stacks - - self.first_t_emb = nn.Sequential( - Timesteps( - residual_channels, - flip_sin_to_cos=False, - downscale_freq_shift=1), - nn.Linear(residual_channels, residual_channels * 4), - nn.Mish(), nn.Linear(residual_channels * 4, residual_channels)) - self.t_emb_layers = nn.LayerList([ - nn.Linear(residual_channels, residual_channels) - for _ in range(layers) - ]) - - self.first_conv = nn.Conv1D( - in_channels, residual_channels, 1, bias_attr=True) - self.first_act = nn.ReLU() - - self.conv_layers = nn.LayerList() - for layer in range(layers): - dilation = 2**(layer % layers_per_stack) - conv = WaveNetResidualBlock( - kernel_size=kernel_size, - residual_channels=residual_channels, - gate_channels=gate_channels, - skip_channels=skip_channels, - aux_channels=aux_channels, - dilation=dilation, - dropout=dropout, - bias=bias) - self.conv_layers.append(conv) - - final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True) - nn.initializer.Constant(0.0)(final_conv.weight) - self.last_conv_layers = nn.Sequential(nn.ReLU(), - nn.Conv1D( - skip_channels, - skip_channels, - 1, - bias_attr=True), - nn.ReLU(), final_conv) - - if use_weight_norm: - self.apply_weight_norm() - - def forward(self, x, t, c): - """Denoise mel-spectrogram. - - Args: - x(Tensor): - Shape (N, C_in, T), The input mel-spectrogram. - t(Tensor): - Shape (N), The timestep input. - c(Tensor): - Shape (N, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output). - - Returns: - Tensor: Shape (N, C_out, T), the denoised mel-spectrogram. - """ - assert c.shape[-1] == x.shape[-1] - - if t.shape[0] != x.shape[0]: - t = t.tile([x.shape[0]]) - t_emb = self.first_t_emb(t) - t_embs = [ - t_emb_layer(t_emb)[..., None] for t_emb_layer in self.t_emb_layers - ] - - x = self.first_conv(x) - x = self.first_act(x) - skips = 0 - for f, t in zip(self.conv_layers, t_embs): - x = x + t - x, s = f(x, c) - skips += s - skips *= math.sqrt(1.0 / len(self.conv_layers)) - - x = self.last_conv_layers(skips) - return x - - def apply_weight_norm(self): - """Recursively apply weight normalization to all the Convolution layers - in the sublayers. - """ - - def _apply_weight_norm(layer): - if isinstance(layer, (nn.Conv1D, nn.Conv2D)): - nn.utils.weight_norm(layer) - - self.apply(_apply_weight_norm) - - def remove_weight_norm(self): - """Recursively remove weight normalization from all the Convolution - layers in the sublayers. 
- """ - - def _remove_weight_norm(layer): - try: - nn.utils.remove_weight_norm(layer) - except ValueError: - pass - - self.apply(_remove_weight_norm) - - class GaussianDiffusion(nn.Layer): """Common Gaussian Diffusion Denoising Model Module @@ -207,6 +44,13 @@ class GaussianDiffusion(nn.Layer): beta schedule parameter for the scheduler, by default 'squaredcos_cap_v2' (cosine schedule). num_max_timesteps (int, optional): The max timestep transition from real to noise, by default None. + stretch (bool, optional): + Whether to stretch before diffusion, by defalut True. + min_values: (paddle.Tensor): + The minimum value of the feature to stretch. + max_values: (paddle.Tensor): + The maximum value of the feature to stretch. + Examples: >>> import paddle @@ -294,13 +138,17 @@ class GaussianDiffusion(nn.Layer): """ - def __init__(self, - denoiser: nn.Layer, - num_train_timesteps: Optional[int]=1000, - beta_start: Optional[float]=0.0001, - beta_end: Optional[float]=0.02, - beta_schedule: Optional[str]="squaredcos_cap_v2", - num_max_timesteps: Optional[int]=None): + def __init__( + self, + denoiser: nn.Layer, + num_train_timesteps: Optional[int]=1000, + beta_start: Optional[float]=0.0001, + beta_end: Optional[float]=0.02, + beta_schedule: Optional[str]="squaredcos_cap_v2", + num_max_timesteps: Optional[int]=None, + stretch: bool=True, + min_values: paddle.Tensor=None, + max_values: paddle.Tensor=None, ): super().__init__() self.num_train_timesteps = num_train_timesteps @@ -315,6 +163,22 @@ class GaussianDiffusion(nn.Layer): beta_end=beta_end, beta_schedule=beta_schedule) self.num_max_timesteps = num_max_timesteps + self.stretch = stretch + self.min_values = min_values + self.max_values = max_values + + def norm_spec(self, x): + """ + Linearly map x to [-1, 1] + Args: + x: [B, T, N] + """ + return (x - self.min_values) / (self.max_values - self.min_values + ) * 2 - 1 + + def denorm_spec(self, x): + return (x + 1) / 2 * (self.max_values - self.min_values + ) + self.min_values def forward(self, x: paddle.Tensor, cond: Optional[paddle.Tensor]=None ) -> Tuple[paddle.Tensor, paddle.Tensor]: @@ -333,6 +197,11 @@ class GaussianDiffusion(nn.Layer): The noises which is added to the input. """ + if self.stretch: + x = x.transpose((0, 2, 1)) + x = self.norm_spec(x) + x = x.transpose((0, 2, 1)) + noise_scheduler = self.noise_scheduler # Sample noise that we'll add to the mel-spectrograms @@ -360,7 +229,7 @@ class GaussianDiffusion(nn.Layer): num_inference_steps: Optional[int]=1000, strength: Optional[float]=None, scheduler_type: Optional[str]="ddpm", - clip_noise: Optional[bool]=True, + clip_noise: Optional[bool]=False, clip_noise_range: Optional[Tuple[float, float]]=(-1, 1), callback: Optional[Callable[[int, int, int, paddle.Tensor], None]]=None, @@ -369,9 +238,9 @@ class GaussianDiffusion(nn.Layer): Args: noise (Tensor): - The input tensor as a starting point for denoising. + The input tensor as a starting point for denoising. cond (Tensor, optional): - Conditional input for compute noises. + Conditional input for compute noises. (N, C_aux, T) ref_x (Tensor, optional): The real output for the denoising process to refer. num_inference_steps (int, optional): @@ -382,6 +251,7 @@ class GaussianDiffusion(nn.Layer): scheduler_type (str, optional): Noise scheduler for generate noises. Choose a great scheduler can skip many denoising step, by default 'ddpm'. + only support 'ddpm' now ! clip_noise (bool, optional): Whether to clip each denoised output, by default True. 
clip_noise_range (tuple, optional): @@ -425,48 +295,33 @@ class GaussianDiffusion(nn.Layer): # set timesteps scheduler.set_timesteps(num_inference_steps) - # prepare first noise variables noisy_input = noise - timesteps = scheduler.timesteps - if ref_x is not None: - init_timestep = None - if strength is None or strength < 0. or strength > 1.: - strength = None - if self.num_max_timesteps is not None: - strength = self.num_max_timesteps / self.num_train_timesteps - if strength is not None: - # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = scheduler.timesteps[t_start:] - num_inference_steps = num_inference_steps - t_start - noisy_input = scheduler.add_noise( - ref_x, noise, timesteps[:1].tile([noise.shape[0]])) - - # denoising loop + if self.stretch and ref_x is not None: + ref_x = ref_x.transpose((0, 2, 1)) + ref_x = self.norm_spec(ref_x) + ref_x = ref_x.transpose((0, 2, 1)) + + # for ddpm + timesteps = paddle.to_tensor( + np.flipud(np.arange(num_inference_steps))) + noisy_input = scheduler.add_noise(ref_x, noise, timesteps[0]) + denoised_output = noisy_input if clip_noise: n_min, n_max = clip_noise_range denoised_output = paddle.clip(denoised_output, n_min, n_max) - num_warmup_steps = len( - timesteps) - num_inference_steps * scheduler.order for i, t in enumerate(timesteps): denoised_output = scheduler.scale_model_input(denoised_output, t) - - # predict the noise residual noise_pred = self.denoiser(denoised_output, t, cond) - # compute the previous noisy sample x_t -> x_t-1 denoised_output = scheduler.step(noise_pred, t, denoised_output).prev_sample if clip_noise: denoised_output = paddle.clip(denoised_output, n_min, n_max) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and - (i + 1) % scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, len(timesteps), denoised_output) + if self.stretch: + denoised_output = denoised_output.transpose((0, 2, 1)) + denoised_output = self.denorm_spec(denoised_output) + denoised_output = denoised_output.transpose((0, 2, 1)) return denoised_output diff --git a/paddlespeech/t2s/modules/masked_fill.py b/paddlespeech/t2s/modules/masked_fill.py index b32222547..1445a926a 100644 --- a/paddlespeech/t2s/modules/masked_fill.py +++ b/paddlespeech/t2s/modules/masked_fill.py @@ -38,11 +38,9 @@ def masked_fill(xs: paddle.Tensor, value: Union[float, int]): # comment following line for converting dygraph to static graph. 
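The simplified inference path above supports DDPM only: timesteps run from `num_inference_steps - 1` down to 0, the reference mel is noised once at the first step, and each iteration calls the denoiser and `scheduler.step`. A stripped-down sketch of that loop with a dummy denoiser standing in for the real network; the `ppdiffusers` calls mirror the ones in the hunk, and all sizes are placeholders.

```python
import paddle
from ppdiffusers.schedulers import DDPMScheduler

num_steps = 60
scheduler = DDPMScheduler(num_train_timesteps=num_steps,
                          beta_schedule="squaredcos_cap_v2")
scheduler.set_timesteps(num_steps)

denoiser = lambda x, t, cond: paddle.zeros_like(x)   # dummy noise predictor
noise = paddle.randn([1, 80, 100])
ref_x = paddle.randn([1, 80, 100])

timesteps = paddle.arange(num_steps - 1, -1, -1)     # descending, ddpm-style
x = scheduler.add_noise(ref_x, noise, timesteps[0])
for t in timesteps:
    x = scheduler.scale_model_input(x, t)
    noise_pred = denoiser(x, t, None)
    x = scheduler.step(noise_pred, t, x).prev_sample
print(x.shape)   # [1, 80, 100]
```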
# assert is_broadcastable(xs.shape, mask.shape) is True - # bshape = paddle.broadcast_shape(xs.shape, mask.shape) bshape = broadcast_shape(xs.shape, mask.shape) mask.stop_gradient = True mask = mask.broadcast_to(bshape) - trues = paddle.ones_like(xs) * value mask = mask.cast(dtype=paddle.bool) xs = paddle.where(mask, trues, xs) diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 798e4dee8..99130acca 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -145,18 +145,18 @@ def make_pad_mask(lengths, xs=None, length_dim=-1): bs = paddle.shape(lengths)[0] if xs is None: - maxlen = lengths.max() + maxlen = paddle.cast(lengths.max(), dtype=bs.dtype) else: maxlen = paddle.shape(xs)[length_dim] seq_range = paddle.arange(0, maxlen, dtype=paddle.int64) + # VITS 最后一个 expand 的位置 seq_range_expand = seq_range.unsqueeze(0).expand([bs, maxlen]) seq_length_expand = lengths.unsqueeze(-1) mask = seq_range_expand >= seq_length_expand.cast(seq_range_expand.dtype) if xs is not None: assert paddle.shape(xs)[0] == bs, (paddle.shape(xs)[0], bs) - if length_dim < 0: length_dim = len(paddle.shape(xs)) + length_dim # ind = (:, None, ..., None, :, , None, ..., None) diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py index 4c2a67cc4..197f73595 100644 --- a/paddlespeech/t2s/modules/predictor/variance_predictor.py +++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py @@ -96,7 +96,7 @@ class VariancePredictor(nn.Layer): xs = f(xs) # (B, Tmax, 1) xs = self.linear(xs.transpose([0, 2, 1])) - + if x_masks is not None: xs = masked_fill(xs, x_masks, 0.0) return xs diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index e3c9a992a..3237be1b6 100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -103,7 +103,7 @@ class MultiHeadedAttention(nn.Layer): mask = paddle.logical_not(mask) # assume scores.dtype==paddle.float32, we only use "float32" here dtype = str(scores.dtype).split(".")[-1] - min_value = numpy.finfo(dtype).min + min_value = float(numpy.finfo(dtype).min) scores = masked_fill(scores, mask, min_value) # (batch, head, time1, time2) self.attn = softmax(scores) @@ -192,12 +192,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): x_padded = paddle.concat([zero_pad, x], axis=-1) x_padded = x_padded.reshape([b, h, t2 + 1, t1]) # only keep the positions from 0 to time2 - x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1] - + new_t = paddle.cast(paddle.floor(t2 / 2) + 1, dtype='int32') + x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :new_t] if self.zero_triu: ones = paddle.ones((t1, t2)) x = x * paddle.tril(ones, t2 - t1)[None, None, :, :] - return x def forward(self, query, key, value, pos_emb, mask): @@ -221,7 +220,6 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): q, k, v = self.forward_qkv(query, key, value) # (batch, time1, head, d_k) q = q.transpose([0, 2, 1, 3]) - n_batch_pos = paddle.shape(pos_emb)[0] p = self.linear_pos(pos_emb).reshape( [n_batch_pos, -1, self.h, self.d_k]) diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index 7ba301cbd..f90eb44a4 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -198,7 +198,8 @@ class 
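Two small dygraph-to-static fixes appear above: `make_pad_mask` builds the padding mask by comparing an index grid with the lengths, and the attention code now fills masked scores with an explicit Python `float` minimum. Both are combined in this short sketch:

```python
import numpy
import paddle

lengths = paddle.to_tensor([3, 1, 2])
maxlen = int(lengths.max())
seq_range = paddle.arange(maxlen).unsqueeze(0).expand([lengths.shape[0], maxlen])
pad_mask = seq_range >= lengths.unsqueeze(-1)     # True on padded positions

scores = paddle.randn([3, maxlen])
min_value = float(numpy.finfo('float32').min)     # note the explicit float() cast
scores = paddle.where(pad_mask, paddle.ones_like(scores) * min_value, scores)
print(scores.shape)   # [3, 3]
```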
RelPositionalEncoding(nn.Layer): x = x * self.xscale T = paddle.shape(x)[1] pe_size = paddle.shape(self.pe) - pos_emb = self.pe[:, pe_size[1] // 2 - T + 1:pe_size[1] // 2 + T, ] + tmp = paddle.cast(paddle.floor(pe_size[1] / 2), dtype='int32') + pos_emb = self.pe[:, tmp - T + 1:tmp + T, ] return self.dropout(x), self.dropout(pos_emb) diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py index f2aed5892..0fd94689d 100644 --- a/paddlespeech/t2s/modules/transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -15,6 +15,7 @@ from typing import List from typing import Union +import paddle from paddle import nn from paddlespeech.t2s.modules.activation import get_activation @@ -390,7 +391,13 @@ class TransformerEncoder(BaseEncoder): padding_idx=padding_idx, encoder_type="transformer") - def forward(self, xs, masks): + def forward(self, + xs: paddle.Tensor, + masks: paddle.Tensor, + note_emb: paddle.Tensor=None, + note_dur_emb: paddle.Tensor=None, + is_slur_emb: paddle.Tensor=None, + scale: int=16): """Encoder input sequence. Args: @@ -398,6 +405,12 @@ class TransformerEncoder(BaseEncoder): Input tensor (#batch, time, idim). masks(Tensor): Mask tensor (#batch, 1, time). + note_emb(Tensor): + Input tensor (#batch, time, attention_dim). + note_dur_emb(Tensor): + Input tensor (#batch, time, attention_dim). + is_slur_emb(Tensor): + Input tensor (#batch, time, attention_dim). Returns: Tensor: @@ -406,6 +419,8 @@ class TransformerEncoder(BaseEncoder): Mask tensor (#batch, 1, time). """ xs = self.embed(xs) + if note_emb is not None: + xs = scale * xs + note_emb + note_dur_emb + is_slur_emb xs, masks = self.encoders(xs, masks) if self.normalize_before: xs = self.after_norm(xs) diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py index 91d67ca58..a322becd0 100644 --- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py +++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py @@ -69,8 +69,8 @@ class MultiLayeredConv1d(nn.Layer): Tensor: Batch of output tensors (B, T, in_chans). """ x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) - return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose( - [0, 2, 1]) + out = self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose([0, 2, 1]) + return out class Conv1dLinear(nn.Layer): diff --git a/paddlespeech/t2s/modules/wavenet_denoiser.py b/paddlespeech/t2s/modules/wavenet_denoiser.py new file mode 100644 index 000000000..f84a0893d --- /dev/null +++ b/paddlespeech/t2s/modules/wavenet_denoiser.py @@ -0,0 +1,191 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
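`TransformerEncoder.forward` above gains optional SVS inputs: when note embeddings are provided, the phoneme embedding is scaled and the note, note-duration, and slur embeddings are added on top before the encoder stack. A shape-only sketch; the attention dimension used here is an assumption.

```python
import paddle

B, T, adim, scale = 2, 16, 384, 16
xs = paddle.randn([B, T, adim])            # output of self.embed(xs)
note_emb = paddle.randn([B, T, adim])
note_dur_emb = paddle.randn([B, T, adim])
is_slur_emb = paddle.randn([B, T, adim])

xs = scale * xs + note_emb + note_dur_emb + is_slur_emb
print(xs.shape)   # [2, 16, 384]
```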
+import math +from typing import Callable +from typing import Optional +from typing import Tuple + +import numpy as np +import paddle +import ppdiffusers +from paddle import nn +from ppdiffusers.models.embeddings import Timesteps +from ppdiffusers.schedulers import DDPMScheduler + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock + + +class WaveNetDenoiser(nn.Layer): + """A Mel-Spectrogram Denoiser modified from WaveNet + + Args: + in_channels (int, optional): + Number of channels of the input mel-spectrogram, by default 80 + out_channels (int, optional): + Number of channels of the output mel-spectrogram, by default 80 + kernel_size (int, optional): + Kernel size of the residual blocks inside, by default 3 + layers (int, optional): + Number of residual blocks inside, by default 20 + stacks (int, optional): + The number of groups to split the residual blocks into, by default 5 + Within each group, the dilation of the residual block grows exponentially. + residual_channels (int, optional): + Residual channel of the residual blocks, by default 256 + gate_channels (int, optional): + Gate channel of the residual blocks, by default 512 + skip_channels (int, optional): + Skip channel of the residual blocks, by default 256 + aux_channels (int, optional): + Auxiliary channel of the residual blocks, by default 256 + dropout (float, optional): + Dropout of the residual blocks, by default 0. + bias (bool, optional): + Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): + Whether to use weight norm in all convolutions, by default False + """ + + def __init__( + self, + in_channels: int=80, + out_channels: int=80, + kernel_size: int=3, + layers: int=20, + stacks: int=5, + residual_channels: int=256, + gate_channels: int=512, + skip_channels: int=256, + aux_channels: int=256, + dropout: float=0., + bias: bool=True, + use_weight_norm: bool=False, + init_type: str="kaiming_normal", ): + super().__init__() + + # initialize parameters + initialize(self, init_type) + + self.in_channels = in_channels + self.out_channels = out_channels + self.aux_channels = aux_channels + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + self.first_t_emb = nn.Sequential( + Timesteps( + residual_channels, + flip_sin_to_cos=False, + downscale_freq_shift=1), + nn.Linear(residual_channels, residual_channels * 4), + nn.Mish(), nn.Linear(residual_channels * 4, residual_channels)) + self.t_emb_layers = nn.LayerList([ + nn.Linear(residual_channels, residual_channels) + for _ in range(layers) + ]) + + self.first_conv = nn.Conv1D( + in_channels, residual_channels, 1, bias_attr=True) + self.first_act = nn.ReLU() + + self.conv_layers = nn.LayerList() + for layer in range(layers): + dilation = 2**(layer % layers_per_stack) + conv = WaveNetResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=aux_channels, + dilation=dilation, + dropout=dropout, + bias=bias) + self.conv_layers.append(conv) + + final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True) + nn.initializer.Constant(0.0)(final_conv.weight) + self.last_conv_layers = nn.Sequential(nn.ReLU(), + nn.Conv1D( + skip_channels, + skip_channels, + 1, + bias_attr=True), + nn.ReLU(), final_conv) + + if use_weight_norm: + self.apply_weight_norm() + + def 
forward(self, x: paddle.Tensor, t: paddle.Tensor, c: paddle.Tensor): + """Denoise mel-spectrogram. + + Args: + x(Tensor): + Shape (B, C_in, T), The input mel-spectrogram. + t(Tensor): + Shape (B), The timestep input. + c(Tensor): + Shape (B, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output). + + Returns: + Tensor: Shape (B, C_out, T), the pred noise. + """ + assert c.shape[-1] == x.shape[-1] + + if t.shape[0] != x.shape[0]: + t = t.tile([x.shape[0]]) + t_emb = self.first_t_emb(t) + t_embs = [ + t_emb_layer(t_emb)[..., None] for t_emb_layer in self.t_emb_layers + ] + + x = self.first_conv(x) + x = self.first_act(x) + skips = 0 + for f, t in zip(self.conv_layers, t_embs): + x = x + t + x, s = f(x, c) + skips += s + skips *= math.sqrt(1.0 / len(self.conv_layers)) + + x = self.last_conv_layers(skips) + return x + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) diff --git a/setup.py b/setup.py index 970cb984a..1545c6139 100644 --- a/setup.py +++ b/setup.py @@ -64,8 +64,8 @@ base = [ "sacrebleu", "textgrid", "timer", - "ToJyutping", - "typeguard", + "ToJyutping==0.2.1", + "typeguard==2.13.3", "webrtcvad", "yacs~=0.1.8", "zhon",
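The denoiser removed from `diffusion.py` earlier in this patch now lives in its own module; assuming this branch is installed, it can be exercised as below. Sizes are placeholders that follow the docstring shapes, and the single timestep is tiled to the batch internally.

```python
import paddle
from paddlespeech.t2s.modules.wavenet_denoiser import WaveNetDenoiser

net = WaveNetDenoiser(in_channels=80, out_channels=80, aux_channels=256)
x = paddle.randn([2, 80, 100])    # noisy mel, (B, C_in, T)
t = paddle.to_tensor([50])        # timestep, broadcast to the batch by forward()
c = paddle.randn([2, 256, 100])   # conditioning, (B, C_aux, T)

print(net(x, t, c).shape)         # predicted noise, [2, 80, 100]
```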