diff --git a/.pre-commit-hooks/copyright-check.hook b/.pre-commit-hooks/copyright-check.hook
index 761edbc01..5a409e062 100644
--- a/.pre-commit-hooks/copyright-check.hook
+++ b/.pre-commit-hooks/copyright-check.hook
@@ -19,7 +19,7 @@ import subprocess
import platform
COPYRIGHT = '''
-Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -128,4 +128,4 @@ def main(argv=None):
if __name__ == '__main__':
- exit(main())
\ No newline at end of file
+ exit(main())
diff --git a/README.md b/README.md
index 0cb99d1c6..fbbb1480f 100644
--- a/README.md
+++ b/README.md
@@ -178,7 +178,10 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
### Recent Update
-- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo](./demos/TTSArmLinux).
+- 🔥 2023.03.14: Add SVS (Singing Voice Synthesis) examples with Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1), [PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5), the effect is continuously optimized.
+- 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
+- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo (with C++ Chinese Text Frontend)](./demos/TTSArmLinux).
+- 🔥 2023.03.03: Add Voice Conversion [StarGANv2-VC synthesize pipeline](./examples/vctk/vc3).
- 🎉 2023.02.16: Add [Cantonese TTS](./examples/canton/tts3).
- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](./demos/speech_recognition).
- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](./examples/tal_cs/asr1/).
@@ -575,14 +578,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
- Text Frontend |
- |
-
- tn / g2p
- |
+ Text Frontend |
+ |
+
+ tn / g2p
+ |
- Acoustic Model |
+ Acoustic Model |
Tacotron2 |
LJSpeech / CSMSC |
@@ -617,6 +620,13 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en
|
+
+ DiffSinger |
+ Opencpop |
+
+ DiffSinger-opencpop
+ |
+
Vocoder |
WaveFlow |
@@ -627,9 +637,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
Parallel WaveGAN |
- LJSpeech / VCTK / CSMSC / AISHELL-3 |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop |
- PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3
+ PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop
|
@@ -648,9 +658,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
HiFiGAN |
- LJSpeech / VCTK / CSMSC / AISHELL-3 |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop |
- HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3
+ HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop
|
diff --git a/README_cn.md b/README_cn.md
index 0f2adf811..4d991f3e8 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -183,7 +183,10 @@
- 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。
### 近期更新
-- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例](./demos/TTSArmLinux)。
+- 🔥 2023.03.14: 新增基于 Opencpop 数据集的 SVS (歌唱合成) 示例,包含 [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) 和 [HiFiGAN](./examples/opencpop/voc5),效果持续优化中。
+- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。
+- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例 (包含 C++ 中文文本前端模块)](./demos/TTSArmLinux)。
+- 🔥 2023.03.03: 新增声音转换模型 [StarGANv2-VC 合成流程](./examples/vctk/vc3)。
- 🎉 2023.02.16: 新增[粤语语音合成](./examples/canton/tts3)。
- 🔥 2023.01.10: 新增[中英混合 ASR CLI 和 Demos](./demos/speech_recognition)。
- 👑 2023.01.06: 新增 [ASR 中英混合 tal_cs 训练推理流程](./examples/tal_cs/asr1/)。
@@ -574,43 +577,50 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
tn / g2p
|
-
-
- 声学模型 |
+
+
+ 声学模型 |
Tacotron2 |
LJSpeech / CSMSC |
tacotron2-ljspeech / tacotron2-csmsc
|
-
-
+
+
Transformer TTS |
LJSpeech |
transformer-ljspeech
|
-
-
+
+
SpeedySpeech |
CSMSC |
speedyspeech-csmsc
|
-
-
+
+
FastSpeech2 |
LJSpeech / VCTK / CSMSC / AISHELL-3 / ZH_EN / finetune |
fastspeech2-ljspeech / fastspeech2-vctk / fastspeech2-csmsc / fastspeech2-aishell3 / fastspeech2-zh_en / fastspeech2-finetune
|
-
-
+
+
ERNIE-SAT |
VCTK / AISHELL-3 / ZH_EN |
ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en
|
-
+
+
+ DiffSinger |
+ Opencpop |
+
+ DiffSinger-opencpop
+ |
+
声码器 |
WaveFlow |
@@ -621,9 +631,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
Parallel WaveGAN |
- LJSpeech / VCTK / CSMSC / AISHELL-3 |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop |
- PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3
+ PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop
|
@@ -642,9 +652,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
HiFiGAN |
- LJSpeech / VCTK / CSMSC / AISHELL-3 |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop |
- HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3
+ HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop
|
@@ -701,6 +711,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
+
**声音分类**
diff --git a/demos/TTSArmLinux/.gitignore b/demos/TTSArmLinux/.gitignore
index 13135e376..f18480d7a 100644
--- a/demos/TTSArmLinux/.gitignore
+++ b/demos/TTSArmLinux/.gitignore
@@ -1,4 +1,8 @@
+# 目录
build/
output/
libs/
models/
+
+# 符号链接
+dict
diff --git a/demos/TTSArmLinux/README.md b/demos/TTSArmLinux/README.md
index 32b85e0a4..a4ccba6c8 100644
--- a/demos/TTSArmLinux/README.md
+++ b/demos/TTSArmLinux/README.md
@@ -10,9 +10,9 @@
### 安装依赖
-```
+```bash
# Ubuntu
-sudo apt install build-essential cmake wget tar unzip
+sudo apt install build-essential cmake pkg-config wget tar unzip
# CentOS
sudo yum groupinstall "Development Tools"
@@ -25,15 +25,13 @@ sudo yum install cmake wget tar unzip
可用以下命令下载:
-```
-git clone https://github.com/PaddlePaddle/PaddleSpeech.git
-cd PaddleSpeech/demos/TTSArmLinux
+```bash
./download.sh
```
### 编译 Demo
-```
+```bash
./build.sh
```
@@ -43,12 +41,18 @@ cd PaddleSpeech/demos/TTSArmLinux
### 运行
-```
+你可以修改 `./front.conf` 中 `--phone2id_path` 参数为你自己的声学模型的 `phone_id_map.txt`。
+
+```bash
./run.sh
+./run.sh --sentence "语音合成测试"
+./run.sh --sentence "输出到指定的音频文件" --output_wav ./output/test.wav
+./run.sh --help
```
-将把 [src/main.cpp](src/main.cpp) 里定义在 `sentencesToChoose` 数组中的十句话转换为 `wav` 文件,保存在 `output` 文件夹中。
+目前只支持中文合成,出现任何英文都会导致程序崩溃。
+如果未指定 `--output_wav`,默认输出到 `./output/tts.wav`。
## 手动编译 Paddle Lite 库
diff --git a/demos/TTSArmLinux/build-depends.sh b/demos/TTSArmLinux/build-depends.sh
new file mode 120000
index 000000000..fd3aec9c8
--- /dev/null
+++ b/demos/TTSArmLinux/build-depends.sh
@@ -0,0 +1 @@
+src/TTSCppFrontend/build-depends.sh
\ No newline at end of file
diff --git a/demos/TTSArmLinux/build.sh b/demos/TTSArmLinux/build.sh
index c872e5749..5d31173ef 100755
--- a/demos/TTSArmLinux/build.sh
+++ b/demos/TTSArmLinux/build.sh
@@ -1,8 +1,11 @@
#!/bin/bash
set -e
+set -x
cd "$(dirname "$(realpath "$0")")"
+BASE_DIR="$PWD"
+
# load configure
. ./config.sh
@@ -10,11 +13,17 @@ cd "$(dirname "$(realpath "$0")")"
echo "ARM_ABI is ${ARM_ABI}"
echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}"
-rm -rf build
-mkdir -p build
-cd build
+echo "Build depends..."
+./build-depends.sh "$@"
+mkdir -p "$BASE_DIR/build"
+cd "$BASE_DIR/build"
cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src
-make
+
+if [ "$*" = "" ]; then
+ make -j$(nproc)
+else
+ make "$@"
+fi
echo "make successful!"
diff --git a/demos/TTSArmLinux/clean.sh b/demos/TTSArmLinux/clean.sh
index 1ea365566..2743801c3 100755
--- a/demos/TTSArmLinux/clean.sh
+++ b/demos/TTSArmLinux/clean.sh
@@ -1,8 +1,11 @@
#!/bin/bash
set -e
+set -x
cd "$(dirname "$(realpath "$0")")"
+BASE_DIR="$PWD"
+
# load configure
. ./config.sh
@@ -12,3 +15,9 @@ set -x
rm -rf "$OUTPUT_DIR"
rm -rf "$LIBS_DIR"
rm -rf "$MODELS_DIR"
+rm -rf "$BASE_DIR/build"
+
+"$BASE_DIR/src/TTSCppFrontend/clean.sh"
+
+# 符号链接
+rm "$BASE_DIR/dict"
diff --git a/demos/TTSArmLinux/config.sh b/demos/TTSArmLinux/config.sh
index 0a04f18ee..bf38d7d6d 100644
--- a/demos/TTSArmLinux/config.sh
+++ b/demos/TTSArmLinux/config.sh
@@ -10,5 +10,6 @@ OUTPUT_DIR="${PWD}/output"
PADDLE_LITE_DIR="${LIBS_DIR}/inference_lite_lib.armlinux.${ARM_ABI}.gcc.with_extra.with_cv/cxx"
#PADDLE_LITE_DIR="/path/to/Paddle-Lite/build.lite.linux.${ARM_ABI}.gcc/inference_lite_lib.armlinux.${ARM_ABI}/cxx"
-AM_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb"
-VOC_MODEL_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb"
+ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb"
+VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb"
+FRONT_CONF="${PWD}/front.conf"
diff --git a/demos/TTSArmLinux/download.sh b/demos/TTSArmLinux/download.sh
index 560374bc9..7eaa836a5 100755
--- a/demos/TTSArmLinux/download.sh
+++ b/demos/TTSArmLinux/download.sh
@@ -3,6 +3,8 @@ set -e
cd "$(dirname "$(realpath "$0")")"
+BASE_DIR="$PWD"
+
# load configure
. ./config.sh
@@ -38,6 +40,10 @@ download() {
echo '======================='
}
+########################################
+
+echo "Download models..."
+
download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
'39e0c6604f97c70f5d13c573d7e709b9' \
@@ -54,3 +60,11 @@ download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \
"$MODELS_DIR"
echo "Done."
+
+########################################
+
+echo "Download dictionary files..."
+
+ln -s src/TTSCppFrontend/front_demo/dict "$BASE_DIR/"
+
+"$BASE_DIR/src/TTSCppFrontend/download.sh"
diff --git a/demos/TTSArmLinux/front.conf b/demos/TTSArmLinux/front.conf
new file mode 100644
index 000000000..04bd2d97f
--- /dev/null
+++ b/demos/TTSArmLinux/front.conf
@@ -0,0 +1,21 @@
+# jieba conf
+--jieba_dict_path=./dict/jieba/jieba.dict.utf8
+--jieba_hmm_path=./dict/jieba/hmm_model.utf8
+--jieba_user_dict_path=./dict/jieba/user.dict.utf8
+--jieba_idf_path=./dict/jieba/idf.utf8
+--jieba_stop_word_path=./dict/jieba/stop_words.utf8
+
+# dict conf fastspeech2_0.4
+--seperate_tone=false
+--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
+--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
+
+# dict conf speedyspeech_0.5
+#--seperate_tone=true
+#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
+#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
+#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
+
+# dict of tranditional_to_simplified
+--trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt
diff --git a/demos/TTSArmLinux/run.sh b/demos/TTSArmLinux/run.sh
index efcb61b5b..d0860f044 100755
--- a/demos/TTSArmLinux/run.sh
+++ b/demos/TTSArmLinux/run.sh
@@ -7,12 +7,13 @@ cd "$(dirname "$(realpath "$0")")"
. ./config.sh
# create dir
-rm -rf "$OUTPUT_DIR"
mkdir -p "$OUTPUT_DIR"
# run
-for i in {1..10}; do
- (set -x; ./build/paddlespeech_tts_demo "$AM_MODEL_PATH" "$VOC_MODEL_PATH" $i "$OUTPUT_DIR/$i.wav")
-done
-
-ls -lh "$OUTPUT_DIR"/*.wav
+set -x
+./build/paddlespeech_tts_demo \
+ --front_conf "$FRONT_CONF" \
+ --acoustic_model "$ACOUSTIC_MODEL_PATH" \
+ --vocoder "$VOCODER_PATH" \
+ "$@"
+# end
diff --git a/demos/TTSArmLinux/src/CMakeLists.txt b/demos/TTSArmLinux/src/CMakeLists.txt
index e1076af92..f8240d0ce 100644
--- a/demos/TTSArmLinux/src/CMakeLists.txt
+++ b/demos/TTSArmLinux/src/CMakeLists.txt
@@ -1,4 +1,18 @@
cmake_minimum_required(VERSION 3.10)
+project(paddlespeech_tts_demo)
+
+
+########## Global Options ##########
+
+option(WITH_FRONT_DEMO "Build front demo" OFF)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(ABSL_PROPAGATE_CXX_STD ON)
+
+
+########## ARM Options ##########
+
set(CMAKE_SYSTEM_NAME Linux)
if(ARM_ABI STREQUAL "armv8")
set(CMAKE_SYSTEM_PROCESSOR aarch64)
@@ -13,14 +27,16 @@ else()
return()
endif()
-project(paddlespeech_tts_demo)
+
+########## Paddle Lite Options ##########
+
message(STATUS "TARGET ARCH ABI: ${ARM_ABI}")
message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}")
include_directories(${PADDLE_LITE_DIR}/include)
link_directories(${PADDLE_LITE_DIR}/libs/${ARM_ABI})
link_directories(${PADDLE_LITE_DIR}/lib)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
if(ARM_ABI STREQUAL "armv8")
set(CMAKE_CXX_FLAGS "-march=armv8-a ${CMAKE_CXX_FLAGS}")
set(CMAKE_C_FLAGS "-march=armv8-a ${CMAKE_C_FLAGS}")
@@ -29,6 +45,9 @@ elseif(ARM_ABI STREQUAL "armv7hf")
set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" )
endif()
+
+########## Dependencies ##########
+
find_package(OpenMP REQUIRED)
if(OpenMP_FOUND OR OpenMP_CXX_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
@@ -43,5 +62,19 @@ else()
return()
endif()
+
+############### tts cpp frontend ###############
+
+add_subdirectory(TTSCppFrontend)
+
+include_directories(
+ TTSCppFrontend/src
+ third-party/build/src/cppjieba/include
+ third-party/build/src/limonp/include
+)
+
+
+############### paddlespeech_tts_demo ###############
+
add_executable(paddlespeech_tts_demo main.cc)
-target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared)
+target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared paddlespeech_tts_front)
diff --git a/demos/TTSArmLinux/src/Predictor.hpp b/demos/TTSArmLinux/src/Predictor.hpp
index 221d51fc1..f173abb5c 100644
--- a/demos/TTSArmLinux/src/Predictor.hpp
+++ b/demos/TTSArmLinux/src/Predictor.hpp
@@ -1,7 +1,20 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
#include
#include
-#include
#include
+#include
#include
#include
#include
@@ -9,32 +22,84 @@
using namespace paddle::lite_api;
-typedef int16_t WavDataType;
+class PredictorInterface {
+ public:
+ virtual ~PredictorInterface() = 0;
+ virtual bool Init(const std::string &AcousticModelPath,
+ const std::string &VocoderPath,
+ PowerMode cpuPowerMode,
+ int cpuThreadNum,
+ // WAV采样率(必须与模型输出匹配)
+ // 如果播放速度和音调异常,请修改采样率
+ // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000
+ uint32_t wavSampleRate) = 0;
+ virtual std::shared_ptr LoadModel(
+ const std::string &modelPath,
+ int cpuThreadNum,
+ PowerMode cpuPowerMode) = 0;
+ virtual void ReleaseModel() = 0;
+ virtual bool RunModel(const std::vector &phones) = 0;
+ virtual std::unique_ptr GetAcousticModelOutput(
+ const std::vector &phones) = 0;
+ virtual std::unique_ptr GetVocoderOutput(
+ std::unique_ptr &&amOutput) = 0;
+ virtual void VocoderOutputToWav(
+ std::unique_ptr &&vocOutput) = 0;
+ virtual void SaveFloatWav(float *floatWav, int64_t size) = 0;
+ virtual bool IsLoaded() = 0;
+ virtual float GetInferenceTime() = 0;
+ virtual int GetWavSize() = 0;
+ // 获取WAV持续时间(单位:毫秒)
+ virtual float GetWavDuration() = 0;
+ // 获取RTF(合成时间 / 音频时长)
+ virtual float GetRTF() = 0;
+ virtual void ReleaseWav() = 0;
+ virtual bool WriteWavToFile(const std::string &wavPath) = 0;
+};
-class Predictor {
-public:
- bool Init(const std::string &AMModelPath, const std::string &VOCModelPath, int cpuThreadNum, const std::string &cpuPowerMode) {
+PredictorInterface::~PredictorInterface() {}
+
+// WavDataType: WAV数据类型
+// 可在 int16_t 和 float 之间切换,
+// 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV
+template
+class Predictor : public PredictorInterface {
+ public:
+ bool Init(const std::string &AcousticModelPath,
+ const std::string &VocoderPath,
+ PowerMode cpuPowerMode,
+ int cpuThreadNum,
+ // WAV采样率(必须与模型输出匹配)
+ // 如果播放速度和音调异常,请修改采样率
+ // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000
+ uint32_t wavSampleRate) override {
// Release model if exists
ReleaseModel();
- AM_predictor_ = LoadModel(AMModelPath, cpuThreadNum, cpuPowerMode);
- if (AM_predictor_ == nullptr) {
+ acoustic_model_predictor_ =
+ LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode);
+ if (acoustic_model_predictor_ == nullptr) {
return false;
}
- VOC_predictor_ = LoadModel(VOCModelPath, cpuThreadNum, cpuPowerMode);
- if (VOC_predictor_ == nullptr) {
+ vocoder_predictor_ = LoadModel(VocoderPath, cpuThreadNum, cpuPowerMode);
+ if (vocoder_predictor_ == nullptr) {
return false;
}
+ wav_sample_rate_ = wavSampleRate;
+
return true;
}
- ~Predictor() {
+ virtual ~Predictor() {
ReleaseModel();
ReleaseWav();
}
- std::shared_ptr LoadModel(const std::string &modelPath, int cpuThreadNum, const std::string &cpuPowerMode) {
+ std::shared_ptr LoadModel(
+ const std::string &modelPath,
+ int cpuThreadNum,
+ PowerMode cpuPowerMode) override {
if (modelPath.empty()) {
return nullptr;
}
@@ -43,33 +108,17 @@ public:
MobileConfig config;
config.set_model_from_file(modelPath);
config.set_threads(cpuThreadNum);
-
- if (cpuPowerMode == "LITE_POWER_HIGH") {
- config.set_power_mode(PowerMode::LITE_POWER_HIGH);
- } else if (cpuPowerMode == "LITE_POWER_LOW") {
- config.set_power_mode(PowerMode::LITE_POWER_LOW);
- } else if (cpuPowerMode == "LITE_POWER_FULL") {
- config.set_power_mode(PowerMode::LITE_POWER_FULL);
- } else if (cpuPowerMode == "LITE_POWER_NO_BIND") {
- config.set_power_mode(PowerMode::LITE_POWER_NO_BIND);
- } else if (cpuPowerMode == "LITE_POWER_RAND_HIGH") {
- config.set_power_mode(PowerMode::LITE_POWER_RAND_HIGH);
- } else if (cpuPowerMode == "LITE_POWER_RAND_LOW") {
- config.set_power_mode(PowerMode::LITE_POWER_RAND_LOW);
- } else {
- std::cerr << "Unknown cpu power mode!" << std::endl;
- return nullptr;
- }
+ config.set_power_mode(cpuPowerMode);
return CreatePaddlePredictor(config);
}
- void ReleaseModel() {
- AM_predictor_ = nullptr;
- VOC_predictor_ = nullptr;
+ void ReleaseModel() override {
+ acoustic_model_predictor_ = nullptr;
+ vocoder_predictor_ = nullptr;
}
- bool RunModel(const std::vector &phones) {
+ bool RunModel(const std::vector &phones) override {
if (!IsLoaded()) {
return false;
}
@@ -78,28 +127,29 @@ public:
auto start = std::chrono::system_clock::now();
// 执行推理
- VOCOutputToWav(GetAMOutput(phones));
+ VocoderOutputToWav(GetVocoderOutput(GetAcousticModelOutput(phones)));
// 计时结束
auto end = std::chrono::system_clock::now();
// 计算用时
std::chrono::duration duration = end - start;
- inference_time_ = duration.count() * 1000; // 单位:毫秒
+ inference_time_ = duration.count() * 1000; // 单位:毫秒
return true;
}
- std::unique_ptr GetAMOutput(const std::vector &phones) {
- auto phones_handle = AM_predictor_->GetInput(0);
+ std::unique_ptr GetAcousticModelOutput(
+ const std::vector &phones) override {
+ auto phones_handle = acoustic_model_predictor_->GetInput(0);
phones_handle->Resize({static_cast(phones.size())});
phones_handle->CopyFromCpu(phones.data());
- AM_predictor_->Run();
+ acoustic_model_predictor_->Run();
// 获取输出Tensor
- auto am_output_handle = AM_predictor_->GetOutput(0);
+ auto am_output_handle = acoustic_model_predictor_->GetOutput(0);
// 打印输出Tensor的shape
- std::cout << "AM Output shape: ";
+ std::cout << "Acoustic Model Output shape: ";
auto shape = am_output_handle->shape();
for (auto s : shape) {
std::cout << s << ", ";
@@ -109,75 +159,91 @@ public:
return am_output_handle;
}
- void VOCOutputToWav(std::unique_ptr &&input) {
- auto mel_handle = VOC_predictor_->GetInput(0);
+ std::unique_ptr GetVocoderOutput(
+ std::unique_ptr &&amOutput) override {
+ auto mel_handle = vocoder_predictor_->GetInput(0);
// [?, 80]
- auto dims = input->shape();
+ auto dims = amOutput->shape();
mel_handle->Resize(dims);
- auto am_output_data = input->mutable_data();
+ auto am_output_data = amOutput->mutable_data();
mel_handle->CopyFromCpu(am_output_data);
- VOC_predictor_->Run();
+ vocoder_predictor_->Run();
// 获取输出Tensor
- auto voc_output_handle = VOC_predictor_->GetOutput(0);
+ auto voc_output_handle = vocoder_predictor_->GetOutput(0);
// 打印输出Tensor的shape
- std::cout << "VOC Output shape: ";
+ std::cout << "Vocoder Output shape: ";
auto shape = voc_output_handle->shape();
for (auto s : shape) {
std::cout << s << ", ";
}
std::cout << std::endl;
+ return voc_output_handle;
+ }
+
+ void VocoderOutputToWav(
+ std::unique_ptr &&vocOutput) override {
// 获取输出Tensor的数据
int64_t output_size = 1;
- for (auto dim : voc_output_handle->shape()) {
+ for (auto dim : vocOutput->shape()) {
output_size *= dim;
}
- auto output_data = voc_output_handle->mutable_data();
+ auto output_data = vocOutput->mutable_data();
SaveFloatWav(output_data, output_size);
}
- inline float Abs(float number) {
- return (number < 0) ? -number : number;
- }
+ void SaveFloatWav(float *floatWav, int64_t size) override;
- void SaveFloatWav(float *floatWav, int64_t size) {
- wav_.resize(size);
- float maxSample = 0.01;
- // 寻找最大采样值
- for (int64_t i=0; i maxSample) {
- maxSample = sample;
- }
- }
- // 把采样值缩放到 int_16 范围
- for (int64_t i=0; i &GetWav() { return wav_; }
- const std::vector & GetWav() {
- return wav_;
- }
+ int GetWavSize() override { return wav_.size() * sizeof(WavDataType); }
- int GetWavSize() {
- return wav_.size() * sizeof(WavDataType);
+ // 获取WAV持续时间(单位:毫秒)
+ float GetWavDuration() override {
+ return static_cast(GetWavSize()) / sizeof(WavDataType) /
+ static_cast(wav_sample_rate_) * 1000;
}
- void ReleaseWav() {
- wav_.clear();
+ // 获取RTF(合成时间 / 音频时长)
+ float GetRTF() override { return GetInferenceTime() / GetWavDuration(); }
+
+ void ReleaseWav() override { wav_.clear(); }
+
+ bool WriteWavToFile(const std::string &wavPath) override {
+ std::ofstream fout(wavPath, std::ios::binary);
+ if (!fout.is_open()) {
+ return false;
+ }
+
+ // 写入头信息
+ WavHeader header;
+ header.audio_format = GetWavAudioFormat();
+ header.data_size = GetWavSize();
+ header.size = sizeof(header) - 8 + header.data_size;
+ header.sample_rate = wav_sample_rate_;
+ header.byte_rate = header.sample_rate * header.num_channels *
+ header.bits_per_sample / 8;
+ header.block_align = header.num_channels * header.bits_per_sample / 8;
+ fout.write(reinterpret_cast(&header), sizeof(header));
+
+ // 写入wav数据
+ fout.write(reinterpret_cast(wav_.data()),
+ header.data_size);
+
+ fout.close();
+ return true;
}
+ protected:
struct WavHeader {
// RIFF 头
char riff[4] = {'R', 'I', 'F', 'F'};
@@ -187,15 +253,11 @@ public:
// FMT 头
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_size = 16;
- uint16_t audio_format = 1; // 1为整数编码,3为浮点编码
+ uint16_t audio_format = 0;
uint16_t num_channels = 1;
-
- // 如果播放速度和音调异常,请修改采样率
- // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000
- uint32_t sample_rate = 24000;
-
- uint32_t byte_rate = 64000;
- uint16_t block_align = 2;
+ uint32_t sample_rate = 0;
+ uint32_t byte_rate = 0;
+ uint16_t block_align = 0;
uint16_t bits_per_sample = sizeof(WavDataType) * 8;
// DATA 头
@@ -203,30 +265,56 @@ public:
uint32_t data_size = 0;
};
- bool WriteWavToFile(const std::string &wavPath) {
- std::ofstream fout(wavPath, std::ios::binary);
- if (!fout.is_open()) {
- return false;
- }
-
- // 写入头信息
- WavHeader header;
- header.data_size = GetWavSize();
- header.size = sizeof(header) - 8 + header.data_size;
- header.byte_rate = header.sample_rate * header.num_channels * header.bits_per_sample / 8;
- header.block_align = header.num_channels * header.bits_per_sample / 8;
- fout.write(reinterpret_cast(&header), sizeof(header));
+ enum WavAudioFormat {
+ WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式
+ WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float 格式
+ };
- // 写入wav数据
- fout.write(reinterpret_cast(wav_.data()), header.data_size);
+ protected:
+ // 返回值通过模板特化由 WavDataType 决定
+ inline uint16_t GetWavAudioFormat();
- fout.close();
- return true;
- }
+ inline float Abs(float number) { return (number < 0) ? -number : number; }
-private:
+ protected:
float inference_time_ = 0;
- std::shared_ptr AM_predictor_ = nullptr;
- std::shared_ptr VOC_predictor_ = nullptr;
+ uint32_t wav_sample_rate_ = 0;
std::vector wav_;
+ std::shared_ptr acoustic_model_predictor_ = nullptr;
+ std::shared_ptr vocoder_predictor_ = nullptr;
};
+
+template <>
+uint16_t Predictor::GetWavAudioFormat() {
+ return Predictor::WAV_FORMAT_16BIT_PCM;
+}
+
+template <>
+uint16_t Predictor::GetWavAudioFormat() {
+ return Predictor::WAV_FORMAT_32BIT_FLOAT;
+}
+
+// 保存 16-bit PCM 格式 WAV
+template <>
+void Predictor::SaveFloatWav(float *floatWav, int64_t size) {
+ wav_.resize(size);
+ float maxSample = 0.01;
+ // 寻找最大采样值
+ for (int64_t i = 0; i < size; i++) {
+ float sample = Abs(floatWav[i]);
+ if (sample > maxSample) {
+ maxSample = sample;
+ }
+ }
+ // 把采样值缩放到 int_16 范围
+ for (int64_t i = 0; i < size; i++) {
+ wav_[i] = floatWav[i] * 32767.0f / maxSample;
+ }
+}
+
+// 保存 32-bit IEEE float 格式 WAV
+template <>
+void Predictor::SaveFloatWav(float *floatWav, int64_t size) {
+ wav_.resize(size);
+ std::copy_n(floatWav, size, wav_.data());
+}
diff --git a/demos/TTSArmLinux/src/TTSCppFrontend b/demos/TTSArmLinux/src/TTSCppFrontend
new file mode 120000
index 000000000..25953976d
--- /dev/null
+++ b/demos/TTSArmLinux/src/TTSCppFrontend
@@ -0,0 +1 @@
+../../TTSCppFrontend/
\ No newline at end of file
diff --git a/demos/TTSArmLinux/src/main.cc b/demos/TTSArmLinux/src/main.cc
index 0bf78a7de..0b8e26bc4 100644
--- a/demos/TTSArmLinux/src/main.cc
+++ b/demos/TTSArmLinux/src/main.cc
@@ -1,72 +1,162 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+#include
#include
#include
+#include