demos/TTSArmLinux: Use TTSCppFrontend as the front end to support the synthesis of any Chinese sentences

pull/3018/head
彭逸豪 3 years ago
parent bc739c7b78
commit aaca18216e

@ -1,4 +1,8 @@
# 目录
build/ build/
output/ output/
libs/ libs/
models/ models/
# 符号链接
dict

@ -45,10 +45,14 @@ cd PaddleSpeech/demos/TTSArmLinux
``` ```
./run.sh ./run.sh
./run.sh --sentence "语音合成测试"
./run.sh --sentence "输出到指定的音频文件" --output_wav ./output/test.wav
./run.sh --help
``` ```
将把 [src/main.cpp](src/main.cpp) 里定义在 `sentencesToChoose` 数组中的十句话转换为 `wav` 文件,保存在 `output` 文件夹中。 目前只支持中文合成,出现任何英文都会导致程序崩溃
如果未指定 `--output_wav`,默认输出到 `./output/tts.wav`。
## 手动编译 Paddle Lite 库 ## 手动编译 Paddle Lite 库

@ -0,0 +1 @@
src/TTSCppFrontend/build-depends.sh

@ -1,8 +1,11 @@
#!/bin/bash #!/bin/bash
set -e set -e
set -x
cd "$(dirname "$(realpath "$0")")" cd "$(dirname "$(realpath "$0")")"
BASE_DIR="$PWD"
# load configure # load configure
. ./config.sh . ./config.sh
@ -10,11 +13,17 @@ cd "$(dirname "$(realpath "$0")")"
echo "ARM_ABI is ${ARM_ABI}" echo "ARM_ABI is ${ARM_ABI}"
echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}" echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}"
rm -rf build echo "Build depends..."
mkdir -p build ./build-depends.sh "$@"
cd build
mkdir -p "$BASE_DIR/build"
cd "$BASE_DIR/build"
cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src
make
if [ "$*" = "" ]; then
make -j$(nproc)
else
make "$@"
fi
echo "make successful!" echo "make successful!"

@ -1,8 +1,11 @@
#!/bin/bash #!/bin/bash
set -e set -e
set -x
cd "$(dirname "$(realpath "$0")")" cd "$(dirname "$(realpath "$0")")"
BASE_DIR="$PWD"
# load configure # load configure
. ./config.sh . ./config.sh
@ -12,3 +15,9 @@ set -x
rm -rf "$OUTPUT_DIR" rm -rf "$OUTPUT_DIR"
rm -rf "$LIBS_DIR" rm -rf "$LIBS_DIR"
rm -rf "$MODELS_DIR" rm -rf "$MODELS_DIR"
rm -rf "$BASE_DIR/build"
"$BASE_DIR/src/TTSCppFrontend/clean.sh"
# 符号链接
rm "$BASE_DIR/dict"

@ -12,3 +12,4 @@ PADDLE_LITE_DIR="${LIBS_DIR}/inference_lite_lib.armlinux.${ARM_ABI}.gcc.with_ext
ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb" ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb"
VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb" VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb"
FRONT_CONF="${PWD}/front.conf"

@ -3,6 +3,8 @@ set -e
cd "$(dirname "$(realpath "$0")")" cd "$(dirname "$(realpath "$0")")"
BASE_DIR="$PWD"
# load configure # load configure
. ./config.sh . ./config.sh
@ -38,6 +40,10 @@ download() {
echo '=======================' echo '======================='
} }
########################################
echo "Download models..."
download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ 'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
'39e0c6604f97c70f5d13c573d7e709b9' \ '39e0c6604f97c70f5d13c573d7e709b9' \
@ -54,3 +60,11 @@ download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \
"$MODELS_DIR" "$MODELS_DIR"
echo "Done." echo "Done."
########################################
echo "Download dictionary files..."
ln -s src/TTSCppFrontend/front_demo/dict "$BASE_DIR/"
"$BASE_DIR/src/TTSCppFrontend/download.sh"

@ -0,0 +1,21 @@
# jieba conf
--jieba_dict_path=./dict/jieba/jieba.dict.utf8
--jieba_hmm_path=./dict/jieba/hmm_model.utf8
--jieba_user_dict_path=./dict/jieba/user.dict.utf8
--jieba_idf_path=./dict/jieba/idf.utf8
--jieba_stop_word_path=./dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
# dict of tranditional_to_simplified
--trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt

@ -7,12 +7,13 @@ cd "$(dirname "$(realpath "$0")")"
. ./config.sh . ./config.sh
# create dir # create dir
rm -rf "$OUTPUT_DIR"
mkdir -p "$OUTPUT_DIR" mkdir -p "$OUTPUT_DIR"
# run # run
for i in {1..10}; do set -x
(set -x; ./build/paddlespeech_tts_demo "$ACOUSTIC_MODEL_PATH" "$VOCODER_PATH" $i "$OUTPUT_DIR/$i.wav") ./build/paddlespeech_tts_demo \
done --front_conf "$FRONT_CONF" \
--acoustic_model "$ACOUSTIC_MODEL_PATH" \
ls -lh "$OUTPUT_DIR"/*.wav --vocoder "$VOCODER_PATH" \
"$@"
# end

@ -1,4 +1,18 @@
cmake_minimum_required(VERSION 3.10) cmake_minimum_required(VERSION 3.10)
project(paddlespeech_tts_demo)
########## Global Options ##########
option(WITH_FRONT_DEMO "Build front demo" OFF)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(ABSL_PROPAGATE_CXX_STD ON)
########## ARM Options ##########
set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_NAME Linux)
if(ARM_ABI STREQUAL "armv8") if(ARM_ABI STREQUAL "armv8")
set(CMAKE_SYSTEM_PROCESSOR aarch64) set(CMAKE_SYSTEM_PROCESSOR aarch64)
@ -13,7 +27,9 @@ else()
return() return()
endif() endif()
project(paddlespeech_tts_demo)
########## Paddle Lite Options ##########
message(STATUS "TARGET ARCH ABI: ${ARM_ABI}") message(STATUS "TARGET ARCH ABI: ${ARM_ABI}")
message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}") message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}")
@ -29,6 +45,9 @@ elseif(ARM_ABI STREQUAL "armv7hf")
set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" )
endif() endif()
########## Dependencies ##########
find_package(OpenMP REQUIRED) find_package(OpenMP REQUIRED)
if(OpenMP_FOUND OR OpenMP_CXX_FOUND) if(OpenMP_FOUND OR OpenMP_CXX_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
@ -43,5 +62,19 @@ else()
return() return()
endif() endif()
############### tts cpp frontend ###############
add_subdirectory(TTSCppFrontend)
include_directories(
TTSCppFrontend/src
third-party/build/src/cppjieba/include
third-party/build/src/limonp/include
)
############### paddlespeech_tts_demo ###############
add_executable(paddlespeech_tts_demo main.cc) add_executable(paddlespeech_tts_demo main.cc)
target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared) target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared paddlespeech_tts_front)

@ -9,42 +9,43 @@
using namespace paddle::lite_api; using namespace paddle::lite_api;
// Abstract, non-templated interface of the TTS back end (phone ids -> WAV).
//
// It lets callers hold a predictor through a single pointer type regardless
// of the WAV sample type chosen for the concrete Predictor<WavDataType>.
class PredictorInterface {
public:
    // Virtual destructor: instances are destroyed through a
    // PredictorInterface pointer, so the derived destructor must be reached
    // via dynamic dispatch (deleting through a base pointer without a
    // virtual destructor is undefined behaviour).
    virtual ~PredictorInterface() = default;

    // Loads the acoustic model and the vocoder and stores the runtime options.
    // Returns true on success, false otherwise.
    virtual bool Init(
        const std::string &AcousticModelPath,
        const std::string &VocoderPath,
        PowerMode cpuPowerMode,
        int cpuThreadNum,
        // The WAV sample rate must match the output of the model.
        // If playback speed or pitch sounds wrong, change the sample rate.
        // Common sample rates: 16000, 24000, 32000, 44100, 48000, 96000
        uint32_t wavSampleRate
    ) = 0;

    // Creates a Paddle Lite predictor for the model file at modelPath.
    virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) = 0;

    // Drops the loaded acoustic model and vocoder predictors.
    virtual void ReleaseModel() = 0;

    // Runs the full pipeline on a phone-id sequence; returns false on failure.
    virtual bool RunModel(const std::vector<int64_t> &phones) = 0;

    // Runs the acoustic model on the phone ids and returns its output tensor.
    virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) = 0;

    // Feeds the acoustic model output to the vocoder and returns its output tensor.
    virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) = 0;

    // Converts the vocoder output tensor into the internally stored WAV samples.
    virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) = 0;

    // Stores `size` float samples starting at floatWav as the WAV data.
    virtual void SaveFloatWav(float *floatWav, int64_t size) = 0;

    // Returns true when the models are loaded and the predictor is usable.
    virtual bool IsLoaded() = 0;

    // Returns the time spent in inference
    // (NOTE(review): presumably milliseconds, as printed by the caller -- confirm).
    virtual float GetInferenceTime() = 0;

    // Returns the size of the WAV data (header excluded).
    virtual int GetWavSize() = 0;

    // Returns the WAV duration, in milliseconds.
    virtual float GetWavDuration() = 0;

    // Returns the RTF (synthesis time / audio duration).
    virtual float GetRTF() = 0;

    // Discards the stored WAV samples.
    virtual void ReleaseWav() = 0;

    // Writes the stored WAV data to wavPath; returns false on failure.
    virtual bool WriteWavToFile(const std::string &wavPath) = 0;
};
// WavDataType: WAV数据类型 // WavDataType: WAV数据类型
// 可在 int16_t 和 float 之间切换, // 可在 int16_t 和 float 之间切换,
// 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV // 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV
template<typename WavDataType> template<typename WavDataType>
class Predictor { class Predictor : public PredictorInterface {
public: public:
struct WavHeader { virtual bool Init(
// RIFF 头
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t size = 0;
char wave[4] = {'W', 'A', 'V', 'E'};
// FMT 头
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_size = 16;
uint16_t audio_format = 0;
uint16_t num_channels = 1;
uint32_t sample_rate = 0;
uint32_t byte_rate = 0;
uint16_t block_align = 0;
uint16_t bits_per_sample = sizeof(WavDataType) * 8;
// DATA 头
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size = 0;
};
enum WavAudioFormat {
WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式
WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float 格式
};
// 返回值通过模板特化由 WavDataType 决定
inline uint16_t GetWavAudioFormat();
bool Init(
const std::string &AcousticModelPath, const std::string &AcousticModelPath,
const std::string &VocoderPath, const std::string &VocoderPath,
PowerMode cpuPowerMode, PowerMode cpuPowerMode,
@ -53,7 +54,7 @@ public:
// 如果播放速度和音调异常,请修改采样率 // 如果播放速度和音调异常,请修改采样率
// 常见采样率16000, 24000, 32000, 44100, 48000, 96000 // 常见采样率16000, 24000, 32000, 44100, 48000, 96000
uint32_t wavSampleRate uint32_t wavSampleRate
) { ) override {
// Release model if exists // Release model if exists
ReleaseModel(); ReleaseModel();
@ -71,12 +72,12 @@ public:
return true; return true;
} }
~Predictor() { virtual ~Predictor() {
ReleaseModel(); ReleaseModel();
ReleaseWav(); ReleaseWav();
} }
std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) { virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) override {
if (modelPath.empty()) { if (modelPath.empty()) {
return nullptr; return nullptr;
} }
@ -90,12 +91,12 @@ public:
return CreatePaddlePredictor<MobileConfig>(config); return CreatePaddlePredictor<MobileConfig>(config);
} }
void ReleaseModel() { virtual void ReleaseModel() override {
acoustic_model_predictor_ = nullptr; acoustic_model_predictor_ = nullptr;
vocoder_predictor_ = nullptr; vocoder_predictor_ = nullptr;
} }
bool RunModel(const std::vector<int64_t> &phones) { virtual bool RunModel(const std::vector<int64_t> &phones) override {
if (!IsLoaded()) { if (!IsLoaded()) {
return false; return false;
} }
@ -116,7 +117,7 @@ public:
return true; return true;
} }
std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) { virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) override {
auto phones_handle = acoustic_model_predictor_->GetInput(0); auto phones_handle = acoustic_model_predictor_->GetInput(0);
phones_handle->Resize({static_cast<int64_t>(phones.size())}); phones_handle->Resize({static_cast<int64_t>(phones.size())});
phones_handle->CopyFromCpu(phones.data()); phones_handle->CopyFromCpu(phones.data());
@ -135,7 +136,7 @@ public:
return am_output_handle; return am_output_handle;
} }
std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) { virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) override {
auto mel_handle = vocoder_predictor_->GetInput(0); auto mel_handle = vocoder_predictor_->GetInput(0);
// [?, 80] // [?, 80]
auto dims = amOutput->shape(); auto dims = amOutput->shape();
@ -157,7 +158,7 @@ public:
return voc_output_handle; return voc_output_handle;
} }
void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) { virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) override {
// 获取输出Tensor的数据 // 获取输出Tensor的数据
int64_t output_size = 1; int64_t output_size = 1;
for (auto dim : vocOutput->shape()) { for (auto dim : vocOutput->shape()) {
@ -168,17 +169,13 @@ public:
SaveFloatWav(output_data, output_size); SaveFloatWav(output_data, output_size);
} }
inline float Abs(float number) { virtual void SaveFloatWav(float *floatWav, int64_t size) override;
return (number < 0) ? -number : number;
}
void SaveFloatWav(float *floatWav, int64_t size);
bool IsLoaded() { virtual bool IsLoaded() override {
return acoustic_model_predictor_ != nullptr && vocoder_predictor_ != nullptr; return acoustic_model_predictor_ != nullptr && vocoder_predictor_ != nullptr;
} }
float GetInferenceTime() { virtual float GetInferenceTime() override {
return inference_time_; return inference_time_;
} }
@ -186,25 +183,25 @@ public:
return wav_; return wav_;
} }
int GetWavSize() { virtual int GetWavSize() override {
return wav_.size() * sizeof(WavDataType); return wav_.size() * sizeof(WavDataType);
} }
// 获取WAV持续时间单位毫秒 // 获取WAV持续时间单位毫秒
float GetWavDuration() { virtual float GetWavDuration() override {
return static_cast<float>(GetWavSize()) / sizeof(WavDataType) / static_cast<float>(wav_sample_rate_) * 1000; return static_cast<float>(GetWavSize()) / sizeof(WavDataType) / static_cast<float>(wav_sample_rate_) * 1000;
} }
// 获取RTF合成时间 / 音频时长) // 获取RTF合成时间 / 音频时长)
float GetRTF() { virtual float GetRTF() override {
return GetInferenceTime() / GetWavDuration(); return GetInferenceTime() / GetWavDuration();
} }
void ReleaseWav() { virtual void ReleaseWav() override {
wav_.clear(); wav_.clear();
} }
bool WriteWavToFile(const std::string &wavPath) { virtual bool WriteWavToFile(const std::string &wavPath) override {
std::ofstream fout(wavPath, std::ios::binary); std::ofstream fout(wavPath, std::ios::binary);
if (!fout.is_open()) { if (!fout.is_open()) {
return false; return false;
@ -227,7 +224,42 @@ public:
return true; return true;
} }
private: protected:
struct WavHeader {
// RIFF 头
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t size = 0;
char wave[4] = {'W', 'A', 'V', 'E'};
// FMT 头
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_size = 16;
uint16_t audio_format = 0;
uint16_t num_channels = 1;
uint32_t sample_rate = 0;
uint32_t byte_rate = 0;
uint16_t block_align = 0;
uint16_t bits_per_sample = sizeof(WavDataType) * 8;
// DATA 头
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size = 0;
};
enum WavAudioFormat {
WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式
WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float 格式
};
protected:
// 返回值通过模板特化由 WavDataType 决定
inline uint16_t GetWavAudioFormat();
inline float Abs(float number) {
return (number < 0) ? -number : number;
}
protected:
float inference_time_ = 0; float inference_time_ = 0;
uint32_t wav_sample_rate_ = 0; uint32_t wav_sample_rate_ = 0;
std::vector<WavDataType> wav_; std::vector<WavDataType> wav_;

@ -0,0 +1 @@
../../TTSCppFrontend/

@ -1,90 +1,128 @@
#include <cstdlib> #include <cstdlib>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include "paddle_api.h" #include <string>
#include <map>
#include <glog/logging.h>
#include <gflags/gflags.h>
#include <paddle_api.h>
#include <front/front_interface.h>
#include "Predictor.hpp" #include "Predictor.hpp"
using namespace paddle::lite_api; using namespace paddle::lite_api;
std::vector<std::vector<int64_t>> sentencesToChoose = { DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized (Chinese only. English will crash the program.)");
// 009901 昨日,这名“伤者”与医生全部被警方依法刑事拘留。 DEFINE_string(front_conf, "./front.conf", "Front configuration file");
{261, 231, 175, 116, 179, 262, 44, 154, 126, 177, 19, 262, 42, 241, 72, 177, 56, 174, 245, 37, 186, 37, 49, 151, 127, 69, 19, 179, 72, 69, 4, 260, 126, 177, 116, 151, 239, 153, 141}, DEFINE_string(acoustic_model, "./models/cpu/fastspeech2_csmsc_arm.nb", "Acoustic model .nb file");
// 009902 钱伟长想到上海来办学校是经过深思熟虑的。 DEFINE_string(vocoder, "./models/cpu/fastspeech2_csmsc_arm.nb", "vocoder .nb file");
{174, 83, 213, 39, 20, 260, 89, 40, 30, 177, 22, 71, 9, 153, 8, 37, 17, 260, 251, 260, 99, 179, 177, 116, 151, 125, 70, 233, 177, 51, 176, 108, 177, 184, 153, 242, 40, 45}, DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file");
// 009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 DEFINE_string(wav_bit_depth, "16", "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)");
{182, 2, 151, 85, 232, 73, 151, 123, 154, 52, 151, 143, 154, 5, 179, 39, 113, 69, 17, 177, 114, 105, 154, 5, 179, 154, 5, 40, 45, 232, 182, 8, 37, 186, 174, 74, 182, 168}, DEFINE_string(wav_sample_rate, "24000", "WAV sample rate, should match the output of the vocoder");
// 009904 李述德在离开之前,只说了一句“柱驼杀父亲了”。 DEFINE_string(cpu_thread, "1", "CPU thread numbers");
{153, 74, 177, 186, 40, 42, 261, 10, 153, 73, 152, 7, 262, 113, 174, 83, 179, 262, 115, 177, 230, 153, 45, 73, 151, 242, 180, 262, 186, 182, 231, 177, 2, 69, 186, 174, 124, 153, 45},
// 009905 这种车票和保险单捆绑出售属于重复性购买。
{262, 44, 262, 163, 39, 41, 173, 99, 71, 42, 37, 28, 260, 84, 40, 14, 179, 152, 220, 37, 21, 39, 183, 177, 170, 179, 177, 185, 240, 39, 162, 69, 186, 260, 128, 70, 170, 154, 9},
// 009906 戴佩妮的男友西米露接唱情歌,让她非常开心。
{40, 10, 173, 49, 155, 72, 40, 45, 155, 15, 142, 260, 72, 154, 74, 153, 186, 179, 151, 103, 39, 22, 174, 126, 70, 41, 179, 175, 22, 182, 2, 69, 46, 39, 20, 152, 7, 260, 120},
// 009907 观大势、谋大局、出大策始终是该院的办院方针。
{70, 199, 40, 5, 177, 116, 154, 168, 40, 5, 151, 240, 179, 39, 183, 40, 5, 38, 44, 179, 177, 115, 262, 161, 177, 116, 70, 7, 247, 40, 45, 37, 17, 247, 69, 19, 262, 51},
// 009908 他们骑着摩托回家,正好为农忙时的父母帮忙。
{182, 2, 154, 55, 174, 73, 262, 45, 154, 157, 182, 230, 71, 212, 151, 77, 180, 262, 59, 71, 29, 214, 155, 162, 154, 20, 177, 114, 40, 45, 69, 186, 154, 185, 37, 19, 154, 20},
// 009909 但是因为还没到退休年龄,只能掰着指头捱日子。
{40, 17, 177, 116, 120, 214, 71, 8, 154, 47, 40, 30, 182, 214, 260, 140, 155, 83, 153, 126, 180, 262, 115, 155, 57, 37, 7, 262, 45, 262, 115, 182, 171, 8, 175, 116, 261, 112},
// 009910 这几天雨水不断,人们恨不得待在家里不出门。
{262, 44, 151, 74, 182, 82, 240, 177, 213, 37, 184, 40, 202, 180, 175, 52, 154, 55, 71, 54, 37, 186, 40, 42, 40, 7, 261, 10, 151, 77, 153, 74, 37, 186, 39, 183, 154, 52},
};
void usage(const char *binName) {
std::cerr << "Usage:" << std::endl
<< "\t" << binName << " <AM-model-path> <VOC-model-path> <sentences-index:1-10> <output-wav-path>" << std::endl;
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
if (argc < 5) { gflags::ParseCommandLineFlags(&argc, &argv, true);
usage(argv[0]);
PredictorInterface *predictor;
if (FLAGS_wav_bit_depth == "16") {
predictor = new Predictor<int16_t>();
} else if (FLAGS_wav_bit_depth == "32") {
predictor = new Predictor<float>();
} else {
LOG(ERROR) << "Unsupported WAV bit depth: " << FLAGS_wav_bit_depth;
return -1;
}
/////////////////////////// 前端:文本转音素 ///////////////////////////
// 实例化文本前端引擎
speechnn::FrontEngineInterface *front_inst = nullptr;
front_inst = new speechnn::FrontEngineInterface(FLAGS_front_conf);
if ((!front_inst) || (front_inst->init())) {
LOG(ERROR) << "Creater tts engine failed!";
if (front_inst != nullptr) {
delete front_inst;
}
front_inst = nullptr;
return -1; return -1;
} }
const char *AMModelPath = argv[1];
const char *VOCModelPath = argv[2];
int sentencesIndex = atoi(argv[3]) - 1;
const char *outputWavPath = argv[4];
if (sentencesIndex < 0 || sentencesIndex >= sentencesToChoose.size()) { std::wstring ws_sentence = speechnn::utf8string2wstring(FLAGS_sentence);
std::cerr << "sentences-index out of range" << std::endl;
// 繁体转简体
std::wstring sentence_simp;
front_inst->Trand2Simp(ws_sentence, sentence_simp);
ws_sentence = sentence_simp;
std::string s_sentence;
std::vector<std::wstring> sentence_part;
std::vector<int> phoneids = {};
std::vector<int> toneids = {};
// 根据标点进行分句
LOG(INFO) << "Start to segment sentences by punctuation";
front_inst->SplitByPunc(ws_sentence, sentence_part);
LOG(INFO) << "Segment sentences through punctuation successfully";
// 分句后获取音素id
LOG(INFO) << "Start to get the phoneme and tone id sequence of each sentence";
for(int i = 0; i < sentence_part.size(); i++) {
LOG(INFO) << "Raw sentence is: " << speechnn::wstring2utf8string(sentence_part[i]);
front_inst->SentenceNormalize(sentence_part[i]);
s_sentence = speechnn::wstring2utf8string(sentence_part[i]);
LOG(INFO) << "After normalization sentence is: " << s_sentence;
if (0 != front_inst->GetSentenceIds(s_sentence, phoneids, toneids)) {
LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
return -1; return -1;
} }
// 模板参数WAV数据类型 }
// 可在 int16_t 和 float 之间切换, LOG(INFO) << "The phoneids of the sentence is: " << limonp::Join(phoneids.begin(), phoneids.end(), " ");
// 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV LOG(INFO) << "The toneids of the sentence is: " << limonp::Join(toneids.begin(), toneids.end(), " ");
Predictor<int16_t> predictor; LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
//Predictor<float> predictor;
/////////////////////////// 后端:音素转音频 ///////////////////////////
// WAV采样率必须与模型输出匹配 // WAV采样率必须与模型输出匹配
// 如果播放速度和音调异常,请修改采样率 // 如果播放速度和音调异常,请修改采样率
// 常见采样率16000, 24000, 32000, 44100, 48000, 96000 // 常见采样率16000, 24000, 32000, 44100, 48000, 96000
const uint32_t wavSampleRate = 24000; const uint32_t wavSampleRate = std::stoul(FLAGS_wav_sample_rate);
// CPU线程数 // CPU线程数
const int cpuThreadNum = 1; const int cpuThreadNum = std::stol(FLAGS_cpu_thread);
// CPU电源模式 // CPU电源模式
const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH; const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH;
if (!predictor.Init(AMModelPath, VOCModelPath, cpuPowerMode, cpuThreadNum, wavSampleRate)) { if (!predictor->Init(FLAGS_acoustic_model, FLAGS_vocoder, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
std::cerr << "predictor init failed" << std::endl; LOG(ERROR) << "predictor init failed" << std::endl;
return -1; return -1;
} }
if (!predictor.RunModel(sentencesToChoose[sentencesIndex])) { std::vector<int64_t> phones(phoneids.size());
std::cerr << "predictor run model failed" << std::endl; std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { return static_cast<int64_t>(x); });
if (!predictor->RunModel(phones)) {
LOG(ERROR) << "predictor run model failed" << std::endl;
return -1; return -1;
} }
std::cout << "Inference time: " << predictor.GetInferenceTime() << " ms, " LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, "
<< "WAV size (without header): " << predictor.GetWavSize() << " bytes, " << "WAV size (without header): " << predictor->GetWavSize() << " bytes, "
<< "WAV duration: " << predictor.GetWavDuration() << " ms, " << "WAV duration: " << predictor->GetWavDuration() << " ms, "
<< "RTF: " << predictor.GetRTF() << std::endl; << "RTF: " << predictor->GetRTF() << std::endl;
if (!predictor.WriteWavToFile(outputWavPath)) { if (!predictor->WriteWavToFile(FLAGS_output_wav)) {
std::cerr << "write wav file failed" << std::endl; LOG(ERROR) << "write wav file failed" << std::endl;
return -1; return -1;
} }
delete predictor;
return 0; return 0;
} }

@ -0,0 +1 @@
TTSCppFrontend/third-party
Loading…
Cancel
Save