demos/TTSArmLinux: Use TTSCppFrontend as the front end to support the synthesis of any Chinese sentences

pull/3018/head
彭逸豪 3 years ago
parent bc739c7b78
commit aaca18216e

@ -1,4 +1,8 @@
# 目录
build/
output/
libs/
models/
# 符号链接
dict

@ -45,10 +45,14 @@ cd PaddleSpeech/demos/TTSArmLinux
```
./run.sh
./run.sh --sentence "语音合成测试"
./run.sh --sentence "输出到指定的音频文件" --output_wav ./output/test.wav
./run.sh --help
```
将把 [src/main.cpp](src/main.cpp) 里定义在 `sentencesToChoose` 数组中的十句话转换为 `wav` 文件,保存在 `output` 文件夹中。
目前只支持中文合成,出现任何英文都会导致程序崩溃
如果未指定`--output_wav`,默认输出到`./output/tts.wav`。
## 手动编译 Paddle Lite 库

@ -0,0 +1 @@
src/TTSCppFrontend/build-depends.sh

@ -1,8 +1,11 @@
#!/bin/bash
set -e
set -x
cd "$(dirname "$(realpath "$0")")"
BASE_DIR="$PWD"
# load configure
. ./config.sh
@ -10,11 +13,17 @@ cd "$(dirname "$(realpath "$0")")"
echo "ARM_ABI is ${ARM_ABI}"
echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}"
rm -rf build
mkdir -p build
cd build
echo "Build depends..."
./build-depends.sh "$@"
mkdir -p "$BASE_DIR/build"
cd "$BASE_DIR/build"
cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src
make
if [ "$*" = "" ]; then
make -j$(nproc)
else
make "$@"
fi
echo "make successful!"

@ -1,8 +1,11 @@
#!/bin/bash
set -e
set -x
cd "$(dirname "$(realpath "$0")")"
BASE_DIR="$PWD"
# load configure
. ./config.sh
@ -12,3 +15,9 @@ set -x
rm -rf "$OUTPUT_DIR"
rm -rf "$LIBS_DIR"
rm -rf "$MODELS_DIR"
rm -rf "$BASE_DIR/build"
"$BASE_DIR/src/TTSCppFrontend/clean.sh"
# 符号链接
rm "$BASE_DIR/dict"

@ -12,3 +12,4 @@ PADDLE_LITE_DIR="${LIBS_DIR}/inference_lite_lib.armlinux.${ARM_ABI}.gcc.with_ext
ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb"
VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb"
FRONT_CONF="${PWD}/front.conf"

@ -3,6 +3,8 @@ set -e
cd "$(dirname "$(realpath "$0")")"
BASE_DIR="$PWD"
# load configure
. ./config.sh
@ -38,6 +40,10 @@ download() {
echo '======================='
}
########################################
echo "Download models..."
download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
'39e0c6604f97c70f5d13c573d7e709b9' \
@ -54,3 +60,11 @@ download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \
"$MODELS_DIR"
echo "Done."
########################################
echo "Download dictionary files..."
ln -s src/TTSCppFrontend/front_demo/dict "$BASE_DIR/"
"$BASE_DIR/src/TTSCppFrontend/download.sh"

@ -0,0 +1,21 @@
# jieba conf
--jieba_dict_path=./dict/jieba/jieba.dict.utf8
--jieba_hmm_path=./dict/jieba/hmm_model.utf8
--jieba_user_dict_path=./dict/jieba/user.dict.utf8
--jieba_idf_path=./dict/jieba/idf.utf8
--jieba_stop_word_path=./dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
# dict of traditional_to_simplified
--trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt

@ -7,12 +7,13 @@ cd "$(dirname "$(realpath "$0")")"
. ./config.sh
# create dir
rm -rf "$OUTPUT_DIR"
mkdir -p "$OUTPUT_DIR"
# run
for i in {1..10}; do
(set -x; ./build/paddlespeech_tts_demo "$ACOUSTIC_MODEL_PATH" "$VOCODER_PATH" $i "$OUTPUT_DIR/$i.wav")
done
ls -lh "$OUTPUT_DIR"/*.wav
set -x
./build/paddlespeech_tts_demo \
--front_conf "$FRONT_CONF" \
--acoustic_model "$ACOUSTIC_MODEL_PATH" \
--vocoder "$VOCODER_PATH" \
"$@"
# end

@ -1,4 +1,18 @@
cmake_minimum_required(VERSION 3.10)
project(paddlespeech_tts_demo)
########## Global Options ##########
option(WITH_FRONT_DEMO "Build front demo" OFF)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(ABSL_PROPAGATE_CXX_STD ON)
########## ARM Options ##########
set(CMAKE_SYSTEM_NAME Linux)
if(ARM_ABI STREQUAL "armv8")
set(CMAKE_SYSTEM_PROCESSOR aarch64)
@ -13,7 +27,9 @@ else()
return()
endif()
project(paddlespeech_tts_demo)
########## Paddle Lite Options ##########
message(STATUS "TARGET ARCH ABI: ${ARM_ABI}")
message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}")
@ -29,6 +45,9 @@ elseif(ARM_ABI STREQUAL "armv7hf")
set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" )
endif()
########## Dependencies ##########
find_package(OpenMP REQUIRED)
if(OpenMP_FOUND OR OpenMP_CXX_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
@ -43,5 +62,19 @@ else()
return()
endif()
############### tts cpp frontend ###############
add_subdirectory(TTSCppFrontend)
include_directories(
TTSCppFrontend/src
third-party/build/src/cppjieba/include
third-party/build/src/limonp/include
)
############### paddlespeech_tts_demo ###############
add_executable(paddlespeech_tts_demo main.cc)
target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared)
target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared paddlespeech_tts_front)

@ -9,42 +9,43 @@
using namespace paddle::lite_api;
class PredictorInterface {
public:
virtual bool Init(
const std::string &AcousticModelPath,
const std::string &VocoderPath,
PowerMode cpuPowerMode,
int cpuThreadNum,
// WAV采样率必须与模型输出匹配
// 如果播放速度和音调异常,请修改采样率
// 常见采样率16000, 24000, 32000, 44100, 48000, 96000
uint32_t wavSampleRate
) = 0;
virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) = 0;
virtual void ReleaseModel() = 0;
virtual bool RunModel(const std::vector<int64_t> &phones) = 0;
virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) = 0;
virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) = 0;
virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) = 0;
virtual void SaveFloatWav(float *floatWav, int64_t size) = 0;
virtual bool IsLoaded() = 0;
virtual float GetInferenceTime() = 0;
virtual int GetWavSize() = 0;
// 获取WAV持续时间(单位:毫秒)
virtual float GetWavDuration() = 0;
// 获取RTF(合成时间 / 音频时长)
virtual float GetRTF() = 0;
virtual void ReleaseWav() = 0;
virtual bool WriteWavToFile(const std::string &wavPath) = 0;
};
// WavDataType: WAV数据类型
// 可在 int16_t 和 float 之间切换,
// 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV
template<typename WavDataType>
class Predictor {
class Predictor : public PredictorInterface {
public:
struct WavHeader {
// RIFF 头
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t size = 0;
char wave[4] = {'W', 'A', 'V', 'E'};
// FMT 头
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_size = 16;
uint16_t audio_format = 0;
uint16_t num_channels = 1;
uint32_t sample_rate = 0;
uint32_t byte_rate = 0;
uint16_t block_align = 0;
uint16_t bits_per_sample = sizeof(WavDataType) * 8;
// DATA 头
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size = 0;
};
enum WavAudioFormat {
WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式
WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float 格式
};
// 返回值通过模板特化由 WavDataType 决定
inline uint16_t GetWavAudioFormat();
bool Init(
virtual bool Init(
const std::string &AcousticModelPath,
const std::string &VocoderPath,
PowerMode cpuPowerMode,
@ -53,7 +54,7 @@ public:
// 如果播放速度和音调异常,请修改采样率
// 常见采样率16000, 24000, 32000, 44100, 48000, 96000
uint32_t wavSampleRate
) {
) override {
// Release model if exists
ReleaseModel();
@ -71,12 +72,12 @@ public:
return true;
}
~Predictor() {
virtual ~Predictor() {
ReleaseModel();
ReleaseWav();
}
std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) {
virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) override {
if (modelPath.empty()) {
return nullptr;
}
@ -90,12 +91,12 @@ public:
return CreatePaddlePredictor<MobileConfig>(config);
}
void ReleaseModel() {
virtual void ReleaseModel() override {
acoustic_model_predictor_ = nullptr;
vocoder_predictor_ = nullptr;
}
bool RunModel(const std::vector<int64_t> &phones) {
virtual bool RunModel(const std::vector<int64_t> &phones) override {
if (!IsLoaded()) {
return false;
}
@ -116,7 +117,7 @@ public:
return true;
}
std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) {
virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) override {
auto phones_handle = acoustic_model_predictor_->GetInput(0);
phones_handle->Resize({static_cast<int64_t>(phones.size())});
phones_handle->CopyFromCpu(phones.data());
@ -135,7 +136,7 @@ public:
return am_output_handle;
}
std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) {
virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) override {
auto mel_handle = vocoder_predictor_->GetInput(0);
// [?, 80]
auto dims = amOutput->shape();
@ -157,7 +158,7 @@ public:
return voc_output_handle;
}
void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) {
virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) override {
// 获取输出Tensor的数据
int64_t output_size = 1;
for (auto dim : vocOutput->shape()) {
@ -168,17 +169,13 @@ public:
SaveFloatWav(output_data, output_size);
}
inline float Abs(float number) {
return (number < 0) ? -number : number;
}
void SaveFloatWav(float *floatWav, int64_t size);
virtual void SaveFloatWav(float *floatWav, int64_t size) override;
bool IsLoaded() {
virtual bool IsLoaded() override {
return acoustic_model_predictor_ != nullptr && vocoder_predictor_ != nullptr;
}
float GetInferenceTime() {
virtual float GetInferenceTime() override {
return inference_time_;
}
@ -186,25 +183,25 @@ public:
return wav_;
}
int GetWavSize() {
virtual int GetWavSize() override {
return wav_.size() * sizeof(WavDataType);
}
// 获取WAV持续时间(单位:毫秒)
float GetWavDuration() {
virtual float GetWavDuration() override {
return static_cast<float>(GetWavSize()) / sizeof(WavDataType) / static_cast<float>(wav_sample_rate_) * 1000;
}
// 获取RTF(合成时间 / 音频时长)
float GetRTF() {
virtual float GetRTF() override {
return GetInferenceTime() / GetWavDuration();
}
void ReleaseWav() {
virtual void ReleaseWav() override {
wav_.clear();
}
bool WriteWavToFile(const std::string &wavPath) {
virtual bool WriteWavToFile(const std::string &wavPath) override {
std::ofstream fout(wavPath, std::ios::binary);
if (!fout.is_open()) {
return false;
@ -227,7 +224,42 @@ public:
return true;
}
private:
protected:
struct WavHeader {
// RIFF 头
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t size = 0;
char wave[4] = {'W', 'A', 'V', 'E'};
// FMT 头
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_size = 16;
uint16_t audio_format = 0;
uint16_t num_channels = 1;
uint32_t sample_rate = 0;
uint32_t byte_rate = 0;
uint16_t block_align = 0;
uint16_t bits_per_sample = sizeof(WavDataType) * 8;
// DATA 头
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size = 0;
};
enum WavAudioFormat {
WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式
WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float 格式
};
protected:
// 返回值通过模板特化由 WavDataType 决定
inline uint16_t GetWavAudioFormat();
inline float Abs(float number) {
return (number < 0) ? -number : number;
}
protected:
float inference_time_ = 0;
uint32_t wav_sample_rate_ = 0;
std::vector<WavDataType> wav_;

@ -0,0 +1 @@
../../TTSCppFrontend/

@ -1,90 +1,128 @@
#include <cstdlib>
#include <iostream>
#include <memory>
#include "paddle_api.h"
#include <string>
#include <map>
#include <glog/logging.h>
#include <gflags/gflags.h>
#include <paddle_api.h>
#include <front/front_interface.h>
#include "Predictor.hpp"
using namespace paddle::lite_api;
std::vector<std::vector<int64_t>> sentencesToChoose = {
// 009901 昨日,这名“伤者”与医生全部被警方依法刑事拘留。
{261, 231, 175, 116, 179, 262, 44, 154, 126, 177, 19, 262, 42, 241, 72, 177, 56, 174, 245, 37, 186, 37, 49, 151, 127, 69, 19, 179, 72, 69, 4, 260, 126, 177, 116, 151, 239, 153, 141},
// 009902 钱伟长想到上海来办学校是经过深思熟虑的。
{174, 83, 213, 39, 20, 260, 89, 40, 30, 177, 22, 71, 9, 153, 8, 37, 17, 260, 251, 260, 99, 179, 177, 116, 151, 125, 70, 233, 177, 51, 176, 108, 177, 184, 153, 242, 40, 45},
// 009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。
{182, 2, 151, 85, 232, 73, 151, 123, 154, 52, 151, 143, 154, 5, 179, 39, 113, 69, 17, 177, 114, 105, 154, 5, 179, 154, 5, 40, 45, 232, 182, 8, 37, 186, 174, 74, 182, 168},
// 009904 李述德在离开之前,只说了一句“柱驼杀父亲了”。
{153, 74, 177, 186, 40, 42, 261, 10, 153, 73, 152, 7, 262, 113, 174, 83, 179, 262, 115, 177, 230, 153, 45, 73, 151, 242, 180, 262, 186, 182, 231, 177, 2, 69, 186, 174, 124, 153, 45},
// 009905 这种车票和保险单捆绑出售属于重复性购买。
{262, 44, 262, 163, 39, 41, 173, 99, 71, 42, 37, 28, 260, 84, 40, 14, 179, 152, 220, 37, 21, 39, 183, 177, 170, 179, 177, 185, 240, 39, 162, 69, 186, 260, 128, 70, 170, 154, 9},
// 009906 戴佩妮的男友西米露接唱情歌,让她非常开心。
{40, 10, 173, 49, 155, 72, 40, 45, 155, 15, 142, 260, 72, 154, 74, 153, 186, 179, 151, 103, 39, 22, 174, 126, 70, 41, 179, 175, 22, 182, 2, 69, 46, 39, 20, 152, 7, 260, 120},
// 009907 观大势、谋大局、出大策始终是该院的办院方针。
{70, 199, 40, 5, 177, 116, 154, 168, 40, 5, 151, 240, 179, 39, 183, 40, 5, 38, 44, 179, 177, 115, 262, 161, 177, 116, 70, 7, 247, 40, 45, 37, 17, 247, 69, 19, 262, 51},
// 009908 他们骑着摩托回家,正好为农忙时的父母帮忙。
{182, 2, 154, 55, 174, 73, 262, 45, 154, 157, 182, 230, 71, 212, 151, 77, 180, 262, 59, 71, 29, 214, 155, 162, 154, 20, 177, 114, 40, 45, 69, 186, 154, 185, 37, 19, 154, 20},
// 009909 但是因为还没到退休年龄,只能掰着指头捱日子。
{40, 17, 177, 116, 120, 214, 71, 8, 154, 47, 40, 30, 182, 214, 260, 140, 155, 83, 153, 126, 180, 262, 115, 155, 57, 37, 7, 262, 45, 262, 115, 182, 171, 8, 175, 116, 261, 112},
// 009910 这几天雨水不断,人们恨不得待在家里不出门。
{262, 44, 151, 74, 182, 82, 240, 177, 213, 37, 184, 40, 202, 180, 175, 52, 154, 55, 71, 54, 37, 186, 40, 42, 40, 7, 261, 10, 151, 77, 153, 74, 37, 186, 39, 183, 154, 52},
};
void usage(const char *binName) {
std::cerr << "Usage:" << std::endl
<< "\t" << binName << " <AM-model-path> <VOC-model-path> <sentences-index:1-10> <output-wav-path>" << std::endl;
}
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized (Chinese only. English will crash the program.)");
DEFINE_string(front_conf, "./front.conf", "Front configuration file");
DEFINE_string(acoustic_model, "./models/cpu/fastspeech2_csmsc_arm.nb", "Acoustic model .nb file");
DEFINE_string(vocoder, "./models/cpu/fastspeech2_csmsc_arm.nb", "vocoder .nb file");
DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file");
DEFINE_string(wav_bit_depth, "16", "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)");
DEFINE_string(wav_sample_rate, "24000", "WAV sample rate, should match the output of the vocoder");
DEFINE_string(cpu_thread, "1", "CPU thread numbers");
int main(int argc, char *argv[]) {
if (argc < 5) {
usage(argv[0]);
gflags::ParseCommandLineFlags(&argc, &argv, true);
PredictorInterface *predictor;
if (FLAGS_wav_bit_depth == "16") {
predictor = new Predictor<int16_t>();
} else if (FLAGS_wav_bit_depth == "32") {
predictor = new Predictor<float>();
} else {
LOG(ERROR) << "Unsupported WAV bit depth: " << FLAGS_wav_bit_depth;
return -1;
}
const char *AMModelPath = argv[1];
const char *VOCModelPath = argv[2];
int sentencesIndex = atoi(argv[3]) - 1;
const char *outputWavPath = argv[4];
if (sentencesIndex < 0 || sentencesIndex >= sentencesToChoose.size()) {
std::cerr << "sentences-index out of range" << std::endl;
/////////////////////////// 前端:文本转音素 ///////////////////////////
// 实例化文本前端引擎
speechnn::FrontEngineInterface *front_inst = nullptr;
front_inst = new speechnn::FrontEngineInterface(FLAGS_front_conf);
if ((!front_inst) || (front_inst->init())) {
LOG(ERROR) << "Creater tts engine failed!";
if (front_inst != nullptr) {
delete front_inst;
}
front_inst = nullptr;
return -1;
}
// 模板参数WAV数据类型
// 可在 int16_t 和 float 之间切换,
// 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV
Predictor<int16_t> predictor;
//Predictor<float> predictor;
std::wstring ws_sentence = speechnn::utf8string2wstring(FLAGS_sentence);
// 繁体转简体
std::wstring sentence_simp;
front_inst->Trand2Simp(ws_sentence, sentence_simp);
ws_sentence = sentence_simp;
std::string s_sentence;
std::vector<std::wstring> sentence_part;
std::vector<int> phoneids = {};
std::vector<int> toneids = {};
// 根据标点进行分句
LOG(INFO) << "Start to segment sentences by punctuation";
front_inst->SplitByPunc(ws_sentence, sentence_part);
LOG(INFO) << "Segment sentences through punctuation successfully";
// 分句后获取音素id
LOG(INFO) << "Start to get the phoneme and tone id sequence of each sentence";
for(int i = 0; i < sentence_part.size(); i++) {
LOG(INFO) << "Raw sentence is: " << speechnn::wstring2utf8string(sentence_part[i]);
front_inst->SentenceNormalize(sentence_part[i]);
s_sentence = speechnn::wstring2utf8string(sentence_part[i]);
LOG(INFO) << "After normalization sentence is: " << s_sentence;
if (0 != front_inst->GetSentenceIds(s_sentence, phoneids, toneids)) {
LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
return -1;
}
}
LOG(INFO) << "The phoneids of the sentence is: " << limonp::Join(phoneids.begin(), phoneids.end(), " ");
LOG(INFO) << "The toneids of the sentence is: " << limonp::Join(toneids.begin(), toneids.end(), " ");
LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
/////////////////////////// 后端:音素转音频 ///////////////////////////
// WAV采样率必须与模型输出匹配
// 如果播放速度和音调异常,请修改采样率
// 常见采样率16000, 24000, 32000, 44100, 48000, 96000
const uint32_t wavSampleRate = 24000;
const uint32_t wavSampleRate = std::stoul(FLAGS_wav_sample_rate);
// CPU线程数
const int cpuThreadNum = 1;
const int cpuThreadNum = std::stol(FLAGS_cpu_thread);
// CPU电源模式
const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH;
if (!predictor.Init(AMModelPath, VOCModelPath, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
std::cerr << "predictor init failed" << std::endl;
if (!predictor->Init(FLAGS_acoustic_model, FLAGS_vocoder, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
LOG(ERROR) << "predictor init failed" << std::endl;
return -1;
}
if (!predictor.RunModel(sentencesToChoose[sentencesIndex])) {
std::cerr << "predictor run model failed" << std::endl;
std::vector<int64_t> phones(phoneids.size());
std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { return static_cast<int64_t>(x); });
if (!predictor->RunModel(phones)) {
LOG(ERROR) << "predictor run model failed" << std::endl;
return -1;
}
std::cout << "Inference time: " << predictor.GetInferenceTime() << " ms, "
<< "WAV size (without header): " << predictor.GetWavSize() << " bytes, "
<< "WAV duration: " << predictor.GetWavDuration() << " ms, "
<< "RTF: " << predictor.GetRTF() << std::endl;
LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, "
<< "WAV size (without header): " << predictor->GetWavSize() << " bytes, "
<< "WAV duration: " << predictor->GetWavDuration() << " ms, "
<< "RTF: " << predictor->GetRTF() << std::endl;
if (!predictor.WriteWavToFile(outputWavPath)) {
std::cerr << "write wav file failed" << std::endl;
if (!predictor->WriteWavToFile(FLAGS_output_wav)) {
LOG(ERROR) << "write wav file failed" << std::endl;
return -1;
}
delete predictor;
return 0;
}

@ -0,0 +1 @@
TTSCppFrontend/third-party
Loading…
Cancel
Save