|
|
|
@ -1,90 +1,128 @@
|
|
|
|
|
#include <cerrno>
#include <climits>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "paddle_api.h"
#include <glog/logging.h>
#include <gflags/gflags.h>
#include <paddle_api.h>
#include <front/front_interface.h>
#include "Predictor.hpp"
|
|
|
|
|
|
|
|
|
|
using namespace paddle::lite_api;
|
|
|
|
|
|
|
|
|
|
// Built-in sample inputs: one row of integer IDs per sentence.
// Each row is preceded by a corpus-style ID, the original Chinese sentence it
// was annotated with, and an English gloss of that sentence.
// NOTE(review): the rows are presumably phoneme-ID sequences produced by the
// text front end for the quoted sentences - confirm against the model's
// phone-ID map before editing any value.
std::vector<std::vector<int64_t>> sentencesToChoose = {
    // 009901 "昨日,这名“伤者”与医生全部被警方依法刑事拘留。"
    // (Yesterday, this "injured person" and the doctors were all placed under
    // criminal detention by the police in accordance with the law.)
    {261, 231, 175, 116, 179, 262, 44, 154, 126, 177, 19, 262, 42, 241, 72, 177, 56, 174, 245, 37, 186, 37, 49, 151, 127, 69, 19, 179, 72, 69, 4, 260, 126, 177, 116, 151, 239, 153, 141},
    // 009902 "钱伟长想到上海来办学校是经过深思熟虑的。"
    // (Qian Weichang's wish to come to Shanghai to run a school was carefully
    // thought through.)
    {174, 83, 213, 39, 20, 260, 89, 40, 30, 177, 22, 71, 9, 153, 8, 37, 17, 260, 251, 260, 99, 179, 177, 116, 151, 125, 70, 233, 177, 51, 176, 108, 177, 184, 153, 242, 40, 45},
    // 009903 "她见我一进门就骂,吃饭时也骂,骂得我抬不起头。"
    // (She scolded me the moment I walked in, scolded me at meals too, until
    // I could not hold my head up.)
    {182, 2, 151, 85, 232, 73, 151, 123, 154, 52, 151, 143, 154, 5, 179, 39, 113, 69, 17, 177, 114, 105, 154, 5, 179, 154, 5, 40, 45, 232, 182, 8, 37, 186, 174, 74, 182, 168},
    // 009904 "李述德在离开之前,只说了一句“柱驼杀父亲了”。"
    // (Before leaving, Li Shude said only one sentence: "Zhutuo killed
    // father.")
    {153, 74, 177, 186, 40, 42, 261, 10, 153, 73, 152, 7, 262, 113, 174, 83, 179, 262, 115, 177, 230, 153, 45, 73, 151, 242, 180, 262, 186, 182, 231, 177, 2, 69, 186, 174, 124, 153, 45},
    // 009905 "这种车票和保险单捆绑出售属于重复性购买。"
    // (Selling this kind of ticket bundled with an insurance policy amounts
    // to a duplicate purchase.)
    {262, 44, 262, 163, 39, 41, 173, 99, 71, 42, 37, 28, 260, 84, 40, 14, 179, 152, 220, 37, 21, 39, 183, 177, 170, 179, 177, 185, 240, 39, 162, 69, 186, 260, 128, 70, 170, 154, 9},
    // 009906 "戴佩妮的男友西米露接唱情歌,让她非常开心。"
    // (Penny Tai's boyfriend Ximilu took over singing a love song, which made
    // her very happy.)
    {40, 10, 173, 49, 155, 72, 40, 45, 155, 15, 142, 260, 72, 154, 74, 153, 186, 179, 151, 103, 39, 22, 174, 126, 70, 41, 179, 175, 22, 182, 2, 69, 46, 39, 20, 152, 7, 260, 120},
    // 009907 "观大势、谋大局、出大策始终是该院的办院方针。"
    // (Observing the broad trends, planning for the big picture and producing
    // major strategies has always been the institute's guiding policy.)
    {70, 199, 40, 5, 177, 116, 154, 168, 40, 5, 151, 240, 179, 39, 183, 40, 5, 38, 44, 179, 177, 115, 262, 161, 177, 116, 70, 7, 247, 40, 45, 37, 17, 247, 69, 19, 262, 51},
    // 009908 "他们骑着摩托回家,正好为农忙时的父母帮忙。"
    // (They rode motorcycles home, just in time to help their parents during
    // the busy farming season.)
    {182, 2, 154, 55, 174, 73, 262, 45, 154, 157, 182, 230, 71, 212, 151, 77, 180, 262, 59, 71, 29, 214, 155, 162, 154, 20, 177, 114, 40, 45, 69, 186, 154, 185, 37, 19, 154, 20},
    // 009909 "但是因为还没到退休年龄,只能掰着指头捱日子。"
    // (But since they have not reached retirement age, they can only count
    // the days on their fingers.)
    {40, 17, 177, 116, 120, 214, 71, 8, 154, 47, 40, 30, 182, 214, 260, 140, 155, 83, 153, 126, 180, 262, 115, 155, 57, 37, 7, 262, 45, 262, 115, 182, 171, 8, 175, 116, 261, 112},
    // 009910 "这几天雨水不断,人们恨不得待在家里不出门。"
    // (It has been raining nonstop these days, and people would rather stay
    // home than go out.)
    {262, 44, 151, 74, 182, 82, 240, 177, 213, 37, 184, 40, 202, 180, 175, 52, 154, 55, 71, 54, 37, 186, 40, 42, 40, 7, 261, 10, 151, 77, 153, 74, 37, 186, 39, 183, 154, 52},
};
|
|
|
|
|
|
|
|
|
|
/**
 * Prints the positional-argument synopsis of the program to stderr.
 *
 * @param binName Program name to show in the synopsis (callers pass argv[0]).
 *                A null pointer is replaced by a placeholder, because
 *                streaming a null `const char *` into an ostream is undefined
 *                behaviour.
 */
void usage(const char *binName) {
    const char *shownName = (binName != nullptr) ? binName : "<program>";
    std::cerr << "Usage:" << std::endl
              << "\t" << shownName << " <AM-model-path> <VOC-model-path> <sentences-index:1-10> <output-wav-path>" << std::endl;
}
|
|
|
|
|
// Command-line flags (gflags). Every flag is declared as a string; the numeric
// ones (wav_sample_rate, cpu_thread) are converted to integers inside main().

// Text to synthesize.
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized (Chinese only. English will crash the program.)");
// Configuration file handed to the text front-end engine.
DEFINE_string(front_conf, "./front.conf", "Front configuration file");
// Acoustic model file.
DEFINE_string(acoustic_model, "./models/cpu/fastspeech2_csmsc_arm.nb", "Acoustic model .nb file");
// Vocoder model file.
// NOTE(review): this default is byte-identical to the --acoustic_model
// default, which looks like a copy-paste slip - confirm the intended vocoder
// .nb file name and update the default.
DEFINE_string(vocoder, "./models/cpu/fastspeech2_csmsc_arm.nb", "vocoder .nb file");
// Destination of the generated audio.
DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file");
// Sample format selector; main() accepts exactly "16" or "32".
DEFINE_string(wav_bit_depth, "16", "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)");
// Sample rate passed to the predictor.
DEFINE_string(wav_sample_rate, "24000", "WAV sample rate, should match the output of the vocoder");
// Number of CPU threads passed to the predictor.
DEFINE_string(cpu_thread, "1", "CPU thread numbers");
|
|
|
|
|
|
|
|
|
|
int main(int argc, char *argv[]) {
|
|
|
|
|
if (argc < 5) {
|
|
|
|
|
usage(argv[0]);
|
|
|
|
|
gflags::ParseCommandLineFlags(&argc, &argv, true);
|
|
|
|
|
|
|
|
|
|
PredictorInterface *predictor;
|
|
|
|
|
|
|
|
|
|
if (FLAGS_wav_bit_depth == "16") {
|
|
|
|
|
predictor = new Predictor<int16_t>();
|
|
|
|
|
} else if (FLAGS_wav_bit_depth == "32") {
|
|
|
|
|
predictor = new Predictor<float>();
|
|
|
|
|
} else {
|
|
|
|
|
LOG(ERROR) << "Unsupported WAV bit depth: " << FLAGS_wav_bit_depth;
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
const char *AMModelPath = argv[1];
|
|
|
|
|
const char *VOCModelPath = argv[2];
|
|
|
|
|
int sentencesIndex = atoi(argv[3]) - 1;
|
|
|
|
|
const char *outputWavPath = argv[4];
|
|
|
|
|
|
|
|
|
|
if (sentencesIndex < 0 || sentencesIndex >= sentencesToChoose.size()) {
|
|
|
|
|
std::cerr << "sentences-index out of range" << std::endl;
|
|
|
|
|
|
|
|
|
|
/////////////////////////// 前端:文本转音素 ///////////////////////////
|
|
|
|
|
|
|
|
|
|
// 实例化文本前端引擎
|
|
|
|
|
speechnn::FrontEngineInterface *front_inst = nullptr;
|
|
|
|
|
front_inst = new speechnn::FrontEngineInterface(FLAGS_front_conf);
|
|
|
|
|
if ((!front_inst) || (front_inst->init())) {
|
|
|
|
|
LOG(ERROR) << "Creater tts engine failed!";
|
|
|
|
|
if (front_inst != nullptr) {
|
|
|
|
|
delete front_inst;
|
|
|
|
|
}
|
|
|
|
|
front_inst = nullptr;
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 模板参数:WAV数据类型
|
|
|
|
|
// 可在 int16_t 和 float 之间切换,
|
|
|
|
|
// 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV
|
|
|
|
|
Predictor<int16_t> predictor;
|
|
|
|
|
//Predictor<float> predictor;
|
|
|
|
|
std::wstring ws_sentence = speechnn::utf8string2wstring(FLAGS_sentence);
|
|
|
|
|
|
|
|
|
|
// 繁体转简体
|
|
|
|
|
std::wstring sentence_simp;
|
|
|
|
|
front_inst->Trand2Simp(ws_sentence, sentence_simp);
|
|
|
|
|
ws_sentence = sentence_simp;
|
|
|
|
|
|
|
|
|
|
std::string s_sentence;
|
|
|
|
|
std::vector<std::wstring> sentence_part;
|
|
|
|
|
std::vector<int> phoneids = {};
|
|
|
|
|
std::vector<int> toneids = {};
|
|
|
|
|
|
|
|
|
|
// 根据标点进行分句
|
|
|
|
|
LOG(INFO) << "Start to segment sentences by punctuation";
|
|
|
|
|
front_inst->SplitByPunc(ws_sentence, sentence_part);
|
|
|
|
|
LOG(INFO) << "Segment sentences through punctuation successfully";
|
|
|
|
|
|
|
|
|
|
// 分句后获取音素id
|
|
|
|
|
LOG(INFO) << "Start to get the phoneme and tone id sequence of each sentence";
|
|
|
|
|
for(int i = 0; i < sentence_part.size(); i++) {
|
|
|
|
|
|
|
|
|
|
LOG(INFO) << "Raw sentence is: " << speechnn::wstring2utf8string(sentence_part[i]);
|
|
|
|
|
front_inst->SentenceNormalize(sentence_part[i]);
|
|
|
|
|
s_sentence = speechnn::wstring2utf8string(sentence_part[i]);
|
|
|
|
|
LOG(INFO) << "After normalization sentence is: " << s_sentence;
|
|
|
|
|
|
|
|
|
|
if (0 != front_inst->GetSentenceIds(s_sentence, phoneids, toneids)) {
|
|
|
|
|
LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
LOG(INFO) << "The phoneids of the sentence is: " << limonp::Join(phoneids.begin(), phoneids.end(), " ");
|
|
|
|
|
LOG(INFO) << "The toneids of the sentence is: " << limonp::Join(toneids.begin(), toneids.end(), " ");
|
|
|
|
|
LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/////////////////////////// 后端:音素转音频 ///////////////////////////
|
|
|
|
|
|
|
|
|
|
// WAV采样率(必须与模型输出匹配)
|
|
|
|
|
// 如果播放速度和音调异常,请修改采样率
|
|
|
|
|
// 常见采样率:16000, 24000, 32000, 44100, 48000, 96000
|
|
|
|
|
const uint32_t wavSampleRate = 24000;
|
|
|
|
|
const uint32_t wavSampleRate = std::stoul(FLAGS_wav_sample_rate);
|
|
|
|
|
|
|
|
|
|
// CPU线程数
|
|
|
|
|
const int cpuThreadNum = 1;
|
|
|
|
|
const int cpuThreadNum = std::stol(FLAGS_cpu_thread);
|
|
|
|
|
|
|
|
|
|
// CPU电源模式
|
|
|
|
|
const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH;
|
|
|
|
|
|
|
|
|
|
if (!predictor.Init(AMModelPath, VOCModelPath, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
|
|
|
|
|
std::cerr << "predictor init failed" << std::endl;
|
|
|
|
|
if (!predictor->Init(FLAGS_acoustic_model, FLAGS_vocoder, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
|
|
|
|
|
LOG(ERROR) << "predictor init failed" << std::endl;
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!predictor.RunModel(sentencesToChoose[sentencesIndex])) {
|
|
|
|
|
std::cerr << "predictor run model failed" << std::endl;
|
|
|
|
|
std::vector<int64_t> phones(phoneids.size());
|
|
|
|
|
std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { return static_cast<int64_t>(x); });
|
|
|
|
|
|
|
|
|
|
if (!predictor->RunModel(phones)) {
|
|
|
|
|
LOG(ERROR) << "predictor run model failed" << std::endl;
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::cout << "Inference time: " << predictor.GetInferenceTime() << " ms, "
|
|
|
|
|
<< "WAV size (without header): " << predictor.GetWavSize() << " bytes, "
|
|
|
|
|
<< "WAV duration: " << predictor.GetWavDuration() << " ms, "
|
|
|
|
|
<< "RTF: " << predictor.GetRTF() << std::endl;
|
|
|
|
|
LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, "
|
|
|
|
|
<< "WAV size (without header): " << predictor->GetWavSize() << " bytes, "
|
|
|
|
|
<< "WAV duration: " << predictor->GetWavDuration() << " ms, "
|
|
|
|
|
<< "RTF: " << predictor->GetRTF() << std::endl;
|
|
|
|
|
|
|
|
|
|
if (!predictor.WriteWavToFile(outputWavPath)) {
|
|
|
|
|
std::cerr << "write wav file failed" << std::endl;
|
|
|
|
|
if (!predictor->WriteWavToFile(FLAGS_output_wav)) {
|
|
|
|
|
LOG(ERROR) << "write wav file failed" << std::endl;
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
delete predictor;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|