|
|
|
@ -1,7 +1,20 @@
|
|
|
|
|
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
|
|
|
|
//
|
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
|
//
|
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
//
|
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
|
// limitations under the License.
|
|
|
|
|
#include <algorithm>
|
|
|
|
|
#include <chrono>
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <fstream>
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <memory>
|
|
|
|
|
#include <string>
|
|
|
|
|
#include <vector>
|
|
|
|
@ -12,22 +25,26 @@ using namespace paddle::lite_api;
|
|
|
|
|
class PredictorInterface {
|
|
|
|
|
public:
|
|
|
|
|
virtual ~PredictorInterface() = 0;
|
|
|
|
|
virtual bool Init(
|
|
|
|
|
const std::string &AcousticModelPath,
|
|
|
|
|
virtual bool Init(const std::string &AcousticModelPath,
|
|
|
|
|
const std::string &VocoderPath,
|
|
|
|
|
PowerMode cpuPowerMode,
|
|
|
|
|
int cpuThreadNum,
|
|
|
|
|
// WAV采样率(必须与模型输出匹配)
|
|
|
|
|
// 如果播放速度和音调异常,请修改采样率
|
|
|
|
|
// 常见采样率:16000, 24000, 32000, 44100, 48000, 96000
|
|
|
|
|
uint32_t wavSampleRate
|
|
|
|
|
) = 0;
|
|
|
|
|
virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) = 0;
|
|
|
|
|
uint32_t wavSampleRate) = 0;
|
|
|
|
|
virtual std::shared_ptr<PaddlePredictor> LoadModel(
|
|
|
|
|
const std::string &modelPath,
|
|
|
|
|
int cpuThreadNum,
|
|
|
|
|
PowerMode cpuPowerMode) = 0;
|
|
|
|
|
virtual void ReleaseModel() = 0;
|
|
|
|
|
virtual bool RunModel(const std::vector<int64_t> &phones) = 0;
|
|
|
|
|
virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) = 0;
|
|
|
|
|
virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) = 0;
|
|
|
|
|
virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) = 0;
|
|
|
|
|
virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(
|
|
|
|
|
const std::vector<int64_t> &phones) = 0;
|
|
|
|
|
virtual std::unique_ptr<const Tensor> GetVocoderOutput(
|
|
|
|
|
std::unique_ptr<const Tensor> &&amOutput) = 0;
|
|
|
|
|
virtual void VocoderOutputToWav(
|
|
|
|
|
std::unique_ptr<const Tensor> &&vocOutput) = 0;
|
|
|
|
|
virtual void SaveFloatWav(float *floatWav, int64_t size) = 0;
|
|
|
|
|
virtual bool IsLoaded() = 0;
|
|
|
|
|
virtual float GetInferenceTime() = 0;
|
|
|
|
@ -48,20 +65,19 @@ PredictorInterface::~PredictorInterface() {}
|
|
|
|
|
template <typename WavDataType>
|
|
|
|
|
class Predictor : public PredictorInterface {
|
|
|
|
|
public:
|
|
|
|
|
virtual bool Init(
|
|
|
|
|
const std::string &AcousticModelPath,
|
|
|
|
|
virtual bool Init(const std::string &AcousticModelPath,
|
|
|
|
|
const std::string &VocoderPath,
|
|
|
|
|
PowerMode cpuPowerMode,
|
|
|
|
|
int cpuThreadNum,
|
|
|
|
|
// WAV采样率(必须与模型输出匹配)
|
|
|
|
|
// 如果播放速度和音调异常,请修改采样率
|
|
|
|
|
// 常见采样率:16000, 24000, 32000, 44100, 48000, 96000
|
|
|
|
|
uint32_t wavSampleRate
|
|
|
|
|
) override {
|
|
|
|
|
uint32_t wavSampleRate) override {
|
|
|
|
|
// Release model if exists
|
|
|
|
|
ReleaseModel();
|
|
|
|
|
|
|
|
|
|
acoustic_model_predictor_ = LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode);
|
|
|
|
|
acoustic_model_predictor_ =
|
|
|
|
|
LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode);
|
|
|
|
|
if (acoustic_model_predictor_ == nullptr) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
@ -80,7 +96,10 @@ public:
|
|
|
|
|
ReleaseWav();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) override {
|
|
|
|
|
virtual std::shared_ptr<PaddlePredictor> LoadModel(
|
|
|
|
|
const std::string &modelPath,
|
|
|
|
|
int cpuThreadNum,
|
|
|
|
|
PowerMode cpuPowerMode) override {
|
|
|
|
|
if (modelPath.empty()) {
|
|
|
|
|
return nullptr;
|
|
|
|
|
}
|
|
|
|
@ -120,7 +139,8 @@ public:
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) override {
|
|
|
|
|
virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(
|
|
|
|
|
const std::vector<int64_t> &phones) override {
|
|
|
|
|
auto phones_handle = acoustic_model_predictor_->GetInput(0);
|
|
|
|
|
phones_handle->Resize({static_cast<int64_t>(phones.size())});
|
|
|
|
|
phones_handle->CopyFromCpu(phones.data());
|
|
|
|
@ -139,7 +159,8 @@ public:
|
|
|
|
|
return am_output_handle;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) override {
|
|
|
|
|
virtual std::unique_ptr<const Tensor> GetVocoderOutput(
|
|
|
|
|
std::unique_ptr<const Tensor> &&amOutput) override {
|
|
|
|
|
auto mel_handle = vocoder_predictor_->GetInput(0);
|
|
|
|
|
// [?, 80]
|
|
|
|
|
auto dims = amOutput->shape();
|
|
|
|
@ -161,7 +182,8 @@ public:
|
|
|
|
|
return voc_output_handle;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) override {
|
|
|
|
|
virtual void VocoderOutputToWav(
|
|
|
|
|
std::unique_ptr<const Tensor> &&vocOutput) override {
|
|
|
|
|
// 获取输出Tensor的数据
|
|
|
|
|
int64_t output_size = 1;
|
|
|
|
|
for (auto dim : vocOutput->shape()) {
|
|
|
|
@ -175,16 +197,13 @@ public:
|
|
|
|
|
virtual void SaveFloatWav(float *floatWav, int64_t size) override;
|
|
|
|
|
|
|
|
|
|
virtual bool IsLoaded() override {
|
|
|
|
|
return acoustic_model_predictor_ != nullptr && vocoder_predictor_ != nullptr;
|
|
|
|
|
return acoustic_model_predictor_ != nullptr &&
|
|
|
|
|
vocoder_predictor_ != nullptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual float GetInferenceTime() override {
|
|
|
|
|
return inference_time_;
|
|
|
|
|
}
|
|
|
|
|
virtual float GetInferenceTime() override { return inference_time_; }
|
|
|
|
|
|
|
|
|
|
const std::vector<WavDataType> & GetWav() {
|
|
|
|
|
return wav_;
|
|
|
|
|
}
|
|
|
|
|
const std::vector<WavDataType> &GetWav() { return wav_; }
|
|
|
|
|
|
|
|
|
|
virtual int GetWavSize() override {
|
|
|
|
|
return wav_.size() * sizeof(WavDataType);
|
|
|
|
@ -192,7 +211,8 @@ public:
|
|
|
|
|
|
|
|
|
|
// 获取WAV持续时间(单位:毫秒)
|
|
|
|
|
virtual float GetWavDuration() override {
|
|
|
|
|
return static_cast<float>(GetWavSize()) / sizeof(WavDataType) / static_cast<float>(wav_sample_rate_) * 1000;
|
|
|
|
|
return static_cast<float>(GetWavSize()) / sizeof(WavDataType) /
|
|
|
|
|
static_cast<float>(wav_sample_rate_) * 1000;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Get the RTF (synthesis time / audio duration)
|
|
|
|
@ -200,9 +220,7 @@ public:
|
|
|
|
|
return GetInferenceTime() / GetWavDuration();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Empties the wav_ sample buffer by calling clear() on it.
virtual void ReleaseWav() override { wav_.clear(); }
|
|
|
|
|
|
|
|
|
|
virtual bool WriteWavToFile(const std::string &wavPath) override {
|
|
|
|
|
std::ofstream fout(wavPath, std::ios::binary);
|
|
|
|
@ -216,12 +234,14 @@ public:
|
|
|
|
|
header.data_size = GetWavSize();
|
|
|
|
|
header.size = sizeof(header) - 8 + header.data_size;
|
|
|
|
|
header.sample_rate = wav_sample_rate_;
|
|
|
|
|
header.byte_rate = header.sample_rate * header.num_channels * header.bits_per_sample / 8;
|
|
|
|
|
header.byte_rate = header.sample_rate * header.num_channels *
|
|
|
|
|
header.bits_per_sample / 8;
|
|
|
|
|
header.block_align = header.num_channels * header.bits_per_sample / 8;
|
|
|
|
|
fout.write(reinterpret_cast<const char *>(&header), sizeof(header));
|
|
|
|
|
|
|
|
|
|
// 写入wav数据
|
|
|
|
|
fout.write(reinterpret_cast<const char*>(wav_.data()), header.data_size);
|
|
|
|
|
fout.write(reinterpret_cast<const char *>(wav_.data()),
|
|
|
|
|
header.data_size);
|
|
|
|
|
|
|
|
|
|
fout.close();
|
|
|
|
|
return true;
|
|
|
|
@ -258,9 +278,7 @@ protected:
|
|
|
|
|
// The return value is determined by WavDataType via template specialization
|
|
|
|
|
inline uint16_t GetWavAudioFormat();
|
|
|
|
|
|
|
|
|
|
// Returns the magnitude of `number`.
// Only strictly negative inputs are negated; every other input (including
// zero) is returned unchanged.
inline float Abs(float number) { return (number < 0) ? -number : number; }
|
|
|
|
|
|
|
|
|
|
protected:
|
|
|
|
|
float inference_time_ = 0;
|
|
|
|
|