diff --git a/demos/TTSArmLinux/src/Predictor.hpp b/demos/TTSArmLinux/src/Predictor.hpp index 985d01158..64d459ddb 100644 --- a/demos/TTSArmLinux/src/Predictor.hpp +++ b/demos/TTSArmLinux/src/Predictor.hpp @@ -1,7 +1,20 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include #include -#include #include +#include #include #include #include @@ -10,24 +23,28 @@ using namespace paddle::lite_api; class PredictorInterface { -public: + public: virtual ~PredictorInterface() = 0; - virtual bool Init( - const std::string &AcousticModelPath, - const std::string &VocoderPath, - PowerMode cpuPowerMode, - int cpuThreadNum, - // WAV采样率(必须与模型输出匹配) - // 如果播放速度和音调异常,请修改采样率 - // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 - uint32_t wavSampleRate - ) = 0; - virtual std::shared_ptr LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) = 0; + virtual bool Init(const std::string &AcousticModelPath, + const std::string &VocoderPath, + PowerMode cpuPowerMode, + int cpuThreadNum, + // WAV采样率(必须与模型输出匹配) + // 如果播放速度和音调异常,请修改采样率 + // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 + uint32_t wavSampleRate) = 0; + virtual std::shared_ptr LoadModel( + const std::string &modelPath, + int cpuThreadNum, + PowerMode cpuPowerMode) = 0; virtual void ReleaseModel() = 0; virtual bool RunModel(const std::vector &phones) = 0; - virtual std::unique_ptr GetAcousticModelOutput(const std::vector &phones) = 0; - virtual std::unique_ptr GetVocoderOutput(std::unique_ptr &&amOutput) = 0; - virtual void VocoderOutputToWav(std::unique_ptr &&vocOutput) = 0; + virtual std::unique_ptr GetAcousticModelOutput( + const std::vector &phones) = 0; + virtual std::unique_ptr GetVocoderOutput( + std::unique_ptr &&amOutput) = 0; + virtual void VocoderOutputToWav( + std::unique_ptr &&vocOutput) = 0; virtual void SaveFloatWav(float *floatWav, int64_t size) = 0; virtual bool IsLoaded() = 0; virtual float GetInferenceTime() = 0; @@ -45,23 +62,22 @@ PredictorInterface::~PredictorInterface() {} // WavDataType: WAV数据类型 // 可在 int16_t 和 float 之间切换, // 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV -template +template class Predictor : public PredictorInterface { -public: - virtual bool Init( - const std::string &AcousticModelPath, - const std::string &VocoderPath, - PowerMode cpuPowerMode, - int cpuThreadNum, - // WAV采样率(必须与模型输出匹配) - // 如果播放速度和音调异常,请修改采样率 - // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 - uint32_t wavSampleRate - ) override { + public: + virtual bool Init(const std::string &AcousticModelPath, + const std::string &VocoderPath, + PowerMode cpuPowerMode, + int cpuThreadNum, + // WAV采样率(必须与模型输出匹配) + // 如果播放速度和音调异常,请修改采样率 + // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 + uint32_t wavSampleRate) override { // Release model if exists ReleaseModel(); - acoustic_model_predictor_ = LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode); + acoustic_model_predictor_ = + LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode); if (acoustic_model_predictor_ == nullptr) { return false; } @@ -80,7 +96,10 @@ public: ReleaseWav(); } - virtual std::shared_ptr LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) override { + virtual std::shared_ptr LoadModel( + const std::string &modelPath, + int cpuThreadNum, + PowerMode cpuPowerMode) override { if (modelPath.empty()) { return nullptr; } @@ -115,12 +134,13 @@ public: // 计算用时 std::chrono::duration duration = end - start; - inference_time_ = duration.count() * 1000; // 单位:毫秒 + inference_time_ = duration.count() * 1000; // 单位:毫秒 return true; } - virtual std::unique_ptr GetAcousticModelOutput(const std::vector &phones) override { + virtual std::unique_ptr GetAcousticModelOutput( + const std::vector &phones) override { auto phones_handle = acoustic_model_predictor_->GetInput(0); phones_handle->Resize({static_cast(phones.size())}); phones_handle->CopyFromCpu(phones.data()); @@ -139,7 +159,8 @@ public: return am_output_handle; } - virtual std::unique_ptr GetVocoderOutput(std::unique_ptr &&amOutput) override { + virtual std::unique_ptr GetVocoderOutput( + std::unique_ptr &&amOutput) override { auto mel_handle = vocoder_predictor_->GetInput(0); // [?, 80] auto dims = amOutput->shape(); @@ -161,7 +182,8 @@ public: return voc_output_handle; } - virtual void VocoderOutputToWav(std::unique_ptr &&vocOutput) override { + virtual void VocoderOutputToWav( + std::unique_ptr &&vocOutput) override { // 获取输出Tensor的数据 int64_t output_size = 1; for (auto dim : vocOutput->shape()) { @@ -175,16 +197,13 @@ public: virtual void SaveFloatWav(float *floatWav, int64_t size) override; virtual bool IsLoaded() override { - return acoustic_model_predictor_ != nullptr && vocoder_predictor_ != nullptr; + return acoustic_model_predictor_ != nullptr && + vocoder_predictor_ != nullptr; } - virtual float GetInferenceTime() override { - return inference_time_; - } + virtual float GetInferenceTime() override { return inference_time_; } - const std::vector & GetWav() { - return wav_; - } + const std::vector &GetWav() { return wav_; } virtual int GetWavSize() override { return wav_.size() * sizeof(WavDataType); @@ -192,7 +211,8 @@ public: // 获取WAV持续时间(单位:毫秒) virtual float GetWavDuration() override { - return static_cast(GetWavSize()) / sizeof(WavDataType) / static_cast(wav_sample_rate_) * 1000; + return static_cast(GetWavSize()) / sizeof(WavDataType) / + static_cast(wav_sample_rate_) * 1000; } // 获取RTF(合成时间 / 音频时长) @@ -200,9 +220,7 @@ public: return GetInferenceTime() / GetWavDuration(); } - virtual void ReleaseWav() override { - wav_.clear(); - } + virtual void ReleaseWav() override { wav_.clear(); } virtual bool WriteWavToFile(const std::string &wavPath) override { std::ofstream fout(wavPath, std::ios::binary); @@ -216,18 +234,20 @@ public: header.data_size = GetWavSize(); header.size = sizeof(header) - 8 + header.data_size; header.sample_rate = wav_sample_rate_; - header.byte_rate = header.sample_rate * header.num_channels * header.bits_per_sample / 8; + header.byte_rate = header.sample_rate * header.num_channels * + header.bits_per_sample / 8; header.block_align = header.num_channels * header.bits_per_sample / 8; - fout.write(reinterpret_cast(&header), sizeof(header)); + fout.write(reinterpret_cast(&header), sizeof(header)); // 写入wav数据 - fout.write(reinterpret_cast(wav_.data()), header.data_size); + fout.write(reinterpret_cast(wav_.data()), + header.data_size); fout.close(); return true; } -protected: + protected: struct WavHeader { // RIFF 头 char riff[4] = {'R', 'I', 'F', 'F'}; @@ -250,19 +270,17 @@ protected: }; enum WavAudioFormat { - WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式 + WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式 WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float 格式 }; -protected: + protected: // 返回值通过模板特化由 WavDataType 决定 inline uint16_t GetWavAudioFormat(); - inline float Abs(float number) { - return (number < 0) ? -number : number; - } + inline float Abs(float number) { return (number < 0) ? -number : number; } -protected: + protected: float inference_time_ = 0; uint32_t wav_sample_rate_ = 0; std::vector wav_; @@ -270,36 +288,36 @@ protected: std::shared_ptr vocoder_predictor_ = nullptr; }; -template<> +template <> uint16_t Predictor::GetWavAudioFormat() { return Predictor::WAV_FORMAT_16BIT_PCM; } -template<> +template <> uint16_t Predictor::GetWavAudioFormat() { return Predictor::WAV_FORMAT_32BIT_FLOAT; } // 保存 16-bit PCM 格式 WAV -template<> +template <> void Predictor::SaveFloatWav(float *floatWav, int64_t size) { wav_.resize(size); float maxSample = 0.01; // 寻找最大采样值 - for (int64_t i=0; i maxSample) { maxSample = sample; } } // 把采样值缩放到 int_16 范围 - for (int64_t i=0; i +template <> void Predictor::SaveFloatWav(float *floatWav, int64_t size) { wav_.resize(size); std::copy_n(floatWav, size, wav_.data()); diff --git a/demos/TTSArmLinux/src/main.cc b/demos/TTSArmLinux/src/main.cc index f3bd0f7b0..0b8e26bc4 100644 --- a/demos/TTSArmLinux/src/main.cc +++ b/demos/TTSArmLinux/src/main.cc @@ -1,23 +1,48 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include #include #include +#include #include #include -#include -#include -#include -#include -#include #include "Predictor.hpp" using namespace paddle::lite_api; -DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized (Chinese only. English will crash the program.)"); +DEFINE_string( + sentence, + "你好,欢迎使用语音合成服务", + "Text to be synthesized (Chinese only. English will crash the program.)"); DEFINE_string(front_conf, "./front.conf", "Front configuration file"); -DEFINE_string(acoustic_model, "./models/cpu/fastspeech2_csmsc_arm.nb", "Acoustic model .nb file"); -DEFINE_string(vocoder, "./models/cpu/fastspeech2_csmsc_arm.nb", "vocoder .nb file"); +DEFINE_string(acoustic_model, + "./models/cpu/fastspeech2_csmsc_arm.nb", + "Acoustic model .nb file"); +DEFINE_string(vocoder, + "./models/cpu/fastspeech2_csmsc_arm.nb", + "vocoder .nb file"); DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file"); -DEFINE_string(wav_bit_depth, "16", "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)"); -DEFINE_string(wav_sample_rate, "24000", "WAV sample rate, should match the output of the vocoder"); +DEFINE_string(wav_bit_depth, + "16", + "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)"); +DEFINE_string(wav_sample_rate, + "24000", + "WAV sample rate, should match the output of the vocoder"); DEFINE_string(cpu_thread, "1", "CPU thread numbers"); int main(int argc, char *argv[]) { @@ -53,7 +78,7 @@ int main(int argc, char *argv[]) { // 繁体转简体 std::wstring sentence_simp; - front_inst->Trand2Simp(ws_sentence, sentence_simp); + front_inst->Trand2Simp(ws_sentence, &sentence_simp); ws_sentence = sentence_simp; std::string s_sentence; @@ -63,28 +88,30 @@ int main(int argc, char *argv[]) { // 根据标点进行分句 LOG(INFO) << "Start to segment sentences by punctuation"; - front_inst->SplitByPunc(ws_sentence, sentence_part); + front_inst->SplitByPunc(ws_sentence, &sentence_part); LOG(INFO) << "Segment sentences through punctuation successfully"; // 分句后获取音素id - LOG(INFO) << "Start to get the phoneme and tone id sequence of each sentence"; - for(int i = 0; i < sentence_part.size(); i++) { - - LOG(INFO) << "Raw sentence is: " << ppspeech::wstring2utf8string(sentence_part[i]); - front_inst->SentenceNormalize(sentence_part[i]); + LOG(INFO) + << "Start to get the phoneme and tone id sequence of each sentence"; + for (int i = 0; i < sentence_part.size(); i++) { + LOG(INFO) << "Raw sentence is: " + << ppspeech::wstring2utf8string(sentence_part[i]); + front_inst->SentenceNormalize(&sentence_part[i]); s_sentence = ppspeech::wstring2utf8string(sentence_part[i]); LOG(INFO) << "After normalization sentence is: " << s_sentence; - - if (0 != front_inst->GetSentenceIds(s_sentence, phoneids, toneids)) { + + if (0 != front_inst->GetSentenceIds(s_sentence, &phoneids, &toneids)) { LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed"; return -1; } - } - LOG(INFO) << "The phoneids of the sentence is: " << limonp::Join(phoneids.begin(), phoneids.end(), " "); - LOG(INFO) << "The toneids of the sentence is: " << limonp::Join(toneids.begin(), toneids.end(), " "); + LOG(INFO) << "The phoneids of the sentence is: " + << limonp::Join(phoneids.begin(), phoneids.end(), " "); + LOG(INFO) << "The toneids of the sentence is: " + << limonp::Join(toneids.begin(), toneids.end(), " "); LOG(INFO) << "Get the phoneme id sequence of each sentence successfully"; - + /////////////////////////// 后端:音素转音频 /////////////////////////// @@ -99,13 +126,19 @@ int main(int argc, char *argv[]) { // CPU电源模式 const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH; - if (!predictor->Init(FLAGS_acoustic_model, FLAGS_vocoder, cpuPowerMode, cpuThreadNum, wavSampleRate)) { + if (!predictor->Init(FLAGS_acoustic_model, + FLAGS_vocoder, + cpuPowerMode, + cpuThreadNum, + wavSampleRate)) { LOG(ERROR) << "predictor init failed" << std::endl; return -1; } std::vector phones(phoneids.size()); - std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { return static_cast(x); }); + std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { + return static_cast(x); + }); if (!predictor->RunModel(phones)) { LOG(ERROR) << "predictor run model failed" << std::endl; @@ -113,7 +146,8 @@ int main(int argc, char *argv[]) { } LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, " - << "WAV size (without header): " << predictor->GetWavSize() << " bytes, " + << "WAV size (without header): " << predictor->GetWavSize() + << " bytes, " << "WAV duration: " << predictor->GetWavDuration() << " ms, " << "RTF: " << predictor->GetRTF() << std::endl;