diff --git a/demos/TTSArmLinux/.gitignore b/demos/TTSArmLinux/.gitignore index 13135e376..f18480d7a 100644 --- a/demos/TTSArmLinux/.gitignore +++ b/demos/TTSArmLinux/.gitignore @@ -1,4 +1,8 @@ +# 目录 build/ output/ libs/ models/ + +# 符号连接 +dict diff --git a/demos/TTSArmLinux/README.md b/demos/TTSArmLinux/README.md index 32b85e0a4..1edd6589f 100644 --- a/demos/TTSArmLinux/README.md +++ b/demos/TTSArmLinux/README.md @@ -45,10 +45,14 @@ cd PaddleSpeech/demos/TTSArmLinux ``` ./run.sh +./run.sh --sentence "语音合成测试" +./run.sh --sentence "输出到指定的音频文件" --output_wav ./output/test.wav +./run.sh --help ``` -将把 [src/main.cpp](src/main.cpp) 里定义在 `sentencesToChoose` 数组中的十句话转换为 `wav` 文件,保存在 `output` 文件夹中。 +目前只支持中文合成,出现任何英文都会导致程序崩溃。 +如果未指定`--output_wav`,默认输出到`./output/tts.wav`。 ## 手动编译 Paddle Lite 库 diff --git a/demos/TTSArmLinux/build-depends.sh b/demos/TTSArmLinux/build-depends.sh new file mode 120000 index 000000000..fd3aec9c8 --- /dev/null +++ b/demos/TTSArmLinux/build-depends.sh @@ -0,0 +1 @@ +src/TTSCppFrontend/build-depends.sh \ No newline at end of file diff --git a/demos/TTSArmLinux/build.sh b/demos/TTSArmLinux/build.sh index c872e5749..5d31173ef 100755 --- a/demos/TTSArmLinux/build.sh +++ b/demos/TTSArmLinux/build.sh @@ -1,8 +1,11 @@ #!/bin/bash set -e +set -x cd "$(dirname "$(realpath "$0")")" +BASE_DIR="$PWD" + # load configure . ./config.sh @@ -10,11 +13,17 @@ cd "$(dirname "$(realpath "$0")")" echo "ARM_ABI is ${ARM_ABI}" echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}" -rm -rf build -mkdir -p build -cd build +echo "Build depends..." +./build-depends.sh "$@" +mkdir -p "$BASE_DIR/build" +cd "$BASE_DIR/build" cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src -make + +if [ "$*" = "" ]; then + make -j$(nproc) +else + make "$@" +fi echo "make successful!"
diff --git a/demos/TTSArmLinux/clean.sh b/demos/TTSArmLinux/clean.sh index 1ea365566..2743801c3 100755 --- a/demos/TTSArmLinux/clean.sh +++ b/demos/TTSArmLinux/clean.sh @@ -1,8 +1,11 @@ #!/bin/bash set -e +set -x cd "$(dirname "$(realpath "$0")")" +BASE_DIR="$PWD" + # load configure . ./config.sh @@ -12,3 +15,9 @@ set -x rm -rf "$OUTPUT_DIR" rm -rf "$LIBS_DIR" rm -rf "$MODELS_DIR" +rm -rf "$BASE_DIR/build" + +"$BASE_DIR/src/TTSCppFrontend/clean.sh" + +# Remove the dict symlink (-f: do not fail under set -e when it is already absent) +rm -f "$BASE_DIR/dict" diff --git a/demos/TTSArmLinux/config.sh b/demos/TTSArmLinux/config.sh index 19d53781a..bf38d7d6d 100644 --- a/demos/TTSArmLinux/config.sh +++ b/demos/TTSArmLinux/config.sh @@ -12,3 +12,4 @@ PADDLE_LITE_DIR="${LIBS_DIR}/inference_lite_lib.armlinux.${ARM_ABI}.gcc.with_ext ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb" VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb" +FRONT_CONF="${PWD}/front.conf" diff --git a/demos/TTSArmLinux/download.sh b/demos/TTSArmLinux/download.sh index 560374bc9..7eaa836a5 100755 --- a/demos/TTSArmLinux/download.sh +++ b/demos/TTSArmLinux/download.sh @@ -3,6 +3,8 @@ set -e cd "$(dirname "$(realpath "$0")")" +BASE_DIR="$PWD" + # load configure . ./config.sh @@ -38,6 +40,10 @@ download() { echo '=======================' } +######################################## + +echo "Download models..." + download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ 'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ '39e0c6604f97c70f5d13c573d7e709b9' \ @@ -54,3 +60,11 @@ download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \ "$MODELS_DIR" echo "Done." + +######################################## + +echo "Download dictionary files..."
+ +ln -sf src/TTSCppFrontend/front_demo/dict "$BASE_DIR/" + +"$BASE_DIR/src/TTSCppFrontend/download.sh" diff --git a/demos/TTSArmLinux/front.conf b/demos/TTSArmLinux/front.conf new file mode 100644 index 000000000..04bd2d97f --- /dev/null +++ b/demos/TTSArmLinux/front.conf @@ -0,0 +1,21 @@ +# jieba conf +--jieba_dict_path=./dict/jieba/jieba.dict.utf8 +--jieba_hmm_path=./dict/jieba/hmm_model.utf8 +--jieba_user_dict_path=./dict/jieba/user.dict.utf8 +--jieba_idf_path=./dict/jieba/idf.utf8 +--jieba_stop_word_path=./dict/jieba/stop_words.utf8 + +# dict conf fastspeech2_0.4 +--seperate_tone=false +--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict +--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict + +# dict conf speedyspeech_0.5 +#--seperate_tone=true +#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict +#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt +#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt + +# dict of tranditional_to_simplified +--trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt diff --git a/demos/TTSArmLinux/run.sh b/demos/TTSArmLinux/run.sh index 2adcc1b56..d0860f044 100755 --- a/demos/TTSArmLinux/run.sh +++ b/demos/TTSArmLinux/run.sh @@ -7,12 +7,13 @@ cd "$(dirname "$(realpath "$0")")" .
./config.sh # create dir -rm -rf "$OUTPUT_DIR" mkdir -p "$OUTPUT_DIR" # run -for i in {1..10}; do - (set -x; ./build/paddlespeech_tts_demo "$ACOUSTIC_MODEL_PATH" "$VOCODER_PATH" $i "$OUTPUT_DIR/$i.wav") -done - -ls -lh "$OUTPUT_DIR"/*.wav +set -x +./build/paddlespeech_tts_demo \ + --front_conf "$FRONT_CONF" \ + --acoustic_model "$ACOUSTIC_MODEL_PATH" \ + --vocoder "$VOCODER_PATH" \ + "$@" +# end diff --git a/demos/TTSArmLinux/src/CMakeLists.txt b/demos/TTSArmLinux/src/CMakeLists.txt index e1076af92..15a74a0a8 100644 --- a/demos/TTSArmLinux/src/CMakeLists.txt +++ b/demos/TTSArmLinux/src/CMakeLists.txt @@ -1,4 +1,18 @@ cmake_minimum_required(VERSION 3.10) +project(paddlespeech_tts_demo) + + +########## Global Options ########## + +option(WITH_FRONT_DEMO "Build front demo" OFF) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(ABSL_PROPAGATE_CXX_STD ON) + + +########## ARM Options ########## + set(CMAKE_SYSTEM_NAME Linux) if(ARM_ABI STREQUAL "armv8") set(CMAKE_SYSTEM_PROCESSOR aarch64) @@ -13,7 +27,9 @@ else() return() endif() -project(paddlespeech_tts_demo) + +########## Paddle Lite Options ########## + message(STATUS "TARGET ARCH ABI: ${ARM_ABI}") message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}") @@ -29,6 +45,9 @@ elseif(ARM_ABI STREQUAL "armv7hf") set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) endif() + +########## Dependencies ########## + find_package(OpenMP REQUIRED) if(OpenMP_FOUND OR OpenMP_CXX_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") @@ -43,5 +62,19 @@ else() return() endif() + +############### tts cpp frontend ############### + +add_subdirectory(TTSCppFrontend) + +include_directories( + TTSCppFrontend/src + third-party/build/src/cppjieba/include + third-party/build/src/limonp/include +) + + +############### paddlespeech_tts_demo ############### + add_executable(paddlespeech_tts_demo main.cc) -target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared) 
+target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared paddlespeech_tts_front) diff --git a/demos/TTSArmLinux/src/Predictor.hpp b/demos/TTSArmLinux/src/Predictor.hpp index 0878c9d65..a1eaad990 100644 --- a/demos/TTSArmLinux/src/Predictor.hpp +++ b/demos/TTSArmLinux/src/Predictor.hpp @@ -9,42 +9,45 @@ using namespace paddle::lite_api; +class PredictorInterface { +public: + // Virtual destructor: main.cc deletes the concrete Predictor through a PredictorInterface pointer + virtual ~PredictorInterface() = default; + virtual bool Init( + const std::string &AcousticModelPath, + const std::string &VocoderPath, + PowerMode cpuPowerMode, + int cpuThreadNum, + // WAV sample rate (must match the model output) + // If playback speed or pitch sounds wrong, change the sample rate + // Common sample rates: 16000, 24000, 32000, 44100, 48000, 96000 + uint32_t wavSampleRate + ) = 0; + virtual std::shared_ptr LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) = 0; + virtual void ReleaseModel() = 0; + virtual bool RunModel(const std::vector &phones) = 0; + virtual std::unique_ptr GetAcousticModelOutput(const std::vector &phones) = 0; + virtual std::unique_ptr GetVocoderOutput(std::unique_ptr &&amOutput) = 0; + virtual void VocoderOutputToWav(std::unique_ptr &&vocOutput) = 0; + virtual void SaveFloatWav(float *floatWav, int64_t size) = 0; + virtual bool IsLoaded() = 0; + virtual float GetInferenceTime() = 0; + virtual int GetWavSize() = 0; + // Get the WAV duration (unit: milliseconds) + virtual float GetWavDuration() = 0; + // Get the RTF (synthesis time / audio duration) + virtual float GetRTF() = 0; + virtual void ReleaseWav() = 0; + virtual bool WriteWavToFile(const std::string &wavPath) = 0; +}; + // WavDataType: WAV数据类型 // 可在 int16_t 和 float 之间切换, // 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV template -class Predictor { +class Predictor : public PredictorInterface { public: - struct WavHeader { - // RIFF 头 - char riff[4] = {'R', 'I', 'F', 'F'}; - uint32_t size = 0; - char wave[4] = {'W', 'A', 'V', 'E'}; - - // FMT 头 - char fmt[4] = {'f', 'm', 't', ' '}; - uint32_t fmt_size = 16; - uint16_t audio_format = 0; - uint16_t num_channels = 1; - uint32_t sample_rate = 0; - uint32_t byte_rate = 0; - uint16_t block_align
= 0; - uint16_t bits_per_sample = sizeof(WavDataType) * 8; - - // DATA 头 - char data[4] = {'d', 'a', 't', 'a'}; - uint32_t data_size = 0; - }; - - enum WavAudioFormat { - WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式 - WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float 格式 - }; - - // 返回值通过模板特化由 WavDataType 决定 - inline uint16_t GetWavAudioFormat(); - - bool Init( + virtual bool Init( const std::string &AcousticModelPath, const std::string &VocoderPath, PowerMode cpuPowerMode, @@ -53,7 +56,7 @@ public: // 如果播放速度和音调异常,请修改采样率 // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 uint32_t wavSampleRate - ) { + ) override { // Release model if exists ReleaseModel(); @@ -71,12 +74,12 @@ public: return true; } - ~Predictor() { + virtual ~Predictor() { ReleaseModel(); ReleaseWav(); } - std::shared_ptr LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) { + virtual std::shared_ptr LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) override { if (modelPath.empty()) { return nullptr; } @@ -90,12 +93,12 @@ public: return CreatePaddlePredictor(config); } - void ReleaseModel() { + virtual void ReleaseModel() override { acoustic_model_predictor_ = nullptr; vocoder_predictor_ = nullptr; } - bool RunModel(const std::vector &phones) { + virtual bool RunModel(const std::vector &phones) override { if (!IsLoaded()) { return false; } @@ -116,7 +119,7 @@ public: return true; } - std::unique_ptr GetAcousticModelOutput(const std::vector &phones) { + virtual std::unique_ptr GetAcousticModelOutput(const std::vector &phones) override { auto phones_handle = acoustic_model_predictor_->GetInput(0); phones_handle->Resize({static_cast(phones.size())}); phones_handle->CopyFromCpu(phones.data()); @@ -135,7 +138,7 @@ public: return am_output_handle; } - std::unique_ptr GetVocoderOutput(std::unique_ptr &&amOutput) { + virtual std::unique_ptr GetVocoderOutput(std::unique_ptr &&amOutput) override { auto mel_handle = vocoder_predictor_->GetInput(0); //
[?, 80] auto dims = amOutput->shape(); @@ -157,7 +160,7 @@ public: return voc_output_handle; } - void VocoderOutputToWav(std::unique_ptr &&vocOutput) { + virtual void VocoderOutputToWav(std::unique_ptr &&vocOutput) override { // 获取输出Tensor的数据 int64_t output_size = 1; for (auto dim : vocOutput->shape()) { @@ -168,17 +171,13 @@ public: SaveFloatWav(output_data, output_size); } - inline float Abs(float number) { - return (number < 0) ? -number : number; - } - - void SaveFloatWav(float *floatWav, int64_t size); + virtual void SaveFloatWav(float *floatWav, int64_t size) override; - bool IsLoaded() { + virtual bool IsLoaded() override { return acoustic_model_predictor_ != nullptr && vocoder_predictor_ != nullptr; } - float GetInferenceTime() { + virtual float GetInferenceTime() override { return inference_time_; } @@ -186,25 +185,25 @@ public: return wav_; } - int GetWavSize() { + virtual int GetWavSize() override { return wav_.size() * sizeof(WavDataType); } // 获取WAV持续时间(单位:毫秒) - float GetWavDuration() { + virtual float GetWavDuration() override { return static_cast(GetWavSize()) / sizeof(WavDataType) / static_cast(wav_sample_rate_) * 1000; } // 获取RTF(合成时间 / 音频时长) - float GetRTF() { + virtual float GetRTF() override { return GetInferenceTime() / GetWavDuration(); } - void ReleaseWav() { + virtual void ReleaseWav() override { wav_.clear(); } - bool WriteWavToFile(const std::string &wavPath) { + virtual bool WriteWavToFile(const std::string &wavPath) override { std::ofstream fout(wavPath, std::ios::binary); if (!fout.is_open()) { return false; @@ -227,7 +226,42 @@ public: return true; } -private: +protected: + struct WavHeader { + // RIFF 头 + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t size = 0; + char wave[4] = {'W', 'A', 'V', 'E'}; + + // FMT 头 + char fmt[4] = {'f', 'm', 't', ' '}; + uint32_t fmt_size = 16; + uint16_t audio_format = 0; + uint16_t num_channels = 1; + uint32_t sample_rate = 0; + uint32_t byte_rate = 0; + uint16_t block_align = 0; + uint16_t
bits_per_sample = sizeof(WavDataType) * 8; + + // DATA 头 + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size = 0; + }; + + enum WavAudioFormat { + WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM 格式 + WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float 格式 + }; + +protected: + // 返回值通过模板特化由 WavDataType 决定 + inline uint16_t GetWavAudioFormat(); + + inline float Abs(float number) { + return (number < 0) ? -number : number; + } + +protected: float inference_time_ = 0; uint32_t wav_sample_rate_ = 0; std::vector wav_; diff --git a/demos/TTSArmLinux/src/TTSCppFrontend b/demos/TTSArmLinux/src/TTSCppFrontend new file mode 120000 index 000000000..25953976d --- /dev/null +++ b/demos/TTSArmLinux/src/TTSCppFrontend @@ -0,0 +1 @@ +../../TTSCppFrontend/ \ No newline at end of file diff --git a/demos/TTSArmLinux/src/main.cc b/demos/TTSArmLinux/src/main.cc index 2285b28b3..5215b3bf5 100644 --- a/demos/TTSArmLinux/src/main.cc +++ b/demos/TTSArmLinux/src/main.cc @@ -1,90 +1,128 @@ #include #include #include -#include "paddle_api.h" +#include +#include +#include +#include +#include +#include #include "Predictor.hpp" using namespace paddle::lite_api; -std::vector> sentencesToChoose = { - // 009901 昨日,这名“伤者”与医生全部被警方依法刑事拘留。 - {261, 231, 175, 116, 179, 262, 44, 154, 126, 177, 19, 262, 42, 241, 72, 177, 56, 174, 245, 37, 186, 37, 49, 151, 127, 69, 19, 179, 72, 69, 4, 260, 126, 177, 116, 151, 239, 153, 141}, - // 009902 钱伟长想到上海来办学校是经过深思熟虑的。 - {174, 83, 213, 39, 20, 260, 89, 40, 30, 177, 22, 71, 9, 153, 8, 37, 17, 260, 251, 260, 99, 179, 177, 116, 151, 125, 70, 233, 177, 51, 176, 108, 177, 184, 153, 242, 40, 45}, - // 009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 - {182, 2, 151, 85, 232, 73, 151, 123, 154, 52, 151, 143, 154, 5, 179, 39, 113, 69, 17, 177, 114, 105, 154, 5, 179, 154, 5, 40, 45, 232, 182, 8, 37, 186, 174, 74, 182, 168}, - // 009904 李述德在离开之前,只说了一句“柱驼杀父亲了”。 - {153, 74, 177, 186, 40, 42, 261, 10, 153, 73, 152, 7, 262, 113, 174, 83, 179, 262, 115, 177, 230, 153, 45, 73, 151, 242, 180, 262, 186, 182, 
231, 177, 2, 69, 186, 174, 124, 153, 45}, - // 009905 这种车票和保险单捆绑出售属于重复性购买。 - {262, 44, 262, 163, 39, 41, 173, 99, 71, 42, 37, 28, 260, 84, 40, 14, 179, 152, 220, 37, 21, 39, 183, 177, 170, 179, 177, 185, 240, 39, 162, 69, 186, 260, 128, 70, 170, 154, 9}, - // 009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 - {40, 10, 173, 49, 155, 72, 40, 45, 155, 15, 142, 260, 72, 154, 74, 153, 186, 179, 151, 103, 39, 22, 174, 126, 70, 41, 179, 175, 22, 182, 2, 69, 46, 39, 20, 152, 7, 260, 120}, - // 009907 观大势、谋大局、出大策始终是该院的办院方针。 - {70, 199, 40, 5, 177, 116, 154, 168, 40, 5, 151, 240, 179, 39, 183, 40, 5, 38, 44, 179, 177, 115, 262, 161, 177, 116, 70, 7, 247, 40, 45, 37, 17, 247, 69, 19, 262, 51}, - // 009908 他们骑着摩托回家,正好为农忙时的父母帮忙。 - {182, 2, 154, 55, 174, 73, 262, 45, 154, 157, 182, 230, 71, 212, 151, 77, 180, 262, 59, 71, 29, 214, 155, 162, 154, 20, 177, 114, 40, 45, 69, 186, 154, 185, 37, 19, 154, 20}, - // 009909 但是因为还没到退休年龄,只能掰着指头捱日子。 - {40, 17, 177, 116, 120, 214, 71, 8, 154, 47, 40, 30, 182, 214, 260, 140, 155, 83, 153, 126, 180, 262, 115, 155, 57, 37, 7, 262, 45, 262, 115, 182, 171, 8, 175, 116, 261, 112}, - // 009910 这几天雨水不断,人们恨不得待在家里不出门。 - {262, 44, 151, 74, 182, 82, 240, 177, 213, 37, 184, 40, 202, 180, 175, 52, 154, 55, 71, 54, 37, 186, 40, 42, 40, 7, 261, 10, 151, 77, 153, 74, 37, 186, 39, 183, 154, 52}, -}; - -void usage(const char *binName) { - std::cerr << "Usage:" << std::endl - << "\t" << binName << " " << std::endl; -} +DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized (Chinese only. 
English will crash the program.)"); +DEFINE_string(front_conf, "./front.conf", "Front configuration file"); +DEFINE_string(acoustic_model, "./models/cpu/fastspeech2_csmsc_arm.nb", "Acoustic model .nb file"); +DEFINE_string(vocoder, "./models/cpu/mb_melgan_csmsc_arm.nb", "vocoder .nb file"); +DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file"); +DEFINE_string(wav_bit_depth, "16", "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)"); +DEFINE_string(wav_sample_rate, "24000", "WAV sample rate, should match the output of the vocoder"); +DEFINE_string(cpu_thread, "1", "CPU thread numbers"); int main(int argc, char *argv[]) { - if (argc < 5) { - usage(argv[0]); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + PredictorInterface *predictor; + + if (FLAGS_wav_bit_depth == "16") { + predictor = new Predictor(); + } else if (FLAGS_wav_bit_depth == "32") { + predictor = new Predictor(); + } else { + LOG(ERROR) << "Unsupported WAV bit depth: " << FLAGS_wav_bit_depth; return -1; } - const char *AMModelPath = argv[1]; - const char *VOCModelPath = argv[2]; - int sentencesIndex = atoi(argv[3]) - 1; - const char *outputWavPath = argv[4]; - if (sentencesIndex < 0 || sentencesIndex >= sentencesToChoose.size()) { - std::cerr << "sentences-index out of range" << std::endl; + + /////////////////////////// 前端:文本转音素 /////////////////////////// + + // 实例化文本前端引擎 + speechnn::FrontEngineInterface *front_inst = nullptr; + front_inst = new speechnn::FrontEngineInterface(FLAGS_front_conf); + if ((!front_inst) || (front_inst->init())) { + LOG(ERROR) << "Creater tts engine failed!"; + if (front_inst != nullptr) { + delete front_inst; + } + front_inst = nullptr; return -1; } - // 模板参数:WAV数据类型 - // 可在 int16_t 和 float 之间切换, - // 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV - Predictor predictor; - //Predictor predictor; + std::wstring ws_sentence = speechnn::utf8string2wstring(FLAGS_sentence); + + // 繁体转简体 + std::wstring sentence_simp; + front_inst->Trand2Simp(ws_sentence,
sentence_simp); + ws_sentence = sentence_simp; + + std::string s_sentence; + std::vector sentence_part; + std::vector phoneids = {}; + std::vector toneids = {}; + + // 根据标点进行分句 + LOG(INFO) << "Start to segment sentences by punctuation"; + front_inst->SplitByPunc(ws_sentence, sentence_part); + LOG(INFO) << "Segment sentences through punctuation successfully"; + + // 分句后获取音素id + LOG(INFO) << "Start to get the phoneme and tone id sequence of each sentence"; + for(int i = 0; i < sentence_part.size(); i++) { + + LOG(INFO) << "Raw sentence is: " << speechnn::wstring2utf8string(sentence_part[i]); + front_inst->SentenceNormalize(sentence_part[i]); + s_sentence = speechnn::wstring2utf8string(sentence_part[i]); + LOG(INFO) << "After normalization sentence is: " << s_sentence; + + if (0 != front_inst->GetSentenceIds(s_sentence, phoneids, toneids)) { + LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed"; + return -1; + } + + } + LOG(INFO) << "The phoneids of the sentence is: " << limonp::Join(phoneids.begin(), phoneids.end(), " "); + LOG(INFO) << "The toneids of the sentence is: " << limonp::Join(toneids.begin(), toneids.end(), " "); + LOG(INFO) << "Get the phoneme id sequence of each sentence successfully"; + + + /////////////////////////// 后端:音素转音频 /////////////////////////// // WAV采样率(必须与模型输出匹配) // 如果播放速度和音调异常,请修改采样率 // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 - const uint32_t wavSampleRate = 24000; + const uint32_t wavSampleRate = std::stoul(FLAGS_wav_sample_rate); // CPU线程数 - const int cpuThreadNum = 1; + const int cpuThreadNum = std::stol(FLAGS_cpu_thread); // CPU电源模式 const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH; - if (!predictor.Init(AMModelPath, VOCModelPath, cpuPowerMode, cpuThreadNum, wavSampleRate)) { - std::cerr << "predictor init failed" << std::endl; + if (!predictor->Init(FLAGS_acoustic_model, FLAGS_vocoder, cpuPowerMode, cpuThreadNum, wavSampleRate)) { + LOG(ERROR) << "predictor init failed" << std::endl; return -1; } - if 
(!predictor.RunModel(sentencesToChoose[sentencesIndex])) { - std::cerr << "predictor run model failed" << std::endl; + std::vector phones(phoneids.size()); + std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { return static_cast(x); }); + + if (!predictor->RunModel(phones)) { + LOG(ERROR) << "predictor run model failed" << std::endl; return -1; } - std::cout << "Inference time: " << predictor.GetInferenceTime() << " ms, " - << "WAV size (without header): " << predictor.GetWavSize() << " bytes, " - << "WAV duration: " << predictor.GetWavDuration() << " ms, " - << "RTF: " << predictor.GetRTF() << std::endl; + LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, " + << "WAV size (without header): " << predictor->GetWavSize() << " bytes, " + << "WAV duration: " << predictor->GetWavDuration() << " ms, " + << "RTF: " << predictor->GetRTF() << std::endl; - if (!predictor.WriteWavToFile(outputWavPath)) { - std::cerr << "write wav file failed" << std::endl; + if (!predictor->WriteWavToFile(FLAGS_output_wav)) { + LOG(ERROR) << "write wav file failed" << std::endl; return -1; } + delete predictor; + return 0; } diff --git a/demos/TTSArmLinux/src/third-party b/demos/TTSArmLinux/src/third-party new file mode 120000 index 000000000..851b2c1ec --- /dev/null +++ b/demos/TTSArmLinux/src/third-party @@ -0,0 +1 @@ +TTSCppFrontend/third-party \ No newline at end of file