demos/TTSArmLinux: Use TTSCppFrontend as the front end to support the synthesis of any Chinese sentences

3 years ago · aaca18216e
parent bc739c7b78
commit aaca18216e
14 changed files with 287 additions and 118 deletions
--- a/demos/TTSArmLinux/.gitignore
+++ b/demos/TTSArmLinux/.gitignore
@ -1,4 +1,8 @@
 # 目录
 build/
 output/
 libs/
 models/
 # 符号连接
 dict
--- a/demos/TTSArmLinux/README.md
+++ b/demos/TTSArmLinux/README.md
@ -45,10 +45,14 @@ cd PaddleSpeech/demos/TTSArmLinux
 ```
 ./run.sh
 ./run.sh --sentence "语音合成测试"
 ./run.sh --sentence "输出到指定的音频文件" --output_wav ./output/test.wav
 ./run.sh --help
 ```
-将把 [src/main.cpp](src/main.cpp) 里定义在 `sentencesToChoose` 数组中的十句话转换为 `wav` 文件，保存在 `output` 文件夹中。
+目前只支持中文合成，出现任何英文都会导致程序崩溃。
 如果未指定`--output_wav`，默认输出到`./output/tts.wav`。
 ## 手动编译 Paddle Lite 库
--- a/demos/TTSArmLinux/build-depends.sh
+++ b/demos/TTSArmLinux/build-depends.sh
@ -0,0 +1 @@
 src/TTSCppFrontend/build-depends.sh
--- a/demos/TTSArmLinux/build.sh
+++ b/demos/TTSArmLinux/build.sh
@ -1,8 +1,11 @@
 #!/bin/bash
 set -e
 set -x
 cd "$(dirname "$(realpath "$0")")"
 BASE_DIR="$PWD"
 # load configure
 . ./config.sh
@ -10,11 +13,17 @@ cd "$(dirname "$(realpath "$0")")"
 echo "ARM_ABI is ${ARM_ABI}"
 echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}"
-rm -rf build
+echo "Build depends..."
-mkdir -p build
+./build-depends.sh "$@"
 cd build
 mkdir -p "$BASE_DIR/build"
 cd "$BASE_DIR/build"
 cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src
-make
+
 if [ "$*" = "" ]; then
    make -j$(nproc)
 else
    make "$@"
 fi
 echo "make successful!"
--- a/demos/TTSArmLinux/clean.sh
+++ b/demos/TTSArmLinux/clean.sh
@ -1,8 +1,11 @@
 #!/bin/bash
 set -e
 set -x
 cd "$(dirname "$(realpath "$0")")"
 BASE_DIR="$PWD"
 # load configure
 . ./config.sh
@ -12,3 +15,9 @@ set -x
 rm -rf "$OUTPUT_DIR"
 rm -rf "$LIBS_DIR"
 rm -rf "$MODELS_DIR"
 rm -rf "$BASE_DIR/build"
 "$BASE_DIR/src/TTSCppFrontend/clean.sh"
 # Symbolic link to the dictionary directory.
 # Use -f so a missing link (e.g. clean.sh run twice, or before download.sh)
 # does not abort the script under `set -e`.
 rm -f "$BASE_DIR/dict"
--- a/demos/TTSArmLinux/config.sh
+++ b/demos/TTSArmLinux/config.sh
@ -12,3 +12,4 @@ PADDLE_LITE_DIR="${LIBS_DIR}/inference_lite_lib.armlinux.${ARM_ABI}.gcc.with_ext
 ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb"
 VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb"
 FRONT_CONF="${PWD}/front.conf"
--- a/demos/TTSArmLinux/download.sh
+++ b/demos/TTSArmLinux/download.sh
@ -3,6 +3,8 @@ set -e
 cd "$(dirname "$(realpath "$0")")"
 BASE_DIR="$PWD"
 # load configure
 . ./config.sh
@ -38,6 +40,10 @@ download() {
    echo '======================='
 }
 ########################################
 echo "Download models..."
 download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
    'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
    '39e0c6604f97c70f5d13c573d7e709b9' \
@ -54,3 +60,11 @@ download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \
    "$MODELS_DIR"
 echo "Done."
 ########################################
 echo "Download dictionary files..."
 # Create (or refresh) the `dict` link in the demo root.
 # -f replaces an existing link and -n avoids descending into the directory it
 # points to, so re-running this script does not fail under `set -e`.
 ln -sfn src/TTSCppFrontend/front_demo/dict "$BASE_DIR/dict"
 "$BASE_DIR/src/TTSCppFrontend/download.sh"
--- a/demos/TTSArmLinux/front.conf
+++ b/demos/TTSArmLinux/front.conf
@ -0,0 +1,21 @@
 # Configuration for the text front end; run.sh passes this file to the demo
 # via --front_conf. All paths below go through the ./dict entry.
 # jieba conf
 --jieba_dict_path=./dict/jieba/jieba.dict.utf8
 --jieba_hmm_path=./dict/jieba/hmm_model.utf8
 --jieba_user_dict_path=./dict/jieba/user.dict.utf8
 --jieba_idf_path=./dict/jieba/idf.utf8
 --jieba_stop_word_path=./dict/jieba/stop_words.utf8
 # dict conf fastspeech2_0.4 (active)
 --seperate_tone=false
 --word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
 --phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
 # NOTE(review): tone2id_path points at the word2phone dictionary, not a
 # tone_id_map file; presumably ignored while seperate_tone=false - confirm.
 --tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
 # dict conf speedyspeech_0.5 (alternative, commented out)
 #--seperate_tone=true
 #--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
 #--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
 #--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
 # dictionary for traditional-to-simplified Chinese conversion
 --trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt
--- a/demos/TTSArmLinux/run.sh
+++ b/demos/TTSArmLinux/run.sh
@ -7,12 +7,13 @@ cd "$(dirname "$(realpath "$0")")"
 . ./config.sh
 # create dir
 rm -rf "$OUTPUT_DIR"
 mkdir -p "$OUTPUT_DIR"
 # run
-for i in {1..10}; do
+set -x
-    (set -x; ./build/paddlespeech_tts_demo "$ACOUSTIC_MODEL_PATH" "$VOCODER_PATH" $i "$OUTPUT_DIR/$i.wav")
+./build/paddlespeech_tts_demo \
-done
+    --front_conf "$FRONT_CONF" \
-
+    --acoustic_model "$ACOUSTIC_MODEL_PATH" \
-ls -lh "$OUTPUT_DIR"/*.wav
+    --vocoder "$VOCODER_PATH" \
    "$@"
 # end
--- a/demos/TTSArmLinux/src/CMakeLists.txt
+++ b/demos/TTSArmLinux/src/CMakeLists.txt
@ -1,4 +1,18 @@
 cmake_minimum_required(VERSION 3.10)
 project(paddlespeech_tts_demo)
 ########## Global Options ##########
 option(WITH_FRONT_DEMO "Build front demo" OFF)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(ABSL_PROPAGATE_CXX_STD ON)
 ########## ARM Options ##########
 set(CMAKE_SYSTEM_NAME Linux)
 if(ARM_ABI STREQUAL "armv8")
    set(CMAKE_SYSTEM_PROCESSOR aarch64)
@ -13,7 +27,9 @@ else()
    return()
 endif()
-project(paddlespeech_tts_demo)
+
 ########## Paddle Lite Options ##########
 message(STATUS "TARGET ARCH ABI: ${ARM_ABI}")
 message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}")
@ -29,6 +45,9 @@ elseif(ARM_ABI STREQUAL "armv7hf")
    set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" )
 endif()
 ########## Dependencies ##########
 find_package(OpenMP REQUIRED)
 if(OpenMP_FOUND OR OpenMP_CXX_FOUND)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
@ -43,5 +62,19 @@ else()
    return()
 endif()
 ############### tts cpp frontend ###############
 add_subdirectory(TTSCppFrontend)
 include_directories(
    TTSCppFrontend/src
    third-party/build/src/cppjieba/include
    third-party/build/src/limonp/include
 )
 ############### paddlespeech_tts_demo ###############
 add_executable(paddlespeech_tts_demo main.cc)
-target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared)
+# Link Paddle Lite and the C++ text front end; PRIVATE because an executable
+# has no consumers that would inherit these dependencies.
+target_link_libraries(paddlespeech_tts_demo PRIVATE paddle_light_api_shared paddlespeech_tts_front)
--- a/demos/TTSArmLinux/src/Predictor.hpp
+++ b/demos/TTSArmLinux/src/Predictor.hpp
@ -9,42 +9,43 @@
 using namespace paddle::lite_api;
 // Abstract interface implemented by Predictor<WavDataType>, so callers can
 // hold a predictor through a single pointer type regardless of the WAV
 // sample type chosen at run time.
 class PredictorInterface {
 public:
    // Virtual destructor: instances are deleted through a PredictorInterface*
    // (see main.cc), which is undefined behaviour without it.
    virtual ~PredictorInterface() = default;
    virtual bool Init(
            const std::string &AcousticModelPath,
            const std::string &VocoderPath,
            PowerMode cpuPowerMode,
            int cpuThreadNum,
            // WAV sample rate (must match the model output).
            // If playback speed or pitch sounds wrong, change the sample rate.
            // Common sample rates: 16000, 24000, 32000, 44100, 48000, 96000
            uint32_t wavSampleRate
    ) = 0;
    virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) = 0;
    virtual void ReleaseModel() = 0;
    virtual bool RunModel(const std::vector<int64_t> &phones) = 0;
    virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) = 0;
    virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) = 0;
    virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) = 0;
    virtual void SaveFloatWav(float *floatWav, int64_t size) = 0;
    virtual bool IsLoaded() = 0;
    virtual float GetInferenceTime() = 0;
    virtual int GetWavSize() = 0;
    // Returns the WAV duration in milliseconds.
    virtual float GetWavDuration() = 0;
    // Returns the RTF (synthesis time / audio duration).
    virtual float GetRTF() = 0;
    virtual void ReleaseWav() = 0;
    virtual bool WriteWavToFile(const std::string &wavPath) = 0;
 };
 // WavDataType: WAV数据类型
 // 可在 int16_t 和 float 之间切换，
 // 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV
 template<typename WavDataType>
-class Predictor {
+class Predictor : public PredictorInterface {
 public:
-    struct WavHeader {
+    virtual bool Init(
        // RIFF 头
        char riff[4] = {'R', 'I', 'F', 'F'};
        uint32_t size = 0;
        char wave[4] = {'W', 'A', 'V', 'E'};
        // FMT 头
        char fmt[4] = {'f', 'm', 't', ' '};
        uint32_t fmt_size = 16;
        uint16_t audio_format = 0;
        uint16_t num_channels = 1;
        uint32_t sample_rate = 0;
        uint32_t byte_rate = 0;
        uint16_t block_align = 0;
        uint16_t bits_per_sample = sizeof(WavDataType) * 8;
        // DATA 头
        char data[4] = {'d', 'a', 't', 'a'};
        uint32_t data_size = 0;
    };
    enum WavAudioFormat {
        WAV_FORMAT_16BIT_PCM   = 1, // 16-bit PCM 格式
        WAV_FORMAT_32BIT_FLOAT = 3  // 32-bit IEEE float 格式
    };
    // 返回值通过模板特化由 WavDataType 决定
    inline uint16_t GetWavAudioFormat();
    bool Init(
            const std::string &AcousticModelPath,
            const std::string &VocoderPath,
            PowerMode cpuPowerMode,
@ -53,7 +54,7 @@ public:
            // 如果播放速度和音调异常，请修改采样率
            // 常见采样率：16000, 24000, 32000, 44100, 48000, 96000
            uint32_t wavSampleRate
-    ) {
+    ) override {
        // Release model if exists
        ReleaseModel();
@ -71,12 +72,12 @@ public:
        return true;
    }
-    ~Predictor() {
+    virtual ~Predictor() {
        ReleaseModel();
        ReleaseWav();
    }
-    std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) {
+    virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) override {
        if (modelPath.empty()) {
            return nullptr;
        }
@ -90,12 +91,12 @@ public:
        return CreatePaddlePredictor<MobileConfig>(config);
    }
-    void ReleaseModel() {
+    virtual void ReleaseModel() override {
        acoustic_model_predictor_ = nullptr;
        vocoder_predictor_ = nullptr;
    }
-    bool RunModel(const std::vector<int64_t> &phones) {
+    virtual bool RunModel(const std::vector<int64_t> &phones) override {
        if (!IsLoaded()) {
            return false;
        }
@ -116,7 +117,7 @@ public:
        return true;
    }
-    std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) {
+    virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) override {
        auto phones_handle = acoustic_model_predictor_->GetInput(0);
        phones_handle->Resize({static_cast<int64_t>(phones.size())});
        phones_handle->CopyFromCpu(phones.data());
@ -135,7 +136,7 @@ public:
        return am_output_handle;
    }
-    std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) {
+    virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) override {
        auto mel_handle = vocoder_predictor_->GetInput(0);
        // [?, 80]
        auto dims = amOutput->shape();
@ -157,7 +158,7 @@ public:
        return voc_output_handle;
    }
-    void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) {
+    virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) override {
        // 获取输出Tensor的数据
        int64_t output_size = 1;
        for (auto dim : vocOutput->shape()) {
@ -168,17 +169,13 @@ public:
        SaveFloatWav(output_data, output_size);
    }
-    inline float Abs(float number) {
+    virtual void SaveFloatWav(float *floatWav, int64_t size) override;
        return (number < 0) ? -number : number;
    }
    void SaveFloatWav(float *floatWav, int64_t size);
-    bool IsLoaded() {
+    virtual bool IsLoaded() override {
        return acoustic_model_predictor_ != nullptr && vocoder_predictor_ != nullptr;
    }
-    float GetInferenceTime() {
+    virtual float GetInferenceTime() override {
        return inference_time_;
    }
@ -186,25 +183,25 @@ public:
        return wav_;
    }
-    int GetWavSize() {
+    virtual int GetWavSize() override {
        return wav_.size() * sizeof(WavDataType);
    }
    // 获取WAV持续时间（单位：毫秒）
-    float GetWavDuration() {
+    virtual float GetWavDuration() override {
        return static_cast<float>(GetWavSize()) / sizeof(WavDataType) / static_cast<float>(wav_sample_rate_) * 1000;
    }
    // 获取RTF（合成时间 / 音频时长）
-    float GetRTF() {
+    virtual float GetRTF() override {
        return GetInferenceTime() / GetWavDuration();
    }
-    void ReleaseWav() {
+    virtual void ReleaseWav() override {
        wav_.clear();
    }
-    bool WriteWavToFile(const std::string &wavPath) {
+    virtual bool WriteWavToFile(const std::string &wavPath) override {
        std::ofstream fout(wavPath, std::ios::binary);
        if (!fout.is_open()) {
            return false;
@ -227,7 +224,42 @@ public:
        return true;
    }
-private:
+protected:
    struct WavHeader {
        // RIFF 头
        char riff[4] = {'R', 'I', 'F', 'F'};
        uint32_t size = 0;
        char wave[4] = {'W', 'A', 'V', 'E'};
        // FMT 头
        char fmt[4] = {'f', 'm', 't', ' '};
        uint32_t fmt_size = 16;
        uint16_t audio_format = 0;
        uint16_t num_channels = 1;
        uint32_t sample_rate = 0;
        uint32_t byte_rate = 0;
        uint16_t block_align = 0;
        uint16_t bits_per_sample = sizeof(WavDataType) * 8;
        // DATA 头
        char data[4] = {'d', 'a', 't', 'a'};
        uint32_t data_size = 0;
    };
    enum WavAudioFormat {
        WAV_FORMAT_16BIT_PCM   = 1, // 16-bit PCM 格式
        WAV_FORMAT_32BIT_FLOAT = 3  // 32-bit IEEE float 格式
    };
 protected:
    // 返回值通过模板特化由 WavDataType 决定
    inline uint16_t GetWavAudioFormat();
    inline float Abs(float number) {
        return (number < 0) ? -number : number;
    }
 protected:
    float inference_time_ = 0;
    uint32_t wav_sample_rate_ = 0;
    std::vector<WavDataType> wav_;
--- a/demos/TTSArmLinux/src/TTSCppFrontend
+++ b/demos/TTSArmLinux/src/TTSCppFrontend
@ -0,0 +1 @@
 ../../TTSCppFrontend/
--- a/demos/TTSArmLinux/src/main.cc
+++ b/demos/TTSArmLinux/src/main.cc
@ -1,90 +1,128 @@
 #include <cstdlib>
 #include <iostream>
 #include <memory>
-#include "paddle_api.h"
+#include <string>
 #include <map>
 #include <glog/logging.h>
 #include <gflags/gflags.h>
 #include <paddle_api.h>
 #include <front/front_interface.h>
 #include "Predictor.hpp"
 using namespace paddle::lite_api;
-std::vector<std::vector<int64_t>> sentencesToChoose = {
+DEFINE_string(sentence, "你好，欢迎使用语音合成服务", "Text to be synthesized (Chinese only. English will crash the program.)");
-    // 009901 昨日，这名“伤者”与医生全部被警方依法刑事拘留。
+DEFINE_string(front_conf, "./front.conf", "Front configuration file");
-    {261, 231, 175, 116, 179, 262, 44, 154, 126, 177, 19, 262, 42, 241, 72, 177, 56, 174, 245, 37, 186, 37, 49, 151, 127, 69, 19, 179, 72, 69, 4, 260, 126, 177, 116, 151, 239, 153, 141},
+DEFINE_string(acoustic_model, "./models/cpu/fastspeech2_csmsc_arm.nb", "Acoustic model .nb file");
-    // 009902 钱伟长想到上海来办学校是经过深思熟虑的。
+// Default must be the vocoder model (same file as VOCODER_PATH in config.sh),
+// not the acoustic model.
+DEFINE_string(vocoder, "./models/cpu/mb_melgan_csmsc_arm.nb", "vocoder .nb file");
-    {174, 83, 213, 39, 20, 260, 89, 40, 30, 177, 22, 71, 9, 153, 8, 37, 17, 260, 251, 260, 99, 179, 177, 116, 151, 125, 70, 233, 177, 51, 176, 108, 177, 184, 153, 242, 40, 45},
+DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file");
-    // 009903 她见我一进门就骂，吃饭时也骂，骂得我抬不起头。
+DEFINE_string(wav_bit_depth, "16", "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)");
-    {182, 2, 151, 85, 232, 73, 151, 123, 154, 52, 151, 143, 154, 5, 179, 39, 113, 69, 17, 177, 114, 105, 154, 5, 179, 154, 5, 40, 45, 232, 182, 8, 37, 186, 174, 74, 182, 168},
+DEFINE_string(wav_sample_rate, "24000", "WAV sample rate, should match the output of the vocoder");
-    // 009904 李述德在离开之前，只说了一句“柱驼杀父亲了”。
+DEFINE_string(cpu_thread, "1", "CPU thread numbers");
    {153, 74, 177, 186, 40, 42, 261, 10, 153, 73, 152, 7, 262, 113, 174, 83, 179, 262, 115, 177, 230, 153, 45, 73, 151, 242, 180, 262, 186, 182, 231, 177, 2, 69, 186, 174, 124, 153, 45},
    // 009905 这种车票和保险单捆绑出售属于重复性购买。
    {262, 44, 262, 163, 39, 41, 173, 99, 71, 42, 37, 28, 260, 84, 40, 14, 179, 152, 220, 37, 21, 39, 183, 177, 170, 179, 177, 185, 240, 39, 162, 69, 186, 260, 128, 70, 170, 154, 9},
    // 009906 戴佩妮的男友西米露接唱情歌，让她非常开心。
    {40, 10, 173, 49, 155, 72, 40, 45, 155, 15, 142, 260, 72, 154, 74, 153, 186, 179, 151, 103, 39, 22, 174, 126, 70, 41, 179, 175, 22, 182, 2, 69, 46, 39, 20, 152, 7, 260, 120},
    // 009907 观大势、谋大局、出大策始终是该院的办院方针。
    {70, 199, 40, 5, 177, 116, 154, 168, 40, 5, 151, 240, 179, 39, 183, 40, 5, 38, 44, 179, 177, 115, 262, 161, 177, 116, 70, 7, 247, 40, 45, 37, 17, 247, 69, 19, 262, 51},
    // 009908 他们骑着摩托回家，正好为农忙时的父母帮忙。
    {182, 2, 154, 55, 174, 73, 262, 45, 154, 157, 182, 230, 71, 212, 151, 77, 180, 262, 59, 71, 29, 214, 155, 162, 154, 20, 177, 114, 40, 45, 69, 186, 154, 185, 37, 19, 154, 20},
    // 009909 但是因为还没到退休年龄，只能掰着指头捱日子。
    {40, 17, 177, 116, 120, 214, 71, 8, 154, 47, 40, 30, 182, 214, 260, 140, 155, 83, 153, 126, 180, 262, 115, 155, 57, 37, 7, 262, 45, 262, 115, 182, 171, 8, 175, 116, 261, 112},
    // 009910 这几天雨水不断，人们恨不得待在家里不出门。
    {262, 44, 151, 74, 182, 82, 240, 177, 213, 37, 184, 40, 202, 180, 175, 52, 154, 55, 71, 54, 37, 186, 40, 42, 40, 7, 261, 10, 151, 77, 153, 74, 37, 186, 39, 183, 154, 52},
 };
 void usage(const char *binName) {
    std::cerr << "Usage:" << std::endl
        << "\t" << binName << " <AM-model-path> <VOC-model-path> <sentences-index:1-10> <output-wav-path>" << std::endl;
 }
 int main(int argc, char *argv[]) {
-    if (argc < 5) {
+    gflags::ParseCommandLineFlags(&argc, &argv, true);
-        usage(argv[0]);
+
    PredictorInterface *predictor;
    if (FLAGS_wav_bit_depth == "16") {
        predictor = new Predictor<int16_t>();
    } else if (FLAGS_wav_bit_depth == "32") {
        predictor = new Predictor<float>();
    } else {
        LOG(ERROR) << "Unsupported WAV bit depth: " << FLAGS_wav_bit_depth;
        return -1;
    }
    /////////////////////////// 前端：文本转音素 ///////////////////////////
    // 实例化文本前端引擎
    speechnn::FrontEngineInterface *front_inst = nullptr;
    front_inst = new speechnn::FrontEngineInterface(FLAGS_front_conf);
    if ((!front_inst) || (front_inst->init())) {
        LOG(ERROR) << "Creater tts engine failed!";
        if (front_inst != nullptr) {
            delete front_inst;
        }
        front_inst = nullptr;
        return -1;
    }
    const char *AMModelPath = argv[1];
    const char *VOCModelPath = argv[2];
    int sentencesIndex = atoi(argv[3]) - 1;
    const char *outputWavPath = argv[4];
-    if (sentencesIndex < 0 || sentencesIndex >= sentencesToChoose.size()) {
+    std::wstring ws_sentence = speechnn::utf8string2wstring(FLAGS_sentence);
-        std::cerr << "sentences-index out of range" << std::endl;
+
    // 繁体转简体
    std::wstring sentence_simp;
    front_inst->Trand2Simp(ws_sentence, sentence_simp); 
    ws_sentence = sentence_simp;
    std::string s_sentence;
    std::vector<std::wstring> sentence_part;
    std::vector<int> phoneids = {};
    std::vector<int> toneids = {};
    // 根据标点进行分句
    LOG(INFO) << "Start to segment sentences by punctuation";
    front_inst->SplitByPunc(ws_sentence, sentence_part); 
    LOG(INFO) << "Segment sentences through punctuation successfully";
    // 分句后获取音素id
    LOG(INFO) << "Start to get the phoneme and tone id sequence of each sentence";
    for(int i = 0; i < sentence_part.size(); i++) {
        LOG(INFO) << "Raw sentence is: " << speechnn::wstring2utf8string(sentence_part[i]);
        front_inst->SentenceNormalize(sentence_part[i]);
        s_sentence = speechnn::wstring2utf8string(sentence_part[i]);
        LOG(INFO) << "After normalization sentence is: " << s_sentence;
        if (0 != front_inst->GetSentenceIds(s_sentence, phoneids, toneids)) {
            LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
            return -1;
        }
-    // 模板参数：WAV数据类型
+    }
-    // 可在 int16_t 和 float 之间切换，
+    LOG(INFO) << "The phoneids of the sentence is: " << limonp::Join(phoneids.begin(), phoneids.end(), " ");
-    // 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV
+    LOG(INFO) << "The toneids of the sentence is: " << limonp::Join(toneids.begin(), toneids.end(), " ");
-    Predictor<int16_t> predictor;
+    LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
-    //Predictor<float> predictor;
+ 
    /////////////////////////// 后端：音素转音频 ///////////////////////////
    // WAV采样率（必须与模型输出匹配）
    // 如果播放速度和音调异常，请修改采样率
    // 常见采样率：16000, 24000, 32000, 44100, 48000, 96000
-    const uint32_t wavSampleRate = 24000;
+    const uint32_t wavSampleRate = std::stoul(FLAGS_wav_sample_rate);
    // CPU线程数
-    const int cpuThreadNum = 1;
+    const int cpuThreadNum = std::stol(FLAGS_cpu_thread);
    // CPU电源模式
    const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH;
-    if (!predictor.Init(AMModelPath, VOCModelPath, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
+    if (!predictor->Init(FLAGS_acoustic_model, FLAGS_vocoder, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
-        std::cerr << "predictor init failed" << std::endl;
+        LOG(ERROR) << "predictor init failed" << std::endl;
        return -1;
    }
-    if (!predictor.RunModel(sentencesToChoose[sentencesIndex])) {
+    std::vector<int64_t> phones(phoneids.size());
-        std::cerr << "predictor run model failed" << std::endl;
+    std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { return static_cast<int64_t>(x); });
    if (!predictor->RunModel(phones)) {
        LOG(ERROR) << "predictor run model failed" << std::endl;
        return -1;
    }
-    std::cout << "Inference time: " << predictor.GetInferenceTime() << " ms, "
+    LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, "
-              << "WAV size (without header): " << predictor.GetWavSize() << " bytes, "
+              << "WAV size (without header): " << predictor->GetWavSize() << " bytes, "
-              << "WAV duration: " << predictor.GetWavDuration() << " ms, "
+              << "WAV duration: " << predictor->GetWavDuration() << " ms, "
-              << "RTF: " << predictor.GetRTF() << std::endl;
+              << "RTF: " << predictor->GetRTF() << std::endl;
-    if (!predictor.WriteWavToFile(outputWavPath)) {
+    if (!predictor->WriteWavToFile(FLAGS_output_wav)) {
-        std::cerr << "write wav file failed" << std::endl;
+        LOG(ERROR) << "write wav file failed" << std::endl;
        return -1;
    }
    delete predictor;
    return 0;
 }
--- a/demos/TTSArmLinux/src/third-party
+++ b/demos/TTSArmLinux/src/third-party
@ -0,0 +1 @@
 TTSCppFrontend/third-party