demos/TTSArmLinux: Use TTSCppFrontend as the front end to support the synthesis of any Chinese sentences

3 years ago · aaca18216e
parent bc739c7b78
commit aaca18216e
14 changed files with 287 additions and 118 deletions
--- a/demos/TTSArmLinux/.gitignore
+++ b/demos/TTSArmLinux/.gitignore
@ -1,4 +1,8 @@
+# 目录
 build/
 output/
 libs/
 models/
+
+# 符号连接
+dict
--- a/demos/TTSArmLinux/README.md
+++ b/demos/TTSArmLinux/README.md
@ -45,10 +45,14 @@ cd PaddleSpeech/demos/TTSArmLinux

 ```
 ./run.sh
+./run.sh --sentence "语音合成测试"
+./run.sh --sentence "输出到指定的音频文件" --output_wav ./output/test.wav
+./run.sh --help
 ```

-将把 [src/main.cpp](src/main.cpp) 里定义在 `sentencesToChoose` 数组中的十句话转换为 `wav` 文件，保存在 `output` 文件夹中。
+目前只支持中文合成，出现任何英文都会导致程序崩溃。

+如果未指定`--output_wav`，默认输出到`./output/tts.wav`。

 ## 手动编译 Paddle Lite 库

--- a/demos/TTSArmLinux/build-depends.sh
+++ b/demos/TTSArmLinux/build-depends.sh
@ -0,0 +1 @@
+src/TTSCppFrontend/build-depends.sh
--- a/demos/TTSArmLinux/build.sh
+++ b/demos/TTSArmLinux/build.sh
@ -1,8 +1,11 @@
 #!/bin/bash
 set -e
+set -x

 cd "$(dirname "$(realpath "$0")")"

+BASE_DIR="$PWD"
+
 # load configure
 . ./config.sh

@ -10,11 +13,17 @@ cd "$(dirname "$(realpath "$0")")"
 echo "ARM_ABI is ${ARM_ABI}"
 echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}"

-rm -rf build
-mkdir -p build
-cd build
+echo "Build depends..."
+./build-depends.sh "$@"

+mkdir -p "$BASE_DIR/build"
+cd "$BASE_DIR/build"
 cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src
-make
+
+if [ "$*" = "" ]; then
+    make -j$(nproc)
+else
+    make "$@"
+fi

 echo "make successful!"
--- a/demos/TTSArmLinux/clean.sh
+++ b/demos/TTSArmLinux/clean.sh
@ -1,8 +1,11 @@
 #!/bin/bash
 set -e
+set -x

 cd "$(dirname "$(realpath "$0")")"

+BASE_DIR="$PWD"
+
 # load configure
 . ./config.sh

@ -12,3 +15,9 @@ set -x
 rm -rf "$OUTPUT_DIR"
 rm -rf "$LIBS_DIR"
 rm -rf "$MODELS_DIR"
+rm -rf "$BASE_DIR/build"
+
+"$BASE_DIR/src/TTSCppFrontend/clean.sh"
+
+# Remove the dict symbolic link; -f keeps `set -e` from failing the script
+# when the link does not exist (e.g. clean.sh is run twice in a row).
+rm -f "$BASE_DIR/dict"
--- a/demos/TTSArmLinux/config.sh
+++ b/demos/TTSArmLinux/config.sh
@ -12,3 +12,4 @@ PADDLE_LITE_DIR="${LIBS_DIR}/inference_lite_lib.armlinux.${ARM_ABI}.gcc.with_ext

 ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb"
 VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb"
+FRONT_CONF="${PWD}/front.conf"
--- a/demos/TTSArmLinux/download.sh
+++ b/demos/TTSArmLinux/download.sh
@ -3,6 +3,8 @@ set -e

 cd "$(dirname "$(realpath "$0")")"

+BASE_DIR="$PWD"
+
 # load configure
 . ./config.sh

@ -38,6 +40,10 @@ download() {
    echo '======================='
 }

+########################################
+
+echo "Download models..."
+
 download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
    'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
    '39e0c6604f97c70f5d13c573d7e709b9' \
@ -54,3 +60,11 @@ download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \
    "$MODELS_DIR"

 echo "Done."
+
+########################################
+
+echo "Download dictionary files..."
+
+# Create (or replace) the dict symbolic link. With plain `ln -s`, re-running
+# this script fails with "File exists" and `set -e` aborts before the
+# frontend download below is reached.
+ln -sfn src/TTSCppFrontend/front_demo/dict "$BASE_DIR/dict"
+
+"$BASE_DIR/src/TTSCppFrontend/download.sh"
--- a/demos/TTSArmLinux/front.conf
+++ b/demos/TTSArmLinux/front.conf
@ -0,0 +1,21 @@
+# jieba conf
+--jieba_dict_path=./dict/jieba/jieba.dict.utf8
+--jieba_hmm_path=./dict/jieba/hmm_model.utf8
+--jieba_user_dict_path=./dict/jieba/user.dict.utf8
+--jieba_idf_path=./dict/jieba/idf.utf8
+--jieba_stop_word_path=./dict/jieba/stop_words.utf8
+
+# dict conf fastspeech2_0.4
+--seperate_tone=false
+--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
+--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
+
+# dict conf speedyspeech_0.5
+#--seperate_tone=true
+#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
+#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
+#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
+
+# dict of tranditional_to_simplified
+--trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt
--- a/demos/TTSArmLinux/run.sh
+++ b/demos/TTSArmLinux/run.sh
@ -7,12 +7,13 @@ cd "$(dirname "$(realpath "$0")")"
 . ./config.sh

 # create dir
-rm -rf "$OUTPUT_DIR"
 mkdir -p "$OUTPUT_DIR"

 # run
-for i in {1..10}; do
-    (set -x; ./build/paddlespeech_tts_demo "$ACOUSTIC_MODEL_PATH" "$VOCODER_PATH" $i "$OUTPUT_DIR/$i.wav")
-done
-
-ls -lh "$OUTPUT_DIR"/*.wav
+set -x
+./build/paddlespeech_tts_demo \
+    --front_conf "$FRONT_CONF" \
+    --acoustic_model "$ACOUSTIC_MODEL_PATH" \
+    --vocoder "$VOCODER_PATH" \
+    "$@"
+# end
--- a/demos/TTSArmLinux/src/CMakeLists.txt
+++ b/demos/TTSArmLinux/src/CMakeLists.txt
@ -1,4 +1,18 @@
 cmake_minimum_required(VERSION 3.10)
+project(paddlespeech_tts_demo)
+
+
+########## Global Options ##########
+
+option(WITH_FRONT_DEMO "Build front demo" OFF)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(ABSL_PROPAGATE_CXX_STD ON)
+
+
+########## ARM Options ##########
+
 set(CMAKE_SYSTEM_NAME Linux)
 if(ARM_ABI STREQUAL "armv8")
    set(CMAKE_SYSTEM_PROCESSOR aarch64)
@ -13,7 +27,9 @@ else()
    return()
 endif()

-project(paddlespeech_tts_demo)
+
+########## Paddle Lite Options ##########
+
 message(STATUS "TARGET ARCH ABI: ${ARM_ABI}")
 message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}")

@ -29,6 +45,9 @@ elseif(ARM_ABI STREQUAL "armv7hf")
    set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" )
 endif()

+
+########## Dependencies ##########
+
 find_package(OpenMP REQUIRED)
 if(OpenMP_FOUND OR OpenMP_CXX_FOUND)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
@ -43,5 +62,19 @@ else()
    return()
 endif()

+
+############### tts cpp frontend ###############
+
+add_subdirectory(TTSCppFrontend)
+
+include_directories(
+    TTSCppFrontend/src
+    third-party/build/src/cppjieba/include
+    third-party/build/src/limonp/include
+)
+
+
+############### paddlespeech_tts_demo ###############
+
 add_executable(paddlespeech_tts_demo main.cc)
-target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared)
+target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared paddlespeech_tts_front)
--- a/demos/TTSArmLinux/src/Predictor.hpp
+++ b/demos/TTSArmLinux/src/Predictor.hpp
@ -9,42 +9,43 @@

 using namespace paddle::lite_api;

+// Abstract interface implemented by Predictor<WavDataType>. It lets callers
+// hold a predictor through a single pointer type regardless of which WAV
+// sample type (int16_t or float) was selected.
+class PredictorInterface {
+public:
+    // Predictors are destroyed through a PredictorInterface pointer, so the
+    // destructor must be virtual; otherwise the delete is undefined behavior
+    // and the derived destructor is not run.
+    virtual ~PredictorInterface() = default;
+
+    // Initializes the predictor from the two model files.
+    // Callers treat a false return value as an initialization failure.
+    virtual bool Init(
+            const std::string &AcousticModelPath,
+            const std::string &VocoderPath,
+            PowerMode cpuPowerMode,
+            int cpuThreadNum,
+            // WAV sample rate (must match the model output).
+            // If playback speed or pitch sounds wrong, change the sample rate.
+            // Common sample rates: 16000, 24000, 32000, 44100, 48000, 96000
+            uint32_t wavSampleRate
+    ) = 0;
+    virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) = 0;
+    virtual void ReleaseModel() = 0;
+    // Runs synthesis for a sequence of phone ids.
+    // Callers treat a false return value as a failure.
+    virtual bool RunModel(const std::vector<int64_t> &phones) = 0;
+    virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) = 0;
+    virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) = 0;
+    virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) = 0;
+    virtual void SaveFloatWav(float *floatWav, int64_t size) = 0;
+    virtual bool IsLoaded() = 0;
+    virtual float GetInferenceTime() = 0;
+    // Size of the WAV sample data in bytes (without the header).
+    virtual int GetWavSize() = 0;
+    // WAV duration, in milliseconds.
+    virtual float GetWavDuration() = 0;
+    // RTF (inference time / audio duration).
+    virtual float GetRTF() = 0;
+    virtual void ReleaseWav() = 0;
+    virtual bool WriteWavToFile(const std::string &wavPath) = 0;
+};
+
 // WavDataType: WAV数据类型
 // 可在 int16_t 和 float 之间切换，
 // 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV
 template<typename WavDataType>
-class Predictor {
+class Predictor : public PredictorInterface {
 public:
-    struct WavHeader {
-        // RIFF 头
-        char riff[4] = {'R', 'I', 'F', 'F'};
-        uint32_t size = 0;
-        char wave[4] = {'W', 'A', 'V', 'E'};
-
-        // FMT 头
-        char fmt[4] = {'f', 'm', 't', ' '};
-        uint32_t fmt_size = 16;
-        uint16_t audio_format = 0;
-        uint16_t num_channels = 1;
-        uint32_t sample_rate = 0;
-        uint32_t byte_rate = 0;
-        uint16_t block_align = 0;
-        uint16_t bits_per_sample = sizeof(WavDataType) * 8;
-
-        // DATA 头
-        char data[4] = {'d', 'a', 't', 'a'};
-        uint32_t data_size = 0;
-    };
-
-    enum WavAudioFormat {
-        WAV_FORMAT_16BIT_PCM   = 1, // 16-bit PCM 格式
-        WAV_FORMAT_32BIT_FLOAT = 3  // 32-bit IEEE float 格式
-    };
-
-    // 返回值通过模板特化由 WavDataType 决定
-    inline uint16_t GetWavAudioFormat();
-
-    bool Init(
+    virtual bool Init(
            const std::string &AcousticModelPath,
            const std::string &VocoderPath,
            PowerMode cpuPowerMode,
@ -53,7 +54,7 @@ public:
            // 如果播放速度和音调异常，请修改采样率
            // 常见采样率：16000, 24000, 32000, 44100, 48000, 96000
            uint32_t wavSampleRate
-    ) {
+    ) override {
        // Release model if exists
        ReleaseModel();

@ -71,12 +72,12 @@ public:
        return true;
    }

-    ~Predictor() {
+    virtual ~Predictor() {
        ReleaseModel();
        ReleaseWav();
    }

-    std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) {
+    virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) override {
        if (modelPath.empty()) {
            return nullptr;
        }
@ -90,12 +91,12 @@ public:
        return CreatePaddlePredictor<MobileConfig>(config);
    }

-    void ReleaseModel() {
+    virtual void ReleaseModel() override {
        acoustic_model_predictor_ = nullptr;
        vocoder_predictor_ = nullptr;
    }

-    bool RunModel(const std::vector<int64_t> &phones) {
+    virtual bool RunModel(const std::vector<int64_t> &phones) override {
        if (!IsLoaded()) {
            return false;
        }
@ -116,7 +117,7 @@ public:
        return true;
    }

-    std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) {
+    virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) override {
        auto phones_handle = acoustic_model_predictor_->GetInput(0);
        phones_handle->Resize({static_cast<int64_t>(phones.size())});
        phones_handle->CopyFromCpu(phones.data());
@ -135,7 +136,7 @@ public:
        return am_output_handle;
    }

-    std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) {
+    virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) override {
        auto mel_handle = vocoder_predictor_->GetInput(0);
        // [?, 80]
        auto dims = amOutput->shape();
@ -157,7 +158,7 @@ public:
        return voc_output_handle;
    }

-    void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) {
+    virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) override {
        // 获取输出Tensor的数据
        int64_t output_size = 1;
        for (auto dim : vocOutput->shape()) {
@ -168,17 +169,13 @@ public:
        SaveFloatWav(output_data, output_size);
    }

-    inline float Abs(float number) {
-        return (number < 0) ? -number : number;
-    }
-
-    void SaveFloatWav(float *floatWav, int64_t size);
+    virtual void SaveFloatWav(float *floatWav, int64_t size) override;

-    bool IsLoaded() {
+    virtual bool IsLoaded() override {
        return acoustic_model_predictor_ != nullptr && vocoder_predictor_ != nullptr;
    }

-    float GetInferenceTime() {
+    virtual float GetInferenceTime() override {
        return inference_time_;
    }

@ -186,25 +183,25 @@ public:
        return wav_;
    }

-    int GetWavSize() {
+    virtual int GetWavSize() override {
        return wav_.size() * sizeof(WavDataType);
    }

    // 获取WAV持续时间（单位：毫秒）
-    float GetWavDuration() {
+    virtual float GetWavDuration() override {
        return static_cast<float>(GetWavSize()) / sizeof(WavDataType) / static_cast<float>(wav_sample_rate_) * 1000;
    }

    // 获取RTF（合成时间 / 音频时长）
-    float GetRTF() {
+    virtual float GetRTF() override {
        return GetInferenceTime() / GetWavDuration();
    }

-    void ReleaseWav() {
+    virtual void ReleaseWav() override {
        wav_.clear();
    }

-    bool WriteWavToFile(const std::string &wavPath) {
+    virtual bool WriteWavToFile(const std::string &wavPath) override {
        std::ofstream fout(wavPath, std::ios::binary);
        if (!fout.is_open()) {
            return false;
@ -227,7 +224,42 @@ public:
        return true;
    }

-private:
+protected:
+    struct WavHeader {
+        // RIFF 头
+        char riff[4] = {'R', 'I', 'F', 'F'};
+        uint32_t size = 0;
+        char wave[4] = {'W', 'A', 'V', 'E'};
+
+        // FMT 头
+        char fmt[4] = {'f', 'm', 't', ' '};
+        uint32_t fmt_size = 16;
+        uint16_t audio_format = 0;
+        uint16_t num_channels = 1;
+        uint32_t sample_rate = 0;
+        uint32_t byte_rate = 0;
+        uint16_t block_align = 0;
+        uint16_t bits_per_sample = sizeof(WavDataType) * 8;
+
+        // DATA 头
+        char data[4] = {'d', 'a', 't', 'a'};
+        uint32_t data_size = 0;
+    };
+
+    enum WavAudioFormat {
+        WAV_FORMAT_16BIT_PCM   = 1, // 16-bit PCM 格式
+        WAV_FORMAT_32BIT_FLOAT = 3  // 32-bit IEEE float 格式
+    };
+
+protected:
+    // 返回值通过模板特化由 WavDataType 决定
+    inline uint16_t GetWavAudioFormat();
+
+    inline float Abs(float number) {
+        return (number < 0) ? -number : number;
+    }
+
+protected:
    float inference_time_ = 0;
    uint32_t wav_sample_rate_ = 0;
    std::vector<WavDataType> wav_;
--- a/demos/TTSArmLinux/src/TTSCppFrontend
+++ b/demos/TTSArmLinux/src/TTSCppFrontend
@ -0,0 +1 @@
+../../TTSCppFrontend/
--- a/demos/TTSArmLinux/src/main.cc
+++ b/demos/TTSArmLinux/src/main.cc
@ -1,90 +1,128 @@
 #include <cstdlib>
 #include <iostream>
 #include <memory>
-#include "paddle_api.h"
+#include <string>
+#include <map>
+#include <glog/logging.h>
+#include <gflags/gflags.h>
+#include <paddle_api.h>
+#include <front/front_interface.h>
 #include "Predictor.hpp"

 using namespace paddle::lite_api;

-std::vector<std::vector<int64_t>> sentencesToChoose = {
-    // 009901 昨日，这名“伤者”与医生全部被警方依法刑事拘留。
-    {261, 231, 175, 116, 179, 262, 44, 154, 126, 177, 19, 262, 42, 241, 72, 177, 56, 174, 245, 37, 186, 37, 49, 151, 127, 69, 19, 179, 72, 69, 4, 260, 126, 177, 116, 151, 239, 153, 141},
-    // 009902 钱伟长想到上海来办学校是经过深思熟虑的。
-    {174, 83, 213, 39, 20, 260, 89, 40, 30, 177, 22, 71, 9, 153, 8, 37, 17, 260, 251, 260, 99, 179, 177, 116, 151, 125, 70, 233, 177, 51, 176, 108, 177, 184, 153, 242, 40, 45},
-    // 009903 她见我一进门就骂，吃饭时也骂，骂得我抬不起头。
-    {182, 2, 151, 85, 232, 73, 151, 123, 154, 52, 151, 143, 154, 5, 179, 39, 113, 69, 17, 177, 114, 105, 154, 5, 179, 154, 5, 40, 45, 232, 182, 8, 37, 186, 174, 74, 182, 168},
-    // 009904 李述德在离开之前，只说了一句“柱驼杀父亲了”。
-    {153, 74, 177, 186, 40, 42, 261, 10, 153, 73, 152, 7, 262, 113, 174, 83, 179, 262, 115, 177, 230, 153, 45, 73, 151, 242, 180, 262, 186, 182, 231, 177, 2, 69, 186, 174, 124, 153, 45},
-    // 009905 这种车票和保险单捆绑出售属于重复性购买。
-    {262, 44, 262, 163, 39, 41, 173, 99, 71, 42, 37, 28, 260, 84, 40, 14, 179, 152, 220, 37, 21, 39, 183, 177, 170, 179, 177, 185, 240, 39, 162, 69, 186, 260, 128, 70, 170, 154, 9},
-    // 009906 戴佩妮的男友西米露接唱情歌，让她非常开心。
-    {40, 10, 173, 49, 155, 72, 40, 45, 155, 15, 142, 260, 72, 154, 74, 153, 186, 179, 151, 103, 39, 22, 174, 126, 70, 41, 179, 175, 22, 182, 2, 69, 46, 39, 20, 152, 7, 260, 120},
-    // 009907 观大势、谋大局、出大策始终是该院的办院方针。
-    {70, 199, 40, 5, 177, 116, 154, 168, 40, 5, 151, 240, 179, 39, 183, 40, 5, 38, 44, 179, 177, 115, 262, 161, 177, 116, 70, 7, 247, 40, 45, 37, 17, 247, 69, 19, 262, 51},
-    // 009908 他们骑着摩托回家，正好为农忙时的父母帮忙。
-    {182, 2, 154, 55, 174, 73, 262, 45, 154, 157, 182, 230, 71, 212, 151, 77, 180, 262, 59, 71, 29, 214, 155, 162, 154, 20, 177, 114, 40, 45, 69, 186, 154, 185, 37, 19, 154, 20},
-    // 009909 但是因为还没到退休年龄，只能掰着指头捱日子。
-    {40, 17, 177, 116, 120, 214, 71, 8, 154, 47, 40, 30, 182, 214, 260, 140, 155, 83, 153, 126, 180, 262, 115, 155, 57, 37, 7, 262, 45, 262, 115, 182, 171, 8, 175, 116, 261, 112},
-    // 009910 这几天雨水不断，人们恨不得待在家里不出门。
-    {262, 44, 151, 74, 182, 82, 240, 177, 213, 37, 184, 40, 202, 180, 175, 52, 154, 55, 71, 54, 37, 186, 40, 42, 40, 7, 261, 10, 151, 77, 153, 74, 37, 186, 39, 183, 154, 52},
-};
-
-void usage(const char *binName) {
-    std::cerr << "Usage:" << std::endl
-        << "\t" << binName << " <AM-model-path> <VOC-model-path> <sentences-index:1-10> <output-wav-path>" << std::endl;
-}
+// Command-line flags (gflags). All options are declared as strings;
+// wav_sample_rate and cpu_thread are converted to integers in main().
+DEFINE_string(sentence, "你好，欢迎使用语音合成服务", "Text to be synthesized (Chinese only. English will crash the program.)");
+DEFINE_string(front_conf, "./front.conf", "Front configuration file");
+DEFINE_string(acoustic_model, "./models/cpu/fastspeech2_csmsc_arm.nb", "Acoustic model .nb file");
+// The default must be the vocoder model (matches VOCODER_PATH in config.sh),
+// not the acoustic model file.
+DEFINE_string(vocoder, "./models/cpu/mb_melgan_csmsc_arm.nb", "vocoder .nb file");
+DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file");
+DEFINE_string(wav_bit_depth, "16", "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)");
+DEFINE_string(wav_sample_rate, "24000", "WAV sample rate, should match the output of the vocoder");
+DEFINE_string(cpu_thread, "1", "CPU thread numbers");

 int main(int argc, char *argv[]) {
-    if (argc < 5) {
-        usage(argv[0]);
+    gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+    PredictorInterface *predictor;
+
+    if (FLAGS_wav_bit_depth == "16") {
+        predictor = new Predictor<int16_t>();
+    } else if (FLAGS_wav_bit_depth == "32") {
+        predictor = new Predictor<float>();
+    } else {
+        LOG(ERROR) << "Unsupported WAV bit depth: " << FLAGS_wav_bit_depth;
        return -1;
    }
-    const char *AMModelPath = argv[1];
-    const char *VOCModelPath = argv[2];
-    int sentencesIndex = atoi(argv[3]) - 1;
-    const char *outputWavPath = argv[4];

-    if (sentencesIndex < 0 || sentencesIndex >= sentencesToChoose.size()) {
-        std::cerr << "sentences-index out of range" << std::endl;
+
+    /////////////////////////// 前端：文本转音素 ///////////////////////////
+
+    // 实例化文本前端引擎
+    speechnn::FrontEngineInterface *front_inst = nullptr;
+    front_inst = new speechnn::FrontEngineInterface(FLAGS_front_conf);
+    if ((!front_inst) || (front_inst->init())) {
+        LOG(ERROR) << "Creater tts engine failed!";
+        if (front_inst != nullptr) {
+            delete front_inst;
+        }
+        front_inst = nullptr;
        return -1;
    }

-    // 模板参数：WAV数据类型
-    // 可在 int16_t 和 float 之间切换，
-    // 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV
-    Predictor<int16_t> predictor;
-    //Predictor<float> predictor;
+    std::wstring ws_sentence = speechnn::utf8string2wstring(FLAGS_sentence);
+
+    // 繁体转简体
+    std::wstring sentence_simp;
+    front_inst->Trand2Simp(ws_sentence, sentence_simp); 
+    ws_sentence = sentence_simp;
+
+    std::string s_sentence;
+    std::vector<std::wstring> sentence_part;
+    std::vector<int> phoneids = {};
+    std::vector<int> toneids = {};
+
+    // 根据标点进行分句
+    LOG(INFO) << "Start to segment sentences by punctuation";
+    front_inst->SplitByPunc(ws_sentence, sentence_part); 
+    LOG(INFO) << "Segment sentences through punctuation successfully";
+
+    // 分句后获取音素id
+    LOG(INFO) << "Start to get the phoneme and tone id sequence of each sentence";
+    for(int i = 0; i < sentence_part.size(); i++) {
+
+        LOG(INFO) << "Raw sentence is: " << speechnn::wstring2utf8string(sentence_part[i]);
+        front_inst->SentenceNormalize(sentence_part[i]);
+        s_sentence = speechnn::wstring2utf8string(sentence_part[i]);
+        LOG(INFO) << "After normalization sentence is: " << s_sentence;
+        
+        if (0 != front_inst->GetSentenceIds(s_sentence, phoneids, toneids)) {
+            LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
+            return -1;
+        }
+            
+    }
+    LOG(INFO) << "The phoneids of the sentence is: " << limonp::Join(phoneids.begin(), phoneids.end(), " ");
+    LOG(INFO) << "The toneids of the sentence is: " << limonp::Join(toneids.begin(), toneids.end(), " ");
+    LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
+ 
+
+    /////////////////////////// 后端：音素转音频 ///////////////////////////

    // WAV采样率（必须与模型输出匹配）
    // 如果播放速度和音调异常，请修改采样率
    // 常见采样率：16000, 24000, 32000, 44100, 48000, 96000
-    const uint32_t wavSampleRate = 24000;
+    const uint32_t wavSampleRate = std::stoul(FLAGS_wav_sample_rate);

    // CPU线程数
-    const int cpuThreadNum = 1;
+    const int cpuThreadNum = std::stol(FLAGS_cpu_thread);

    // CPU电源模式
    const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH;

-    if (!predictor.Init(AMModelPath, VOCModelPath, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
-        std::cerr << "predictor init failed" << std::endl;
+    if (!predictor->Init(FLAGS_acoustic_model, FLAGS_vocoder, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
+        LOG(ERROR) << "predictor init failed" << std::endl;
        return -1;
    }

-    if (!predictor.RunModel(sentencesToChoose[sentencesIndex])) {
-        std::cerr << "predictor run model failed" << std::endl;
+    std::vector<int64_t> phones(phoneids.size());
+    std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { return static_cast<int64_t>(x); });
+
+    if (!predictor->RunModel(phones)) {
+        LOG(ERROR) << "predictor run model failed" << std::endl;
        return -1;
    }

-    std::cout << "Inference time: " << predictor.GetInferenceTime() << " ms, "
-              << "WAV size (without header): " << predictor.GetWavSize() << " bytes, "
-              << "WAV duration: " << predictor.GetWavDuration() << " ms, "
-              << "RTF: " << predictor.GetRTF() << std::endl;
+    LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, "
+              << "WAV size (without header): " << predictor->GetWavSize() << " bytes, "
+              << "WAV duration: " << predictor->GetWavDuration() << " ms, "
+              << "RTF: " << predictor->GetRTF() << std::endl;

-    if (!predictor.WriteWavToFile(outputWavPath)) {
-        std::cerr << "write wav file failed" << std::endl;
+    if (!predictor->WriteWavToFile(FLAGS_output_wav)) {
+        LOG(ERROR) << "write wav file failed" << std::endl;
        return -1;
    }

+    delete predictor;
+
    return 0;
 }
--- a/demos/TTSArmLinux/src/third-party
+++ b/demos/TTSArmLinux/src/third-party
@ -0,0 +1 @@
+TTSCppFrontend/third-party