[TTS][Paddle-Lite] Add Chinese C++ TTS Frontend, let TTS Arm Linux demo support the synthesis of arbitrary Chinese sentences (#3018)

* [TTS] add a TTS CPP frontend demo
pull/3031/head
老虎会游泳 1 year ago committed by GitHub
parent 1afd14acd9
commit 34f2995bcf

@ -1,4 +1,8 @@
# Directories
build/
output/
libs/
models/
# Symbolic link
dict

@ -12,7 +12,7 @@
```
# Ubuntu
sudo apt install build-essential cmake wget tar unzip
sudo apt install build-essential cmake pkg-config wget tar unzip
# CentOS
sudo yum groupinstall "Development Tools"
@ -45,10 +45,14 @@ cd PaddleSpeech/demos/TTSArmLinux
```
./run.sh
./run.sh --sentence "语音合成测试"
./run.sh --sentence "输出到指定的音频文件" --output_wav ./output/test.wav
./run.sh --help
```
This converts the ten sentences defined in the `sentencesToChoose` array in [src/main.cpp](src/main.cpp) into `wav` files and saves them in the `output` folder.
Currently only Chinese synthesis is supported; any English text will crash the program.
If `--output_wav` is not specified, the output is written to `./output/tts.wav` by default.
## 手动编译 Paddle Lite 库

@ -0,0 +1 @@
src/TTSCppFrontend/build-depends.sh

@ -1,8 +1,11 @@
#!/bin/bash
set -e
set -x
cd "$(dirname "$(realpath "$0")")"
BASE_DIR="$PWD"
# load configure
. ./config.sh
@ -10,11 +13,17 @@ cd "$(dirname "$(realpath "$0")")"
echo "ARM_ABI is ${ARM_ABI}"
echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}"
rm -rf build
mkdir -p build
cd build
echo "Build depends..."
./build-depends.sh "$@"
mkdir -p "$BASE_DIR/build"
cd "$BASE_DIR/build"
cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src
make
if [ "$*" = "" ]; then
make -j$(nproc)
else
make "$@"
fi
echo "make successful!"

@ -1,8 +1,11 @@
#!/bin/bash
set -e
set -x
cd "$(dirname "$(realpath "$0")")"
BASE_DIR="$PWD"
# load configure
. ./config.sh
@ -12,3 +15,9 @@ set -x
rm -rf "$OUTPUT_DIR"
rm -rf "$LIBS_DIR"
rm -rf "$MODELS_DIR"
rm -rf "$BASE_DIR/build"
"$BASE_DIR/src/TTSCppFrontend/clean.sh"
# Symbolic link
rm "$BASE_DIR/dict"

@ -10,5 +10,6 @@ OUTPUT_DIR="${PWD}/output"
PADDLE_LITE_DIR="${LIBS_DIR}/inference_lite_lib.armlinux.${ARM_ABI}.gcc.with_extra.with_cv/cxx"
#PADDLE_LITE_DIR="/path/to/Paddle-Lite/build.lite.linux.${ARM_ABI}.gcc/inference_lite_lib.armlinux.${ARM_ABI}/cxx"
AM_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb"
VOC_MODEL_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb"
ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb"
VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb"
FRONT_CONF="${PWD}/front.conf"

@ -3,6 +3,8 @@ set -e
cd "$(dirname "$(realpath "$0")")"
BASE_DIR="$PWD"
# load configure
. ./config.sh
@ -38,6 +40,10 @@ download() {
echo '======================='
}
########################################
echo "Download models..."
download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \
'39e0c6604f97c70f5d13c573d7e709b9' \
@ -54,3 +60,11 @@ download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \
"$MODELS_DIR"
echo "Done."
########################################
echo "Download dictionary files..."
ln -s src/TTSCppFrontend/front_demo/dict "$BASE_DIR/"
"$BASE_DIR/src/TTSCppFrontend/download.sh"

@ -0,0 +1,21 @@
# jieba conf
--jieba_dict_path=./dict/jieba/jieba.dict.utf8
--jieba_hmm_path=./dict/jieba/hmm_model.utf8
--jieba_user_dict_path=./dict/jieba/user.dict.utf8
--jieba_idf_path=./dict/jieba/idf.utf8
--jieba_stop_word_path=./dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
# dict of tranditional_to_simplified
--trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt
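# Note: the paths above are relative to the demo's working directory;
# run.sh relies on the ./dict symbolic link created by download.sh.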

@ -7,12 +7,13 @@ cd "$(dirname "$(realpath "$0")")"
. ./config.sh
# create dir
rm -rf "$OUTPUT_DIR"
mkdir -p "$OUTPUT_DIR"
# run
for i in {1..10}; do
(set -x; ./build/paddlespeech_tts_demo "$AM_MODEL_PATH" "$VOC_MODEL_PATH" $i "$OUTPUT_DIR/$i.wav")
done
ls -lh "$OUTPUT_DIR"/*.wav
set -x
./build/paddlespeech_tts_demo \
--front_conf "$FRONT_CONF" \
--acoustic_model "$ACOUSTIC_MODEL_PATH" \
--vocoder "$VOCODER_PATH" \
"$@"
# end

@ -1,4 +1,18 @@
cmake_minimum_required(VERSION 3.10)
project(paddlespeech_tts_demo)
########## Global Options ##########
option(WITH_FRONT_DEMO "Build front demo" OFF)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(ABSL_PROPAGATE_CXX_STD ON)
########## ARM Options ##########
set(CMAKE_SYSTEM_NAME Linux)
if(ARM_ABI STREQUAL "armv8")
set(CMAKE_SYSTEM_PROCESSOR aarch64)
@ -13,14 +27,16 @@ else()
return()
endif()
project(paddlespeech_tts_demo)
########## Paddle Lite Options ##########
message(STATUS "TARGET ARCH ABI: ${ARM_ABI}")
message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}")
include_directories(${PADDLE_LITE_DIR}/include)
link_directories(${PADDLE_LITE_DIR}/libs/${ARM_ABI})
link_directories(${PADDLE_LITE_DIR}/lib)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if(ARM_ABI STREQUAL "armv8")
set(CMAKE_CXX_FLAGS "-march=armv8-a ${CMAKE_CXX_FLAGS}")
set(CMAKE_C_FLAGS "-march=armv8-a ${CMAKE_C_FLAGS}")
@ -29,6 +45,9 @@ elseif(ARM_ABI STREQUAL "armv7hf")
set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" )
endif()
########## Dependencies ##########
find_package(OpenMP REQUIRED)
if(OpenMP_FOUND OR OpenMP_CXX_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
@ -43,5 +62,19 @@ else()
return()
endif()
############### tts cpp frontend ###############
add_subdirectory(TTSCppFrontend)
include_directories(
TTSCppFrontend/src
third-party/build/src/cppjieba/include
third-party/build/src/limonp/include
)
############### paddlespeech_tts_demo ###############
add_executable(paddlespeech_tts_demo main.cc)
target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared)
target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared paddlespeech_tts_front)

@ -9,32 +9,78 @@
using namespace paddle::lite_api;
typedef int16_t WavDataType;
class PredictorInterface {
public:
virtual ~PredictorInterface() = 0;
virtual bool Init(
const std::string &AcousticModelPath,
const std::string &VocoderPath,
PowerMode cpuPowerMode,
int cpuThreadNum,
// The WAV sample rate must match the model output
// If the playback speed or pitch sounds wrong, adjust the sample rate
// Common sample rates: 16000, 24000, 32000, 44100, 48000, 96000
uint32_t wavSampleRate
) = 0;
virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) = 0;
virtual void ReleaseModel() = 0;
virtual bool RunModel(const std::vector<int64_t> &phones) = 0;
virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) = 0;
virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) = 0;
virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) = 0;
virtual void SaveFloatWav(float *floatWav, int64_t size) = 0;
virtual bool IsLoaded() = 0;
virtual float GetInferenceTime() = 0;
virtual int GetWavSize() = 0;
// Get the WAV duration in milliseconds
virtual float GetWavDuration() = 0;
// Get the RTF (synthesis time / audio duration)
virtual float GetRTF() = 0;
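// Illustrative example: 500 ms of inference producing 2000 ms of audio gives RTF = 0.25;
// values below 1.0 mean faster-than-real-time synthesis.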
virtual void ReleaseWav() = 0;
virtual bool WriteWavToFile(const std::string &wavPath) = 0;
};
PredictorInterface::~PredictorInterface() {}
class Predictor {
// WavDataType: the WAV sample data type
// It can be switched between int16_t and float
// to produce 16-bit PCM or 32-bit IEEE float WAV files
template<typename WavDataType>
class Predictor : public PredictorInterface {
public:
bool Init(const std::string &AMModelPath, const std::string &VOCModelPath, int cpuThreadNum, const std::string &cpuPowerMode) {
virtual bool Init(
const std::string &AcousticModelPath,
const std::string &VocoderPath,
PowerMode cpuPowerMode,
int cpuThreadNum,
// The WAV sample rate must match the model output
// If the playback speed or pitch sounds wrong, adjust the sample rate
// Common sample rates: 16000, 24000, 32000, 44100, 48000, 96000
uint32_t wavSampleRate
) override {
// Release model if exists
ReleaseModel();
AM_predictor_ = LoadModel(AMModelPath, cpuThreadNum, cpuPowerMode);
if (AM_predictor_ == nullptr) {
acoustic_model_predictor_ = LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode);
if (acoustic_model_predictor_ == nullptr) {
return false;
}
VOC_predictor_ = LoadModel(VOCModelPath, cpuThreadNum, cpuPowerMode);
if (VOC_predictor_ == nullptr) {
vocoder_predictor_ = LoadModel(VocoderPath, cpuThreadNum, cpuPowerMode);
if (vocoder_predictor_ == nullptr) {
return false;
}
wav_sample_rate_ = wavSampleRate;
return true;
}
~Predictor() {
virtual ~Predictor() {
ReleaseModel();
ReleaseWav();
}
std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, const std::string &cpuPowerMode) {
virtual std::shared_ptr<PaddlePredictor> LoadModel(const std::string &modelPath, int cpuThreadNum, PowerMode cpuPowerMode) override {
if (modelPath.empty()) {
return nullptr;
}
@ -43,33 +89,17 @@ public:
MobileConfig config;
config.set_model_from_file(modelPath);
config.set_threads(cpuThreadNum);
if (cpuPowerMode == "LITE_POWER_HIGH") {
config.set_power_mode(PowerMode::LITE_POWER_HIGH);
} else if (cpuPowerMode == "LITE_POWER_LOW") {
config.set_power_mode(PowerMode::LITE_POWER_LOW);
} else if (cpuPowerMode == "LITE_POWER_FULL") {
config.set_power_mode(PowerMode::LITE_POWER_FULL);
} else if (cpuPowerMode == "LITE_POWER_NO_BIND") {
config.set_power_mode(PowerMode::LITE_POWER_NO_BIND);
} else if (cpuPowerMode == "LITE_POWER_RAND_HIGH") {
config.set_power_mode(PowerMode::LITE_POWER_RAND_HIGH);
} else if (cpuPowerMode == "LITE_POWER_RAND_LOW") {
config.set_power_mode(PowerMode::LITE_POWER_RAND_LOW);
} else {
std::cerr << "Unknown cpu power mode!" << std::endl;
return nullptr;
}
config.set_power_mode(cpuPowerMode);
return CreatePaddlePredictor<MobileConfig>(config);
}
void ReleaseModel() {
AM_predictor_ = nullptr;
VOC_predictor_ = nullptr;
virtual void ReleaseModel() override {
acoustic_model_predictor_ = nullptr;
vocoder_predictor_ = nullptr;
}
bool RunModel(const std::vector<int64_t> &phones) {
virtual bool RunModel(const std::vector<int64_t> &phones) override {
if (!IsLoaded()) {
return false;
}
@ -78,7 +108,7 @@ public:
auto start = std::chrono::system_clock::now();
// Run inference
VOCOutputToWav(GetAMOutput(phones));
VocoderOutputToWav(GetVocoderOutput(GetAcousticModelOutput(phones)));
// Stop timing
auto end = std::chrono::system_clock::now();
@ -90,16 +120,16 @@ public:
return true;
}
std::unique_ptr<const Tensor> GetAMOutput(const std::vector<int64_t> &phones) {
auto phones_handle = AM_predictor_->GetInput(0);
virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(const std::vector<int64_t> &phones) override {
auto phones_handle = acoustic_model_predictor_->GetInput(0);
phones_handle->Resize({static_cast<int64_t>(phones.size())});
phones_handle->CopyFromCpu(phones.data());
AM_predictor_->Run();
acoustic_model_predictor_->Run();
// Get the output tensor
auto am_output_handle = AM_predictor_->GetOutput(0);
auto am_output_handle = acoustic_model_predictor_->GetOutput(0);
// Print the shape of the output tensor
std::cout << "AM Output shape: ";
std::cout << "Acoustic Model Output shape: ";
auto shape = am_output_handle->shape();
for (auto s : shape) {
std::cout << s << ", ";
@ -109,60 +139,46 @@ public:
return am_output_handle;
}
void VOCOutputToWav(std::unique_ptr<const Tensor> &&input) {
auto mel_handle = VOC_predictor_->GetInput(0);
virtual std::unique_ptr<const Tensor> GetVocoderOutput(std::unique_ptr<const Tensor> &&amOutput) override {
auto mel_handle = vocoder_predictor_->GetInput(0);
// [?, 80]
auto dims = input->shape();
auto dims = amOutput->shape();
mel_handle->Resize(dims);
auto am_output_data = input->mutable_data<float>();
auto am_output_data = amOutput->mutable_data<float>();
mel_handle->CopyFromCpu(am_output_data);
VOC_predictor_->Run();
vocoder_predictor_->Run();
// Get the output tensor
auto voc_output_handle = VOC_predictor_->GetOutput(0);
auto voc_output_handle = vocoder_predictor_->GetOutput(0);
// Print the shape of the output tensor
std::cout << "VOC Output shape: ";
std::cout << "Vocoder Output shape: ";
auto shape = voc_output_handle->shape();
for (auto s : shape) {
std::cout << s << ", ";
}
std::cout << std::endl;
return voc_output_handle;
}
virtual void VocoderOutputToWav(std::unique_ptr<const Tensor> &&vocOutput) override {
// Get the data of the output tensor
int64_t output_size = 1;
for (auto dim : voc_output_handle->shape()) {
for (auto dim : vocOutput->shape()) {
output_size *= dim;
}
auto output_data = voc_output_handle->mutable_data<float>();
auto output_data = vocOutput->mutable_data<float>();
SaveFloatWav(output_data, output_size);
}
inline float Abs(float number) {
return (number < 0) ? -number : number;
}
void SaveFloatWav(float *floatWav, int64_t size) {
wav_.resize(size);
float maxSample = 0.01;
// Find the maximum absolute sample value
for (int64_t i=0; i<size; i++) {
float sample = Abs(floatWav[i]);
if (sample > maxSample) {
maxSample = sample;
}
}
// Scale samples into the int16 range
for (int64_t i=0; i<size; i++) {
wav_[i] = floatWav[i] * 32767.0f / maxSample;
}
}
virtual void SaveFloatWav(float *floatWav, int64_t size) override;
bool IsLoaded() {
return AM_predictor_ != nullptr && VOC_predictor_ != nullptr;
virtual bool IsLoaded() override {
return acoustic_model_predictor_ != nullptr && vocoder_predictor_ != nullptr;
}
float GetInferenceTime() {
virtual float GetInferenceTime() override {
return inference_time_;
}
@ -170,40 +186,25 @@ public:
return wav_;
}
int GetWavSize() {
virtual int GetWavSize() override {
return wav_.size() * sizeof(WavDataType);
}
void ReleaseWav() {
wav_.clear();
// Get the WAV duration in milliseconds
virtual float GetWavDuration() override {
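// Illustrative example: 48000 bytes of int16_t data are 24000 samples;
// at wav_sample_rate_ = 24000 Hz that is 1000 ms of audio.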
return static_cast<float>(GetWavSize()) / sizeof(WavDataType) / static_cast<float>(wav_sample_rate_) * 1000;
}
struct WavHeader {
// RIFF header
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t size = 0;
char wave[4] = {'W', 'A', 'V', 'E'};
// FMT header
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_size = 16;
uint16_t audio_format = 1; // 1 = integer PCM, 3 = IEEE float
uint16_t num_channels = 1;
// If the playback speed or pitch sounds wrong, adjust the sample rate
// Common sample rates: 16000, 24000, 32000, 44100, 48000, 96000
uint32_t sample_rate = 24000;
uint32_t byte_rate = 64000;
uint16_t block_align = 2;
uint16_t bits_per_sample = sizeof(WavDataType) * 8;
// Get the RTF (synthesis time / audio duration)
virtual float GetRTF() override {
return GetInferenceTime() / GetWavDuration();
}
// DATA header
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size = 0;
};
virtual void ReleaseWav() override {
wav_.clear();
}
bool WriteWavToFile(const std::string &wavPath) {
virtual bool WriteWavToFile(const std::string &wavPath) override {
std::ofstream fout(wavPath, std::ios::binary);
if (!fout.is_open()) {
return false;
@ -211,8 +212,10 @@ public:
// Write the WAV header
WavHeader header;
header.audio_format = GetWavAudioFormat();
header.data_size = GetWavSize();
header.size = sizeof(header) - 8 + header.data_size;
header.sample_rate = wav_sample_rate_;
header.byte_rate = header.sample_rate * header.num_channels * header.bits_per_sample / 8;
header.block_align = header.num_channels * header.bits_per_sample / 8;
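// Illustrative example: 24000 Hz, mono, 16-bit PCM gives
// byte_rate = 24000 * 1 * 16 / 8 = 48000 and block_align = 1 * 16 / 8 = 2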
fout.write(reinterpret_cast<const char*>(&header), sizeof(header));
@ -224,9 +227,80 @@ public:
return true;
}
private:
protected:
struct WavHeader {
// RIFF header
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t size = 0;
char wave[4] = {'W', 'A', 'V', 'E'};
// FMT header
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_size = 16;
uint16_t audio_format = 0;
uint16_t num_channels = 1;
uint32_t sample_rate = 0;
uint32_t byte_rate = 0;
uint16_t block_align = 0;
uint16_t bits_per_sample = sizeof(WavDataType) * 8;
// DATA header
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size = 0;
};
enum WavAudioFormat {
WAV_FORMAT_16BIT_PCM = 1, // 16-bit PCM format
WAV_FORMAT_32BIT_FLOAT = 3 // 32-bit IEEE float format
};
protected:
// The return value is selected by template specialization on WavDataType
inline uint16_t GetWavAudioFormat();
inline float Abs(float number) {
return (number < 0) ? -number : number;
}
protected:
float inference_time_ = 0;
std::shared_ptr<PaddlePredictor> AM_predictor_ = nullptr;
std::shared_ptr<PaddlePredictor> VOC_predictor_ = nullptr;
uint32_t wav_sample_rate_ = 0;
std::vector<WavDataType> wav_;
std::shared_ptr<PaddlePredictor> acoustic_model_predictor_ = nullptr;
std::shared_ptr<PaddlePredictor> vocoder_predictor_ = nullptr;
};
template<>
uint16_t Predictor<int16_t>::GetWavAudioFormat() {
return Predictor::WAV_FORMAT_16BIT_PCM;
}
template<>
uint16_t Predictor<float>::GetWavAudioFormat() {
return Predictor::WAV_FORMAT_32BIT_FLOAT;
}
// Save the WAV as 16-bit PCM
template<>
void Predictor<int16_t>::SaveFloatWav(float *floatWav, int64_t size) {
wav_.resize(size);
float maxSample = 0.01;
// Find the maximum absolute sample value
for (int64_t i=0; i<size; i++) {
float sample = Abs(floatWav[i]);
if (sample > maxSample) {
maxSample = sample;
}
}
// Scale samples into the int16 range
for (int64_t i=0; i<size; i++) {
wav_[i] = floatWav[i] * 32767.0f / maxSample;
}
}
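// Note: the scaling above peak-normalizes the audio: the largest absolute sample
// (clamped to at least 0.01) is mapped to full scale (+/-32767).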
// Save the WAV as 32-bit IEEE float
template<>
void Predictor<float>::SaveFloatWav(float *floatWav, int64_t size) {
wav_.resize(size);
std::copy_n(floatWav, size, wav_.data());
}

@ -0,0 +1 @@
../../TTSCppFrontend/

@ -1,72 +1,128 @@
#include <cstdlib>
#include <iostream>
#include <memory>
#include "paddle_api.h"
#include <string>
#include <map>
#include <glog/logging.h>
#include <gflags/gflags.h>
#include <paddle_api.h>
#include <front/front_interface.h>
#include "Predictor.hpp"
using namespace paddle::lite_api;
std::vector<std::vector<int64_t>> sentencesToChoose = {
// 009901 昨日,这名“伤者”与医生全部被警方依法刑事拘留。
{261, 231, 175, 116, 179, 262, 44, 154, 126, 177, 19, 262, 42, 241, 72, 177, 56, 174, 245, 37, 186, 37, 49, 151, 127, 69, 19, 179, 72, 69, 4, 260, 126, 177, 116, 151, 239, 153, 141},
// 009902 钱伟长想到上海来办学校是经过深思熟虑的。
{174, 83, 213, 39, 20, 260, 89, 40, 30, 177, 22, 71, 9, 153, 8, 37, 17, 260, 251, 260, 99, 179, 177, 116, 151, 125, 70, 233, 177, 51, 176, 108, 177, 184, 153, 242, 40, 45},
// 009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。
{182, 2, 151, 85, 232, 73, 151, 123, 154, 52, 151, 143, 154, 5, 179, 39, 113, 69, 17, 177, 114, 105, 154, 5, 179, 154, 5, 40, 45, 232, 182, 8, 37, 186, 174, 74, 182, 168},
// 009904 李述德在离开之前,只说了一句“柱驼杀父亲了”。
{153, 74, 177, 186, 40, 42, 261, 10, 153, 73, 152, 7, 262, 113, 174, 83, 179, 262, 115, 177, 230, 153, 45, 73, 151, 242, 180, 262, 186, 182, 231, 177, 2, 69, 186, 174, 124, 153, 45},
// 009905 这种车票和保险单捆绑出售属于重复性购买。
{262, 44, 262, 163, 39, 41, 173, 99, 71, 42, 37, 28, 260, 84, 40, 14, 179, 152, 220, 37, 21, 39, 183, 177, 170, 179, 177, 185, 240, 39, 162, 69, 186, 260, 128, 70, 170, 154, 9},
// 009906 戴佩妮的男友西米露接唱情歌,让她非常开心。
{40, 10, 173, 49, 155, 72, 40, 45, 155, 15, 142, 260, 72, 154, 74, 153, 186, 179, 151, 103, 39, 22, 174, 126, 70, 41, 179, 175, 22, 182, 2, 69, 46, 39, 20, 152, 7, 260, 120},
// 009907 观大势、谋大局、出大策始终是该院的办院方针。
{70, 199, 40, 5, 177, 116, 154, 168, 40, 5, 151, 240, 179, 39, 183, 40, 5, 38, 44, 179, 177, 115, 262, 161, 177, 116, 70, 7, 247, 40, 45, 37, 17, 247, 69, 19, 262, 51},
// 009908 他们骑着摩托回家,正好为农忙时的父母帮忙。
{182, 2, 154, 55, 174, 73, 262, 45, 154, 157, 182, 230, 71, 212, 151, 77, 180, 262, 59, 71, 29, 214, 155, 162, 154, 20, 177, 114, 40, 45, 69, 186, 154, 185, 37, 19, 154, 20},
// 009909 但是因为还没到退休年龄,只能掰着指头捱日子。
{40, 17, 177, 116, 120, 214, 71, 8, 154, 47, 40, 30, 182, 214, 260, 140, 155, 83, 153, 126, 180, 262, 115, 155, 57, 37, 7, 262, 45, 262, 115, 182, 171, 8, 175, 116, 261, 112},
// 009910 这几天雨水不断,人们恨不得待在家里不出门。
{262, 44, 151, 74, 182, 82, 240, 177, 213, 37, 184, 40, 202, 180, 175, 52, 154, 55, 71, 54, 37, 186, 40, 42, 40, 7, 261, 10, 151, 77, 153, 74, 37, 186, 39, 183, 154, 52},
};
void usage(const char *binName) {
std::cerr << "Usage:" << std::endl
<< "\t" << binName << " <AM-model-path> <VOC-model-path> <sentences-index:1-10> <output-wav-path>" << std::endl;
}
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized (Chinese only. English will crash the program.)");
DEFINE_string(front_conf, "./front.conf", "Front configuration file");
DEFINE_string(acoustic_model, "./models/cpu/fastspeech2_csmsc_arm.nb", "Acoustic model .nb file");
DEFINE_string(vocoder, "./models/cpu/mb_melgan_csmsc_arm.nb", "Vocoder .nb file");
DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file");
DEFINE_string(wav_bit_depth, "16", "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)");
DEFINE_string(wav_sample_rate, "24000", "WAV sample rate, should match the output of the vocoder");
DEFINE_string(cpu_thread, "1", "Number of CPU threads");
int main(int argc, char *argv[]) {
if (argc < 5) {
usage(argv[0]);
gflags::ParseCommandLineFlags(&argc, &argv, true);
PredictorInterface *predictor;
if (FLAGS_wav_bit_depth == "16") {
predictor = new Predictor<int16_t>();
} else if (FLAGS_wav_bit_depth == "32") {
predictor = new Predictor<float>();
} else {
LOG(ERROR) << "Unsupported WAV bit depth: " << FLAGS_wav_bit_depth;
return -1;
}
const char *AMModelPath = argv[1];
const char *VOCModelPath = argv[2];
int sentencesIndex = atoi(argv[3]) - 1;
const char *outputWavPath = argv[4];
if (sentencesIndex < 0 || sentencesIndex >= sentencesToChoose.size()) {
std::cerr << "sentences-index out of range" << std::endl;
/////////////////////////// Frontend: text to phonemes ///////////////////////////
// Instantiate the text frontend engine
ppspeech::FrontEngineInterface *front_inst = nullptr;
front_inst = new ppspeech::FrontEngineInterface(FLAGS_front_conf);
if ((!front_inst) || (front_inst->init())) {
LOG(ERROR) << "Creater tts engine failed!";
if (front_inst != nullptr) {
delete front_inst;
}
front_inst = nullptr;
return -1;
}
Predictor predictor;
if (!predictor.Init(AMModelPath, VOCModelPath, 1, "LITE_POWER_HIGH")) {
std::cerr << "predictor init failed" << std::endl;
std::wstring ws_sentence = ppspeech::utf8string2wstring(FLAGS_sentence);
// Convert traditional Chinese to simplified Chinese
std::wstring sentence_simp;
front_inst->Trand2Simp(ws_sentence, sentence_simp);
ws_sentence = sentence_simp;
std::string s_sentence;
std::vector<std::wstring> sentence_part;
std::vector<int> phoneids = {};
std::vector<int> toneids = {};
// Split the sentence by punctuation
LOG(INFO) << "Start to segment sentences by punctuation";
front_inst->SplitByPunc(ws_sentence, sentence_part);
LOG(INFO) << "Segment sentences through punctuation successfully";
// Get the phoneme ids of each split sentence
LOG(INFO) << "Start to get the phoneme and tone id sequence of each sentence";
for(int i = 0; i < sentence_part.size(); i++) {
LOG(INFO) << "Raw sentence is: " << ppspeech::wstring2utf8string(sentence_part[i]);
front_inst->SentenceNormalize(sentence_part[i]);
s_sentence = ppspeech::wstring2utf8string(sentence_part[i]);
LOG(INFO) << "After normalization sentence is: " << s_sentence;
if (0 != front_inst->GetSentenceIds(s_sentence, phoneids, toneids)) {
LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
return -1;
}
}
LOG(INFO) << "The phoneids of the sentence is: " << limonp::Join(phoneids.begin(), phoneids.end(), " ");
LOG(INFO) << "The toneids of the sentence is: " << limonp::Join(toneids.begin(), toneids.end(), " ");
LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
/////////////////////////// Backend: phonemes to audio ///////////////////////////
// The WAV sample rate must match the model output
// If the playback speed or pitch sounds wrong, adjust the sample rate
// Common sample rates: 16000, 24000, 32000, 44100, 48000, 96000
const uint32_t wavSampleRate = std::stoul(FLAGS_wav_sample_rate);
// Number of CPU threads
const int cpuThreadNum = std::stol(FLAGS_cpu_thread);
// CPU power mode
const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH;
if (!predictor->Init(FLAGS_acoustic_model, FLAGS_vocoder, cpuPowerMode, cpuThreadNum, wavSampleRate)) {
LOG(ERROR) << "predictor init failed" << std::endl;
return -1;
}
if (!predictor.RunModel(sentencesToChoose[sentencesIndex])) {
std::cerr << "predictor run model failed" << std::endl;
std::vector<int64_t> phones(phoneids.size());
std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { return static_cast<int64_t>(x); });
if (!predictor->RunModel(phones)) {
LOG(ERROR) << "predictor run model failed" << std::endl;
return -1;
}
std::cout << "Inference time: " << predictor.GetInferenceTime() << " ms, "
<< "WAV size (without header): " << predictor.GetWavSize() << " bytes" << std::endl;
LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, "
<< "WAV size (without header): " << predictor->GetWavSize() << " bytes, "
<< "WAV duration: " << predictor->GetWavDuration() << " ms, "
<< "RTF: " << predictor->GetRTF() << std::endl;
if (!predictor.WriteWavToFile(outputWavPath)) {
std::cerr << "write wav file failed" << std::endl;
if (!predictor->WriteWavToFile(FLAGS_output_wav)) {
LOG(ERROR) << "write wav file failed" << std::endl;
return -1;
}
delete predictor;
return 0;
}

@ -0,0 +1 @@
TTSCppFrontend/third-party

@ -0,0 +1,2 @@
build/
dict/

@ -0,0 +1,63 @@
cmake_minimum_required(VERSION 3.10)
project(paddlespeech_tts_cpp)
########## Global Options ##########
option(WITH_FRONT_DEMO "Build front demo" ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(ABSL_PROPAGATE_CXX_STD ON)
########## Dependencies ##########
set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/third-party/build/lib/pkgconfig:${CMAKE_SOURCE_DIR}/third-party/build/lib64/pkgconfig")
find_package(PkgConfig REQUIRED)
# It is hard to load xxx-config.cmake in a custom location, so use pkgconfig instead.
pkg_check_modules(ABSL REQUIRED absl_strings IMPORTED_TARGET)
pkg_check_modules(GFLAGS REQUIRED gflags IMPORTED_TARGET)
pkg_check_modules(GLOG REQUIRED libglog IMPORTED_TARGET)
# load header-only libraries
include_directories(
${CMAKE_SOURCE_DIR}/third-party/build/src/cppjieba/include
${CMAKE_SOURCE_DIR}/third-party/build/src/limonp/include
)
find_package(Threads REQUIRED)
########## paddlespeech_tts_front ##########
include_directories(src)
file(GLOB FRONT_SOURCES
./src/base/*.cpp
./src/front/*.cpp
)
add_library(paddlespeech_tts_front STATIC ${FRONT_SOURCES})
target_link_libraries(
paddlespeech_tts_front
PUBLIC
PkgConfig::GFLAGS
PkgConfig::GLOG
PkgConfig::ABSL
Threads::Threads
)
########## tts_front_demo ##########
if (WITH_FRONT_DEMO)
file(GLOB FRONT_DEMO_SOURCES front_demo/*.cpp)
add_executable(tts_front_demo ${FRONT_DEMO_SOURCES})
target_include_directories(tts_front_demo PRIVATE ./front_demo)
target_link_libraries(tts_front_demo PRIVATE paddlespeech_tts_front)
endif (WITH_FRONT_DEMO)

@ -0,0 +1,55 @@
# PaddleSpeech TTS CPP Frontend
A TTS frontend that implements text-to-phoneme conversion.
Currently it only supports Chinese; any English word will crash the demo.
## Install Build Tools
```
# Ubuntu
sudo apt install build-essential cmake pkg-config
# CentOS
sudo yum groupinstall "Development Tools"
sudo yum install cmake
```
If your CMake version is too old, you can download a precompiled newer version from https://cmake.org/download/
## Build
```
# Build with all CPU cores
./build.sh
# Build with 1 core
./build.sh -j1
```
Dependent libraries will be automatically downloaded to the `third-party/build` folder.
If the download speed is too slow, you can open [third-party/CMakeLists.txt](third-party/CMakeLists.txt) and modify `GIT_REPOSITORY` URLs.
## Download dictionary files
```
./download.sh
```
## Run
```
./run_front_demo.sh
./run_front_demo.sh --help
./run_front_demo.sh --sentence "这是语音合成服务的文本前端,用于将文本转换为音素序号数组。"
./run_front_demo.sh --front_conf ./front_demo/front.conf --sentence "你还需要一个语音合成后端才能将其转换为实际的声音。"
```
## Clean
```
./clean.sh
```
The folders `front_demo/dict`, `build` and `third-party/build` will be deleted.

@ -0,0 +1,20 @@
#!/bin/bash
set -e
set -x
cd "$(dirname "$(realpath "$0")")"
cd ./third-party
mkdir -p build
cd build
cmake ..
if [ "$*" = "" ]; then
make -j$(nproc)
else
make "$@"
fi
echo "Done."

@ -0,0 +1,21 @@
#!/bin/bash
set -e
set -x
cd "$(dirname "$(realpath "$0")")"
echo "************* Download & Build Dependencies *************"
./build-depends.sh "$@"
echo "************* Build Front Lib and Demo *************"
mkdir -p ./build
cd ./build
cmake ..
if [ "$*" = "" ]; then
make -j$(nproc)
else
make "$@"
fi
echo "Done."

@ -0,0 +1,10 @@
#!/bin/bash
set -e
set -x
cd "$(dirname "$(realpath "$0")")"
rm -rf "./front_demo/dict"
rm -rf "./build"
rm -rf "./third-party/build"
echo "Done."

@ -0,0 +1,62 @@
#!/bin/bash
set -e
cd "$(dirname "$(realpath "$0")")"
download() {
file="$1"
url="$2"
md5="$3"
dir="$4"
cd "$dir"
if [ -f "$file" ] && [ "$(md5sum "$file" | awk '{ print $1 }')" = "$md5" ]; then
echo "File $file (MD5: $md5) has been downloaded."
else
echo "Downloading $file..."
wget -O "$file" "$url"
# MD5 verify
fileMd5="$(md5sum "$file" | awk '{ print $1 }')"
if [ "$fileMd5" == "$md5" ]; then
echo "File $file (MD5: $md5) has been downloaded."
else
echo "MD5 mismatch, file may be corrupt"
echo "$file MD5: $fileMd5, it should be $md5"
fi
fi
echo "Extracting $file..."
echo '-----------------------'
tar -vxf "$file"
echo '======================='
}
########################################
DIST_DIR="$PWD/front_demo/dict"
mkdir -p "$DIST_DIR"
download 'fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \
'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \
'7bf1bab1737375fa123c413eb429c573' \
"$DIST_DIR"
download 'speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \
'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \
'0b7754b21f324789aef469c61f4d5b8f' \
"$DIST_DIR"
download 'jieba.tar.gz' \
'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/jieba.tar.gz' \
'6d30f426bd8c0025110a483f051315ca' \
"$DIST_DIR"
download 'tranditional_to_simplified.tar.gz' \
'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/tranditional_to_simplified.tar.gz' \
'258f5b59d5ebfe96d02007ca1d274a7f' \
"$DIST_DIR"
echo "Done."

@ -0,0 +1,21 @@
# jieba conf
--jieba_dict_path=./front_demo/dict/jieba/jieba.dict.utf8
--jieba_hmm_path=./front_demo/dict/jieba/hmm_model.utf8
--jieba_user_dict_path=./front_demo/dict/jieba/user.dict.utf8
--jieba_idf_path=./front_demo/dict/jieba/idf.utf8
--jieba_stop_word_path=./front_demo/dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--word2phone_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--word2phone_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
# dict of tranditional_to_simplified
--trand2simpd_path=./front_demo/dict/tranditional_to_simplified/trand2simp.txt

@ -0,0 +1,65 @@
#include <string>
//#include "utils/dir_utils.h"
#include "front/front_interface.h"
#include <glog/logging.h>
#include <gflags/gflags.h>
#include <map>
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
//DEFINE_string(seperate_tone, "true", "If true, get phoneids and tonesid");
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
// Instantiate the text frontend engine
ppspeech::FrontEngineInterface *front_inst = nullptr;
front_inst = new ppspeech::FrontEngineInterface(FLAGS_front_conf);
if ((!front_inst) || (front_inst->init())) {
LOG(ERROR) << "Creater tts engine failed!";
if (front_inst != nullptr) {
delete front_inst;
}
front_inst = nullptr;
return -1;
}
std::wstring ws_sentence = ppspeech::utf8string2wstring(FLAGS_sentence);
// Convert traditional Chinese to simplified Chinese
std::wstring sentence_simp;
front_inst->Trand2Simp(ws_sentence, sentence_simp);
ws_sentence = sentence_simp;
std::string s_sentence;
std::vector<std::wstring> sentence_part;
std::vector<int> phoneids = {};
std::vector<int> toneids = {};
// Split the sentence by punctuation
LOG(INFO) << "Start to segment sentences by punctuation";
front_inst->SplitByPunc(ws_sentence, sentence_part);
LOG(INFO) << "Segment sentences through punctuation successfully";
// Get the phoneme ids of each split sentence
LOG(INFO) << "Start to get the phoneme and tone id sequence of each sentence";
for(int i = 0; i < sentence_part.size(); i++) {
LOG(INFO) << "Raw sentence is: " << ppspeech::wstring2utf8string(sentence_part[i]);
front_inst->SentenceNormalize(sentence_part[i]);
s_sentence = ppspeech::wstring2utf8string(sentence_part[i]);
LOG(INFO) << "After normalization sentence is: " << s_sentence;
if (0 != front_inst->GetSentenceIds(s_sentence, phoneids, toneids)) {
LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
return -1;
}
}
LOG(INFO) << "The phoneids of the sentence is: " << limonp::Join(phoneids.begin(), phoneids.end(), " ");
LOG(INFO) << "The toneids of the sentence is: " << limonp::Join(toneids.begin(), toneids.end(), " ");
LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
return EXIT_SUCCESS;
}

@ -0,0 +1,87 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
########################################################################
#
# Copyright 2021 liangyunming(liangyunming@baidu.com)
#
# Execute the script when PaddleSpeech has been installed
# PaddleSpeech: https://github.com/PaddlePaddle/PaddleSpeech
########################################################################
import argparse
import configparser
from paddlespeech.t2s.frontend.zh_frontend import Frontend
def get_phone(frontend, word, merge_sentences=True, print_info=False, robot=False, get_tone_ids=False):
phonemes = frontend.get_phonemes(word, merge_sentences, print_info, robot)
# Some optimizations
phones, tones = frontend._get_phone_tone(phonemes[0], get_tone_ids)
#print(type(phones), phones)
#print(type(tones), tones)
return phones, tones
def gen_word2phone_dict(frontend, jieba_words_dict, word2phone_dict, get_tone=False):
with open(jieba_words_dict, "r") as f1, open(word2phone_dict, "w+") as f2:
for line in f1.readlines():
word = line.split(" ")[0]
phone, tone = get_phone(frontend, word, get_tone_ids=get_tone)
phone_str = ""
if tone:
assert(len(phone) == len(tone))
for i in range(len(tone)):
phone_tone = phone[i] + tone[i]
phone_str += (" " + phone_tone)
phone_str = phone_str.strip("sp0").strip(" ")
else:
for x in phone:
phone_str += (" " + x)
phone_str = phone_str.strip("sp").strip(" ")
print(phone_str)
f2.write(word + " " + phone_str + "\n")
print("Generate word2phone dict successfully.")
def main():
parser = argparse.ArgumentParser(
description="Generate dictionary")
parser.add_argument(
"--config", type=str, default="./config.ini", help="config file.")
parser.add_argument(
"--am_type", type=str, default="fastspeech2", help="fastspeech2 or speedyspeech")
args = parser.parse_args()
# Read config
cf = configparser.ConfigParser()
cf.read(args.config)
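# Expected config.ini layout (paths are illustrative; keys match the cf.get() calls below):
#   [jieba]
#   jieba_words_dict = ./dict/jieba/jieba.dict.utf8
#   [fastspeech2]
#   phone2id_dict   = ./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
#   word2phone_dict = ./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
#   [speedyspeech]
#   phone2id_dict   = ./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#   tone2id_dict    = ./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
#   word2phone_dict = ./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict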
jieba_words_dict_file = cf.get("jieba", "jieba_words_dict") # get words dict
am_type = args.am_type
if(am_type == "fastspeech2"):
phone2id_dict_file = cf.get(am_type, "phone2id_dict")
word2phone_dict_file = cf.get(am_type, "word2phone_dict")
frontend = Frontend(phone_vocab_path=phone2id_dict_file)
print("frontend done!")
gen_word2phone_dict(frontend, jieba_words_dict_file, word2phone_dict_file, get_tone=False)
elif(am_type == "speedyspeech"):
phone2id_dict_file = cf.get(am_type, "phone2id_dict")
tone2id_dict_file = cf.get(am_type, "tone2id_dict")
word2phone_dict_file = cf.get(am_type, "word2phone_dict")
frontend = Frontend(phone_vocab_path=phone2id_dict_file, tone_vocab_path=tone2id_dict_file)
print("frontend done!")
gen_word2phone_dict(frontend, jieba_words_dict_file, word2phone_dict_file, get_tone=True)
else:
print("Please set correct am type, fastspeech2 or speedyspeech.")
if __name__ == "__main__":
main()

@ -0,0 +1,22 @@
#from parakeet.frontend.vocab import Vocab
PHONESFILE = "./dict/phones.txt"
PHONES_ID_FILE = "./dict/phonesid.dict"
TONESFILE = "./dict/tones.txt"
TONES_ID_FILE = "./dict/tonesid.dict"
def GenIdFile(file, idfile):
id = 2
with open(file, 'r') as f1, open(idfile, "w+") as f2:
f2.write("<pad> 0\n")
f2.write("<unk> 1\n")
for line in f1.readlines():
phone = line.strip()
print(phone + " " + str(id) + "\n")
f2.write(phone + " " + str(id) + "\n")
id += 1
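# Illustrative output format of the generated id file:
#   <pad> 0
#   <unk> 1
#   <first symbol in the input file> 2
#   ...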
if __name__ == "__main__":
GenIdFile(PHONESFILE, PHONES_ID_FILE)
GenIdFile(TONESFILE, TONES_ID_FILE)

@ -0,0 +1,37 @@
from pypinyin import lazy_pinyin, Style
import re
worddict = "./dict/jieba_part.dict.utf8"
newdict = "./dict/word_phones.dict"
def GenPhones(initials, finals, seperate=True):
phones = []
for c, v in zip(initials, finals):
if re.match(r'i\d', v):
if c in ['z', 'c', 's']:
v = re.sub('i', 'ii', v)
elif c in ['zh', 'ch', 'sh', 'r']:
v = re.sub('i', 'iii', v)
if c:
if seperate == True:
phones.append(c + '0')
elif seperate == False:
phones.append(c)
else:
print("Not sure whether phone and tone need to be separated")
if v:
phones.append(v)
return phones
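# Illustrative example (seperate=True): for "四", lazy_pinyin yields initials ['s'] and finals ['i4'];
# since 's' is in ['z', 'c', 's'], 'i4' becomes 'ii4', so GenPhones returns ['s0', 'ii4'].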
with open(worddict, "r") as f1, open(newdict, "w+") as f2:
for line in f1.readlines():
word = line.split(" ")[0]
initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
phones = GenPhones(initials, finals, True)
temp = " ".join(phones)
f2.write(word + " " + temp + "\n")

@ -0,0 +1,7 @@
#!/bin/bash
set -e
set -x
cd "$(dirname "$(realpath "$0")")"
./build/tts_front_demo "$@"

@ -0,0 +1,18 @@
#include "base/type_conv.h"
namespace ppspeech {
// wstring to string
std::string wstring2utf8string(const std::wstring& str)
{
static std::wstring_convert<std::codecvt_utf8<wchar_t> > strCnv;
return strCnv.to_bytes(str);
}
// string to wstring
std::wstring utf8string2wstring(const std::string& str)
{
static std::wstring_convert< std::codecvt_utf8<wchar_t> > strCnv;
return strCnv.from_bytes(str);
}
}

@ -0,0 +1,18 @@
#ifndef BASE_TYPE_CONVC_H
#define BASE_TYPE_CONVC_H
#include <string>
#include <locale>
#include <codecvt>
namespace ppspeech {
// wstring to string
std::string wstring2utf8string(const std::wstring& str);
// string to wstring
std::wstring utf8string2wstring(const std::string& str);
}
#endif // BASE_TYPE_CONVC_H

@ -0,0 +1,933 @@
#include "front/front_interface.h"
namespace ppspeech {
int FrontEngineInterface::init() {
if (_initialed) {
return 0;
}
if (0 != ReadConfFile()) {
LOG(ERROR) << "Read front conf file failed";
return -1;
}
_jieba = new cppjieba::Jieba(_jieba_dict_path, _jieba_hmm_path, _jieba_user_dict_path,
_jieba_idf_path, _jieba_stop_word_path);
_punc = {"", "", "", "", "", "", "~", "",
",", ".", "?", "!", ":", ";", "/", "\\"};
_punc_omit = {"", "", "\"", "\""};
// Words that require erhua (rhotacization) handling
must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"};
not_erhua = {
"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
"拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿",
"流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿",
"孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
"狗儿"
};
must_not_neural_tone_words = {"男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子"};
// Words that require neutral-tone handling
must_neural_tone_words = {
"麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨", "风筝",
"难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠", "钥匙", "里脊",
"里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么", "这个", "运气", "过去",
"软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主", "豆腐", "讲究", "记性", "记号",
"认识", "规矩", "见识", "裁缝", "补丁", "衣裳", "衣服", "衙门", "街坊", "行李", "行当",
"蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻",
"舒服", "舒坦", "舌头", "自在", "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂",
"胡萝", "胡琴", "胡同", "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆",
"老头", "老太", "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂",
"精神", "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿",
"窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗", "砚台",
"码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛", "相声", "盘算",
"白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意", "甘蔗", "琵琶", "琢磨",
"琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", "牲口", "牙碜", "牌楼", "爽快",
"爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", "炊帚", "灯笼", "火候", "漂亮", "滑溜",
"溜达", "温和", "清楚", "消息", "浪头", "活泼", "比方", "正经", "欺负", "模糊", "槟榔",
"棺材", "棒槌", "棉花", "核桃", "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事",
"木头", "木匠", "朋友", "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾",
"收成", "提防", "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼",
"抬举", "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实",
"扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么", "念头",
"念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通", "应酬", "庄稼",
"干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", "差事", "工夫", "岁数",
"屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", "对付", "寡妇", "家伙", "客气",
"实在", "官司", "学问", "学生", "字号", "嫁妆", "媳妇", "媒人", "婆家", "娘家", "委屈",
"姑娘", "姐夫", "妯娌", "妥当", "妖精", "奴才", "女婿", "头发", "太阳", "大爷", "大方",
"大意", "大夫", "多少", "多么", "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴",
"嘱咐", "嘟囔", "嘀咕", "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦",
"咳嗽", "和尚", "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝",
"叫唤", "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹",
"功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", "出息",
"凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", "便宜", "使唤",
"佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么", "人家", "亲戚", "亲家",
"交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气", "两口", "东西", "东家", "世故",
"不由", "不在", "下水", "下巴", "上头", "上司", "丈夫", "丈人", "一辈", "那个", "菩萨",
"父亲", "母亲", "咕噜", "邋遢", "费用", "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅",
"幸福", "熟悉", "计划", "扑腾", "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱",
"凤凰", "拖沓", "寒碜", "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱",
"扫把", "惦记"
};
// Build the word-to-phoneme dictionary
if (0 != GenDict(_word2phone_path, word_phone_map)) {
LOG(ERROR) << "Generate word2phone dict failed";
return -1;
}
// Build the phoneme-to-id dictionary
if (0 != GenDict(_phone2id_path, phone_id_map)) {
LOG(ERROR) << "Generate phone2id dict failed";
return -1;
}
// Build the tone-to-id dictionary
if (_seperate_tone == "true") {
if (0 != GenDict(_tone2id_path, tone_id_map)) {
LOG(ERROR) << "Generate tone2id dict failed";
return -1;
}
}
// Build the traditional-to-simplified dictionary
if (0 != GenDict(_trand2simp_path, trand_simp_map)) {
LOG(ERROR) << "Generate trand2simp dict failed";
return -1;
}
_initialed = true;
return 0;
}
int FrontEngineInterface::ReadConfFile() {
std::ifstream is(_conf_file.c_str(), std::ifstream::in);
if (!is.good()) {
LOG(ERROR) << "Cannot open config file: " << _conf_file;
return -1;
}
std::string line, key, value;
while (std::getline(is, line)) {
if (line.substr(0, 2) == "--") {
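// e.g. the line "--jieba_dict_path=./dict/jieba/jieba.dict.utf8" yields
// key "jieba_dict_path" and value "./dict/jieba/jieba.dict.utf8"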
size_t pos = line.find_first_of("=", 0);
std::string key = line.substr(2, pos-2);
std::string value = line.substr(pos + 1);
conf_map[key] = value;
LOG(INFO) << "Key: " << key << "; Value: " << value;
}
}
// jieba conf path
_jieba_dict_path = conf_map["jieba_dict_path"];
_jieba_hmm_path = conf_map["jieba_hmm_path"];
_jieba_user_dict_path = conf_map["jieba_user_dict_path"];
_jieba_idf_path = conf_map["jieba_idf_path"];
_jieba_stop_word_path = conf_map["jieba_stop_word_path"];
// dict path
_seperate_tone = conf_map["seperate_tone"];
_word2phone_path = conf_map["word2phone_path"];
_phone2id_path = conf_map["phone2id_path"];
_tone2id_path = conf_map["tone2id_path"];
_trand2simp_path = conf_map["trand2simpd_path"];
return 0;
}
int FrontEngineInterface::Trand2Simp(const std::wstring &sentence, std::wstring &sentence_simp) {
//sentence_simp = sentence;
for(int i = 0; i < sentence.length(); i++) {
std::wstring temp(1, sentence[i]);
std::string sigle_word = ppspeech::wstring2utf8string(temp);
// Check whether the single character is in the traditional-to-simplified dictionary
if(trand_simp_map.find(sigle_word) == trand_simp_map.end()) {
sentence_simp += temp;
} else {
sentence_simp += (ppspeech::utf8string2wstring(trand_simp_map[sigle_word]));
}
}
return 0;
}
int FrontEngineInterface::GenDict(const std::string &dict_file, std::map<std::string, std::string> &map) {
std::ifstream is(dict_file.c_str(), std::ifstream::in);
if (!is.good()) {
LOG(ERROR) << "Cannot open dict file: " << dict_file;
return -1;
}
std::string line, key, value;
while (std::getline(is, line)) {
size_t pos = line.find_first_of(" ", 0);
key = line.substr(0, pos);
value = line.substr(pos + 1);
map[key] = value;
}
return 0;
}
int FrontEngineInterface::GetSegResult(std::vector<std::pair<std::string, std::string>> &seg,
std::vector<std::string> &seg_words) {
std::vector<std::pair<std::string, std::string>> ::iterator iter;
for(iter=seg.begin(); iter!=seg.end(); iter++) {
seg_words.push_back((*iter).first);
}
return 0;
}
int FrontEngineInterface::GetSentenceIds(const std::string &sentence, std::vector<int> &phoneids, std::vector<int> &toneids) {
std::vector<std::pair<std::string, std::string>> cut_result; // segmentation result: word and part-of-speech pairs
if (0 != Cut(sentence, cut_result)) {
LOG(ERROR) << "Cut sentence: \"" << sentence << "\" failed";
return -1;
}
if (0 != GetWordsIds(cut_result, phoneids, toneids)) {
LOG(ERROR) << "Get words phoneids failed";
return -1;
}
return 0;
}
int FrontEngineInterface::GetWordsIds(const std::vector<std::pair<std::string, std::string>> &cut_result, std::vector<int> &phoneids,
std::vector<int> &toneids) {
std::string word;
std::string pos;
std::vector<std::string> word_initials;
std::vector<std::string> word_finals;
std::string phone;
for(int i = 0; i < cut_result.size(); i++) {
word = cut_result[i].first;
pos = cut_result[i].second;
if (std::find(_punc_omit.begin(), _punc_omit.end(), word) == _punc_omit.end()) { // not an ignorable punctuation mark
word_initials = {};
word_finals = {};
phone = "";
// Check whether the word is in the punctuation set
if (std::find(_punc.begin(), _punc.end(), word) == _punc.end()) { // regular text
// Get the initials and finals of the word
if(0 != GetInitialsFinals(word, word_initials, word_finals)) {
LOG(ERROR) << "Generate the word_initials and word_finals of " << word << " failed";
return -1;
}
// Apply tone sandhi
if(0 != ModifyTone(word, pos, word_finals)) {
LOG(ERROR) << "Failed to modify tone.";
}
// Handle erhua
std::vector<std::vector<std::string>> new_initals_finals = MergeErhua(word_initials, word_finals, word, pos);
word_initials = new_initals_finals[0];
word_finals = new_initals_finals[1];
// Merge initials and finals into phonemes
assert(word_initials.size() == word_finals.size());
std::string temp_phone;
for(int j = 0; j < word_initials.size(); j++) {
if(word_initials[j] != "") {
temp_phone = word_initials[j] + " " + word_finals[j];
} else {
temp_phone = word_finals[j];
}
if(j == 0) {
phone += temp_phone;
} else {
phone += (" " + temp_phone);
}
}
} else { // punctuation
if(_seperate_tone == "true") {
phone = "sp0"; // speedyspeech
} else {
phone = "sp"; // fastspeech2
}
}
// Phonemes to phoneme ids
if(0 != Phone2Phoneid(phone, phoneids, toneids)) {
LOG(ERROR) << "Generate the phone id of " << word << " failed";
return -1;
}
}
}
return 0;
}
int FrontEngineInterface::Cut(const std::string &sentence, std::vector<std::pair<std::string, std::string>> &cut_result) {
std::vector<std::pair<std::string, std::string>> cut_result_jieba;
// Jieba word segmentation
_jieba->Tag(sentence, cut_result_jieba);
// Merge and adjust the segmentation result
if (0 != MergeforModify(cut_result_jieba, cut_result)) {
LOG(ERROR) << "Failed to modify for word segmentation result.";
return -1;
}
return 0;
}
int FrontEngineInterface::GetPhone(const std::string &word, std::string &phone) {
// Check whether the word is in the dictionary; if not, segment it with CutAll
if (word_phone_map.find(word) == word_phone_map.end()) {
std::vector<std::string> wordcut;
_jieba->CutAll(word, wordcut);
phone = word_phone_map[wordcut[0]];
for (int i = 1; i < wordcut.size(); i++) {
phone += (" " + word_phone_map[wordcut[i]]);
}
} else {
phone = word_phone_map[word];
}
return 0;
}
int FrontEngineInterface::Phone2Phoneid(const std::string &phone, std::vector<int> &phoneid, std::vector<int> &toneid) {
std::vector<std::string> phone_vec;
phone_vec = absl::StrSplit(phone, " ");
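// Illustrative example: with _seperate_tone == "true", a token such as "ia1" is split into
// phone "ia" (looked up in phone_id_map) and tone "1" (looked up in tone_id_map);
// otherwise the whole token "ia1" is looked up in phone_id_map directly.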
std::string temp_phone;
for(int i = 0; i < phone_vec.size(); i++) {
temp_phone = phone_vec[i];
if(_seperate_tone == "true") {
phoneid.push_back(atoi((phone_id_map[temp_phone.substr(0, temp_phone.length()-1)]).c_str()));
toneid.push_back(atoi((tone_id_map[temp_phone.substr(temp_phone.length()-1, temp_phone.length())]).c_str()));
}else {
phoneid.push_back(atoi((phone_id_map[temp_phone]).c_str()));
}
}
return 0;
}
// Judge from the finals whether every character in the word is pronounced with the third tone; returns true if so
bool FrontEngineInterface::AllToneThree(const std::vector<std::string> &finals) {
bool flags = true;
for(int i = 0; i < finals.size(); i++) {
if((int)finals[i].back() != 51) { // not the third tone (ASCII 51 == '3')
flags = false;
}
}
return flags;
}
// Check whether the word is a reduplication
bool FrontEngineInterface::IsReduplication(const std::string &word) {
bool flags = false;
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
int len = word_wstr.length();
if(len == 2 && word_wstr[0] == word_wstr[1]){
flags = true;
}
return flags;
}
// Get the initials and finals of a word; word_initials is the list of initials and word_finals the list of finals
int FrontEngineInterface::GetInitialsFinals(const std::string &word, std::vector<std::string> &word_initials, std::vector<std::string> &word_finals) {
std::string phone;
GetPhone(word, phone); // get the phonemes of the word
std::vector<std::string> phone_vec = absl::StrSplit(phone, " ");
// Each character has 1 or 2 phonemes; start is the index of the current character's first phoneme
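// Illustrative example: phones "h ao3 a5" are parsed as initials ["h", ""] and finals ["ao3", "a5"]:
// "h" has no trailing digit, so it is an initial followed by its final "ao3",
// while "a5" alone is a final with an empty initial.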
int start = 0;
while(start < phone_vec.size()) {
if(phone_vec[start] == "sp" || phone_vec[start] == "sp0") {
start += 1;
}
// If the last char is not a digit, or the digit is 0, this token is an initial and the next token is its final
else if(isdigit(phone_vec[start].back()) == 0 || (int)phone_vec[start].back() == 48) {
word_initials.push_back(phone_vec[start]);
word_finals.push_back(phone_vec[start + 1]);
start += 2;
} else {
word_initials.push_back("");
word_finals.push_back(phone_vec[start]);
start += 1;
}
}
assert(word_finals.size() == ppspeech::utf8string2wstring(word).length() && word_finals.size() == word_initials.size());
return 0;
}
// Get the finals of a word
int FrontEngineInterface::GetFinals(const std::string &word, std::vector<std::string> &word_finals) {
std::vector<std::string> word_initials;
if(0 != GetInitialsFinals(word, word_initials, word_finals)) {
LOG(ERROR) << "Failed to get word finals";
return -1;
}
return 0;
}
int FrontEngineInterface::Word2WordVec(const std::string &word, std::vector<std::wstring> &wordvec) {
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
for(int i = 0; i < word_wstr.length(); i++) {
std::wstring word_sigle(1, word_wstr[i]);
wordvec.push_back(word_sigle);
}
return 0;
}
// Split a word further into sub-words, e.g. 小雨伞 --> 小 雨伞 or 小雨 伞
int FrontEngineInterface::SplitWord(const std::string &word, std::vector<std::string> &new_word_vec) {
std::vector<std::string> word_vec;
std::string second_subword;
_jieba->CutForSearch(word, word_vec);
// sort by length in ascending order
std::sort(word_vec.begin(), word_vec.end(), [](std::string a, std::string b ) {return a.size() < b.size();});
std::string first_subword = word_vec[0]; // take the shortest sub-word
int first_begin_idx = word.find_first_of(first_subword);
if(first_begin_idx == 0) {
second_subword = word.substr(first_subword.length());
new_word_vec.push_back(first_subword);
new_word_vec.push_back(second_subword);
} else {
second_subword = word.substr(0, word.length() - first_subword.length());
new_word_vec.push_back(second_subword);
new_word_vec.push_back(first_subword);
}
return 0;
}
//example: 不 一起 --> 不一起
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeBu(std::vector<std::pair<std::string, std::string>> &seg_result) {
std::vector<std::pair<std::string, std::string>> result;
std::string word;
std::string pos;
std::string last_word = "";
for(int i = 0; i < seg_result.size(); i++) {
word = seg_result[i].first;
pos = seg_result[i].second;
if(last_word == "") {
word = last_word + word;
}
if(word != "") {
result.push_back(make_pair(word, pos));
}
last_word = word;
}
if(last_word == "") {
result.push_back(make_pair(last_word, "d"));
last_word = "";
}
return result;
}
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::Mergeyi(std::vector<std::pair<std::string, std::string>> &seg_result) {
std::vector<std::pair<std::string, std::string>> result_temp;
std::string word;
std::string pos;
// function 1 example: 听 一 听 --> 听一听
for(int i = 0; i < seg_result.size(); i++) {
word = seg_result[i].first;
pos = seg_result[i].second;
if((i - 1 >= 0) && (word == "") && (i + 1 < seg_result.size()) &&
(seg_result[i - 1].first == seg_result[i + 1].first) && seg_result[i - 1].second == "v") {
result_temp[i - 1].first = result_temp[i - 1].first + "一" + result_temp[i - 1].first;
} else {
if((i - 2 >= 0) && (seg_result[i - 1].first == "一") && (seg_result[i - 2].first == word) && (pos == "v")) {
continue;
} else{
result_temp.push_back(make_pair(word, pos));
}
}
}
// function 2 example: 一 你 --> 一你
std::vector<std::pair<std::string, std::string>> result = {};
for(int j = 0; j < result_temp.size(); j++) {
word = result_temp[j].first;
pos = result_temp[j].second;
if((result.size() != 0) && (result.back().first == "一")) {
result.back().first = result.back().first + word;
} else {
result.push_back(make_pair(word, pos));
}
}
return result;
}
// example: 你 你 --> 你你
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeReduplication(std::vector<std::pair<std::string, std::string>> &seg_result) {
std::vector<std::pair<std::string, std::string>> result;
std::string word;
std::string pos;
for(int i = 0; i < seg_result.size(); i++) {
word = seg_result[i].first;
pos = seg_result[i].second;
if((result.size() != 0) && (word == result.back().first)) {
result.back().first = result.back().first + seg_result[i].first;
} else {
result.push_back(make_pair(word, pos));
}
}
return result;
}
// the first and the second words are all_tone_three
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThreeTones(std::vector<std::pair<std::string, std::string>> &seg_result) {
std::vector<std::pair<std::string, std::string>> result;
std::string word;
std::string pos;
std::vector<std::vector<std::string>> finals; // array of finals
std::vector<std::string> word_final;
std::vector<bool> merge_last(seg_result.size(), false);
// Check whether the last segment is punctuation; punctuation has no initials or finals
int word_num = seg_result.size() - 1;
if(std::find(_punc.begin(), _punc.end(), seg_result[word_num].first) == _punc.end()){ // the last segment is not punctuation
word_num += 1;
}
// Collect the finals of each segment
for(int i = 0; i < word_num; i++) {
word_final = {};
word = seg_result[i].first;
pos = seg_result[i].second;
if(std::find(_punc_omit.begin(), _punc_omit.end(), word) == _punc_omit.end()) { // not ignorable punctuation, i.e. regular text
if(0 != GetFinals(word, word_final)) {
LOG(ERROR) << "Failed to get the final of word.";
}
}
finals.push_back(word_final);
}
assert(word_num == finals.size());
// Process the segmentation result for third-tone readings
for(int i = 0; i < word_num; i++) {
word = seg_result[i].first;
pos = seg_result[i].second;
if(i - 1 >= 0 && AllToneThree(finals[i - 1]) && AllToneThree(finals[i]) && !merge_last[i - 1]) {
// if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
if(!IsReduplication(seg_result[i - 1].first) &&
(ppspeech::utf8string2wstring(seg_result[i - 1].first)).length() + (ppspeech::utf8string2wstring(word)).length() <= 3) {
result.back().first = result.back().first + seg_result[i].first;
merge_last[i] = true;
} else {
result.push_back(make_pair(word, pos));
}
} else {
result.push_back(make_pair(word, pos));
}
}
// Append the trailing punctuation segment back
if(word_num < seg_result.size()) {
result.push_back(make_pair(seg_result[word_num].first, seg_result[word_num].second));
}
return result;
}
// the last char of first word and the first char of second word is tone_three
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThreeTones2(std::vector<std::pair<std::string, std::string>> &seg_result) {
std::vector<std::pair<std::string, std::string>> result;
std::string word;
std::string pos;
std::vector<std::vector<std::string>> finals; // array of finals
std::vector<std::string> word_final;
std::vector<bool> merge_last(seg_result.size(), false);
// Check whether the last segment is punctuation
int word_num = seg_result.size() - 1;
if(std::find(_punc.begin(), _punc.end(), seg_result[word_num].first) == _punc.end()){ // the last segment is not punctuation
word_num += 1;
}
// Collect the finals of each segment
for(int i = 0; i < word_num; i++) {
word_final = {};
word = seg_result[i].first;
pos = seg_result[i].second;
// If it is regular text, get its finals; skip ignorable punctuation such as quotation marks
if(std::find(_punc_omit.begin(), _punc_omit.end(), word) == _punc_omit.end()) {
if(0 != GetFinals(word, word_final)) {
LOG(ERROR) << "Failed to get the final of word.";
}
}
finals.push_back(word_final);
}
assert(word_num == finals.size());
// Process the segmentation result for third-tone readings
for(int i = 0; i < word_num; i++) {
word = seg_result[i].first;
pos = seg_result[i].second;
if(i - 1 >= 0 && !finals[i - 1].empty() && absl::EndsWith(finals[i - 1].back(), "3") == true &&
!finals[i].empty() && absl::EndsWith(finals[i].front(), "3") == true && !merge_last[i - 1]) {
// if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
if(!IsReduplication(seg_result[i - 1].first) &&
(ppspeech::utf8string2wstring(seg_result[i - 1].first)).length() + ppspeech::utf8string2wstring(word).length() <= 3) {
result.back().first = result.back().first + seg_result[i].first;
merge_last[i] = true;
} else {
result.push_back(make_pair(word, pos));
}
} else {
result.push_back(make_pair(word, pos));
}
}
// Append the trailing punctuation segment back
if(word_num < seg_result.size()) {
result.push_back(make_pair(seg_result[word_num].first, seg_result[word_num].second));
}
return result;
}
// example: 吃饭 儿 --> 吃饭儿
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeEr(std::vector<std::pair<std::string, std::string>> &seg_result) {
std::vector<std::pair<std::string, std::string>> result;
std::string word;
std::string pos;
for(int i = 0; i < seg_result.size(); i++) {
word = seg_result[i].first;
pos = seg_result[i].second;
if((i - 1 >= 0) && (word == "儿")){
result.back().first = result.back().first + seg_result[i].first;
} else {
result.push_back(make_pair(word, pos));
}
}
return result;
}
int FrontEngineInterface::MergeforModify(std::vector<std::pair<std::string, std::string>> &seg_word_type,
std::vector<std::pair<std::string, std::string>> &modify_seg_word_type) {
std::vector<std::string> seg_result;
GetSegResult(seg_word_type, seg_result);
LOG(INFO) << "Before merge, seg result is: " << limonp::Join(seg_result.begin(), seg_result.end(), "/");
modify_seg_word_type = MergeBu(seg_word_type);
modify_seg_word_type = Mergeyi(modify_seg_word_type);
modify_seg_word_type = MergeReduplication(modify_seg_word_type);
modify_seg_word_type = MergeThreeTones(modify_seg_word_type);
modify_seg_word_type = MergeThreeTones2(modify_seg_word_type);
modify_seg_word_type = MergeEr(modify_seg_word_type);
seg_result = {};
GetSegResult(modify_seg_word_type, seg_result);
LOG(INFO) << "After merge, seg result is: " << limonp::Join(seg_result.begin(), seg_result.end(), "/");
return 0;
}
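// Illustrative usage sketch (hypothetical variable names, assuming an initialized
// FrontEngineInterface `front`): the merge passes are driven through MergeforModify
// on a jieba word+POS segmentation, e.g.
//
//   std::vector<std::pair<std::string, std::string>> seg = {
//       {"纸", "n"}, {"老虎", "n"}, {"。", "x"}};
//   std::vector<std::pair<std::string, std::string>> merged;
//   front.MergeforModify(seg, merged);
//   // merged would hold {"纸老虎", "n"}, {"。", "x"}: both words are all tone 3,
//   // so MergeThreeTones joins them before tone sandhi is applied.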
int FrontEngineInterface::BuSandi(const std::string &word, std::vector<std::string> &finals) {
std::wstring bu = L"不";
std::vector<std::wstring> wordvec;
// Split the word into a vector of single characters
if(0 != Word2WordVec(word, wordvec)) {
LOG(ERROR) << "Failed to get word vector";
return -1;
}
// e.g. 看不懂: b u4 --> b u5, replace the last character of the final with 5
if(wordvec.size() == 3 && wordvec[1] == bu) {
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "5");
} else {
// e.g. 不怕: b u4 --> b u2, replace the last character of the final with 2
for(int i = 0; i < wordvec.size(); i++) {
if(wordvec[i] == bu && i + 1 < wordvec.size() &&
absl::EndsWith(finals[i + 1], "4") == true) {
finals[i] = finals[i].replace(finals[i].length() - 1, 1, "2");
}
}
}
return 0;
}
int FrontEngineInterface::YiSandhi(const std::string &word, std::vector<std::string> &finals) {
std::wstring yi = L"一";
std::vector<std::wstring> wordvec;
// Split the word into a vector of single characters
if(0 != Word2WordVec(word, wordvec)) {
LOG(ERROR) << "Failed to get word vector";
return -1;
}
// Case 1: "一" in number sequences, e.g. 一零零, 二一零
std::wstring num_wstr = L"零一二三四六七八九";
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
if(word_wstr.find(yi) != word_wstr.npos && wordvec.back() != yi) {
int flags = 0;
for(int j = 0; j < wordvec.size(); j++) {
if(num_wstr.find(wordvec[j]) == num_wstr.npos) {
flags = -1;
break;
}
}
if(flags == 0) {
return 0;
}
} else if(wordvec.size() == 3 && wordvec[1] == yi && wordvec[0] == wordvec[2]) {
// "一" between reduplication words shold be yi5, e.g. 看一看
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "5");
} else if(wordvec[0] == L"第" && wordvec[1] == yi) { // the word starts with "第一"
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "1");
} else {
for(int i = 0; i < wordvec.size(); i++) {
if(wordvec[i] == yi && i + 1 < wordvec.size()) {
if(absl::EndsWith(finals[i + 1], "4") == true) {
// "一" before tone4 should be yi2, e.g. 一段
finals[i] = finals[i].replace(finals[i].length() - 1, 1, "2");
} else {
// "一" before non-tone4 should be yi4, e.g. 一天
finals[i] = finals[i].replace(finals[i].length() - 1, 1, "4");
}
}
}
}
return 0;
}
int FrontEngineInterface::NeuralSandhi(const std::string &word, const std::string &pos, std::vector<std::string> &finals) {
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
std::vector<std::wstring> wordvec;
// Split the word into a vector of single characters
if(0 != Word2WordVec(word, wordvec)) {
LOG(ERROR) << "Failed to get word vector";
return -1;
}
int word_num = wordvec.size();
assert(word_num == word_wstr.length());
// Case 1: reduplication words for n. and v., e.g. 奶奶, 试试, 旺旺
for(int j = 0; j < wordvec.size(); j++) {
std::string inits = "nva";
if(j - 1 >= 0 && wordvec[j] == wordvec[j - 1] && inits.find(pos[0]) != inits.npos) {
finals[j] = finals[j].replace(finals[j].length() - 1, 1, "5");
}
}
// Case 2: particles and suffixes listed below
std::wstring yuqici = L"吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶";
std::wstring de = L"的地得";
std::wstring le = L"了着过";
std::vector<std::string> le_pos = {"ul", "uz", "ug"};
std::wstring men = L"们子";
std::vector<std::string> men_pos = {"r", "n"};
std::wstring weizhi = L"上下里";
std::vector<std::string> weizhi_pos = {"s", "l", "f"};
std::wstring dong = L"来去";
std::wstring fangxiang = L"上下进出回过起开";
std::wstring ge = L"个";
std::wstring xiushi = L"几有两半多各整每做是零一二三四六七八九";
auto ge_idx = word_wstr.find_first_of(ge); // first position of "个" in the word
if(word_num >= 1 && yuqici.find(wordvec.back()) != yuqici.npos) {
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
} else if(word_num >= 1 && de.find(wordvec.back()) != de.npos) {
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
} else if(word_num == 1 && le.find(wordvec[0]) != le.npos && find(le_pos.begin(), le_pos.end(), pos) != le_pos.end()) {
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
} else if(word_num > 1 && men.find(wordvec.back()) != men.npos && find(men_pos.begin(), men_pos.end(), pos) != men_pos.end()
&& find(must_not_neural_tone_words.begin(), must_not_neural_tone_words.end(), word) != must_not_neural_tone_words.end()) {
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
} else if(word_num > 1 && weizhi.find(wordvec.back()) != weizhi.npos && find(weizhi_pos.begin(), weizhi_pos.end(), pos) != weizhi_pos.end()) {
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
} else if(word_num > 1 && dong.find(wordvec.back()) != dong.npos && fangxiang.find(wordvec[word_num - 2]) != fangxiang.npos) {
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
}
// Case 3: "个" preceded by a numeral or modifier, or the word is just "个"
else if((ge_idx != word_wstr.npos && ge_idx >= 1 && xiushi.find(wordvec[ge_idx - 1]) != xiushi.npos)
|| word_wstr == ge) {
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
} else {
if(find(must_neural_tone_words.begin(), must_neural_tone_words.end(), word) != must_neural_tone_words.end()
|| (word_num >= 2 && find(must_neural_tone_words.begin(), must_neural_tone_words.end(), ppspeech::wstring2utf8string(word_wstr.substr(word_num - 2))) != must_neural_tone_words.end())) {
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
}
}
// Further segmentation: split the long word into shorter pieces
std::vector<std::string> word_list;
if(0 != SplitWord(word, word_list)) {
LOG(ERROR) << "Failed to split word.";
return -1;
}
// Build the corresponding lists of finals
std::vector<std::vector<std::string>> finals_list;
std::vector<std::string> finals_temp;
finals_temp.assign(finals.begin(), finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length());
finals_list.push_back(finals_temp);
finals_temp.assign(finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length(), finals.end());
finals_list.push_back(finals_temp);
finals = {};
for(int i = 0; i < word_list.size(); i++) {
std::wstring temp_wstr = ppspeech::utf8string2wstring(word_list[i]);
if((find(must_neural_tone_words.begin(), must_neural_tone_words.end(), word_list[i]) != must_neural_tone_words.end())
|| (temp_wstr.length() >= 2 && find(must_neural_tone_words.begin(), must_neural_tone_words.end(), ppspeech::wstring2utf8string(temp_wstr.substr(temp_wstr.length() - 2))) != must_neural_tone_words.end())) {
finals_list[i].back() = finals_list[i].back().replace(finals_list[i].back().length() - 1, 1, "5");
}
finals.insert(finals.end(), finals_list[i].begin(), finals_list[i].end());
}
return 0;
}
int FrontEngineInterface::ThreeSandhi(const std::string &word, std::vector<std::string> &finals) {
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
std::vector<std::vector<std::string>> finals_list;
std::vector<std::string> finals_temp;
std::vector<std::wstring> wordvec;
// Split the word into a vector of single characters
if(0 != Word2WordVec(word, wordvec)) {
LOG(ERROR) << "Failed to get word vector";
return -1;
}
int word_num = wordvec.size();
assert(word_num == word_wstr.length());
if(word_num == 2 && AllToneThree(finals)) {
finals[0] = finals[0].replace(finals[0].length() - 1, 1, "2");
} else if(word_num == 3) {
// Further segmentation: split the long word into shorter pieces
std::vector<std::string> word_list;
if(0 != SplitWord(word, word_list)) {
LOG(ERROR) << "Failed to split word.";
return -1;
}
if(AllToneThree(finals)) {
std::wstring temp_wstr = ppspeech::utf8string2wstring(word_list[0]);
//disyllabic + monosyllabic, e.g. 蒙古/包
if(temp_wstr.length() == 2) {
finals[0] = finals[0].replace(finals[0].length() - 1, 1, "2");
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "2");
} else if(temp_wstr.length() == 1) { //monosyllabic + disyllabic, e.g. 纸/老虎
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "2");
}
} else {
// Build the corresponding lists of finals
finals_temp = {};
finals_list = {};
finals_temp.assign(finals.begin(), finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length());
finals_list.push_back(finals_temp);
finals_temp.assign(finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length(), finals.end());
finals_list.push_back(finals_temp);
finals = {};
for(int i = 0; i < finals_list.size(); i++) {
// e.g. 所有/人
if(AllToneThree(finals_list[i]) && finals_list[i].size() == 2) {
finals_list[i][0] = finals_list[i][0].replace(finals_list[i][0].length() - 1, 1, "2");
} else if(i == 1 && !(AllToneThree(finals_list[i])) && absl::EndsWith(finals_list[i][0], "3") == true
&& absl::EndsWith(finals_list[0].back(), "3") == true) {
finals_list[0].back() = finals_list[0].back().replace(finals_list[0].back().length() - 1, 1, "2");
}
}
finals.insert(finals.end(), finals_list[0].begin(), finals_list[0].end());
finals.insert(finals.end(), finals_list[1].begin(), finals_list[1].end());
}
} else if(word_num == 4) { // split the four-character idiom into two two-character words
// Build the corresponding lists of finals
finals_temp = {};
finals_list = {};
finals_temp.assign(finals.begin(), finals.begin() + 2);
finals_list.push_back(finals_temp);
finals_temp.assign(finals.begin() + 2, finals.end());
finals_list.push_back(finals_temp);
finals = {};
for(int j = 0; j < finals_list.size(); j++){
if(AllToneThree(finals_list[j])) {
finals_list[j][0] = finals_list[j][0].replace(finals_list[j][0].length() - 1, 1, "2");
}
finals.insert(finals.end(), finals_list[j].begin(), finals_list[j].end());
}
}
return 0;
}
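// Worked example (illustrative): for a three-character word such as 纸老虎 whose finals
// all end in "3", SplitWord gives the monosyllabic + disyllabic split 纸/老虎, so the
// branch above rewrites the tone digit of finals[1] from "3" to "2" and the word is
// read zhi3 lao2 hu3.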
int FrontEngineInterface::ModifyTone(const std::string &word, const std::string &pos, std::vector<std::string> &finals) {
if((0 != BuSandi(word, finals)) || (0 != YiSandhi(word, finals)) ||
(0 != NeuralSandhi(word, pos, finals)) || (0 != ThreeSandhi(word,finals))) {
LOG(ERROR) << "Failed to modify tone of the word: " << word;
return -1;
}
return 0;
}
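// Illustrative usage sketch (hypothetical variable names and POS tag): ModifyTone is
// applied per segmented word after its finals have been looked up, e.g.
//
//   std::vector<std::string> finals;
//   front.GetFinals("不怕", finals);        // e.g. {"u4", "a4"}
//   front.ModifyTone("不怕", "v", finals);  // BuSandi rewrites "u4" to "u2": 不 is read bu2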
std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(const std::vector<std::string> &initials, const std::vector<std::string> &finals, const std::string &word, const std::string &pos) {
std::vector<std::string> new_initials = {};
std::vector<std::string> new_finals = {};
std::vector<std::vector<std::string>> new_initials_finals;
std::vector<std::string> specified_pos = {"a", "j", "nr"};
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
std::vector<std::wstring> wordvec;
// Split the word into a vector of single characters
if(0 != Word2WordVec(word, wordvec)) {
LOG(ERROR) << "Failed to get word vector";
}
int word_num = wordvec.size();
if((find(must_erhua.begin(), must_erhua.end(), word) == must_erhua.end()) &&
((find(not_erhua.begin(), not_erhua.end(), word) != not_erhua.end()) || (find(specified_pos.begin(), specified_pos.end(), pos) != specified_pos.end()))) {
new_initials_finals.push_back(initials);
new_initials_finals.push_back(finals);
return new_initials_finals;
}
if(finals.size() != word_num) {
new_initials_finals.push_back(initials);
new_initials_finals.push_back(finals);
return new_initials_finals;
}
assert(finals.size() == word_num);
for(int i = 0; i < finals.size(); i++) {
if(i == finals.size() - 1 && wordvec[i] == L"儿" && (finals[i] == "er2" || finals[i] == "er5") && word_num >= 2 &&
find(not_erhua.begin(), not_erhua.end(), ppspeech::wstring2utf8string(word_wstr.substr(word_wstr.length() - 2))) == not_erhua.end() && !new_finals.empty()) {
new_finals.back() = new_finals.back().substr(0, new_finals.back().length()-1) + "r" + new_finals.back().substr(new_finals.back().length()-1);
} else {
new_initials.push_back(initials[i]);
new_finals.push_back(finals[i]);
}
}
new_initials_finals.push_back(new_initials);
new_initials_finals.push_back(new_finals);
return new_initials_finals;
}
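// Worked example (illustrative): for 花儿 with initials {"h", ""} and finals
// {"ua1", "er2"}, the trailing 儿 is folded into the previous syllable, giving
// initials {"h"} and finals {"uar1"} (the "r" is inserted before the tone digit).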
}

@ -0,0 +1,156 @@
#ifndef PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H
#define PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H
#include <map>
#include <string>
#include <memory>
#include <fstream>
#include <glog/logging.h>
//#include "utils/dir_utils.h"
#include <cppjieba/Jieba.hpp>
#include "front/text_normalize.h"
#include "absl/strings/str_split.h"
namespace ppspeech {
class FrontEngineInterface : public TextNormalizer{
public:
FrontEngineInterface(std::string conf) : _conf_file(conf) {
TextNormalizer();
_jieba = nullptr;
_initialed = false;
init();
}
int init();
~FrontEngineInterface() {
}
// Read the configuration file
int ReadConfFile();
// Convert Traditional Chinese to Simplified Chinese
int Trand2Simp(const std::wstring &sentence, std::wstring &sentence_simp);
// Load a dictionary file into a map
int GenDict(const std::string &file, std::map<std::string, std::string> &map);
// Reduce word+POS segmentation results to a list of words only
int GetSegResult(std::vector<std::pair<std::string, std::string>> &seg, std::vector<std::string> &seg_words);
// Generate the phone and tone ids of a sentence. If phones and tones are not separated, toneids is empty (fastspeech2); otherwise it is non-empty (speedyspeech)
int GetSentenceIds(const std::string &sentence, std::vector<int> &phoneids, std::vector<int> &toneids);
// Get the phone and tone ids of the segmented words and adjust their pronunciation (ModifyTone). If phones and tones are not separated, toneids is empty (fastspeech2); otherwise it is non-empty (speedyspeech)
int GetWordsIds(const std::vector<std::pair<std::string, std::string>> &cut_result, std::vector<int> &phoneids, std::vector<int> &toneids);
// Segment the sentence with jieba into word+POS pairs, then post-process the result (MergeforModify)
int Cut(const std::string &sentence, std::vector<std::pair<std::string, std::string>> &cut_result);
// Map a word to its phones via dictionary lookup
int GetPhone(const std::string &word, std::string &phone);
// Map phones to phone ids
int Phone2Phoneid(const std::string &phone, std::vector<int> &phoneid, std::vector<int> &toneids);
// Judge from the finals whether every character of the word is tone 3; true means all characters are tone 3
bool AllToneThree(const std::vector<std::string> &finals);
// Check whether the word is a reduplication
bool IsReduplication(const std::string &word);
// Get the lists of initials and finals of a word
int GetInitialsFinals(const std::string &word, std::vector<std::string> &word_initials, std::vector<std::string> &word_finals);
// Get the list of finals of a word
int GetFinals(const std::string &word, std::vector<std::string> &word_finals);
// Convert the whole word into a vector whose elements are its single characters
int Word2WordVec(const std::string &word, std::vector<std::wstring> &wordvec);
// Re-segment the word with a full cut so that every resulting piece is in the dictionary
int SplitWord(const std::string &word, std::vector<std::string> &fullcut_word);
// Post-process segmentation: merge segments containing "不"
std::vector<std::pair<std::string, std::string>> MergeBu(std::vector<std::pair<std::string, std::string>> &seg_result);
// Post-process segmentation: merge segments containing "一"
std::vector<std::pair<std::string, std::string>> Mergeyi(std::vector<std::pair<std::string, std::string>> &seg_result);
// Post-process segmentation: merge two adjacent identical words (reduplication)
std::vector<std::pair<std::string, std::string>> MergeReduplication(std::vector<std::pair<std::string, std::string>> &seg_result);
// Merge two adjacent words whose finals are all tone 3
std::vector<std::pair<std::string, std::string>> MergeThreeTones(std::vector<std::pair<std::string, std::string>> &seg_result);
// Merge two adjacent words when the last final of the first word and the first final of the second word are both tone 3
std::vector<std::pair<std::string, std::string>> MergeThreeTones2(std::vector<std::pair<std::string, std::string>> &seg_result);
// Post-process segmentation: merge segments containing "儿"
std::vector<std::pair<std::string, std::string>> MergeEr(std::vector<std::pair<std::string, std::string>> &seg_result);
// Apply all segmentation merge passes
int MergeforModify(std::vector<std::pair<std::string, std::string>> &seg_result, std::vector<std::pair<std::string, std::string>> &merge_seg_result);
// Tone sandhi for words containing "不"
int BuSandi(const std::string &word, std::vector<std::string> &finals);
// Tone sandhi for words containing "一"
int YiSandhi(const std::string &word, std::vector<std::string> &finals);
// Neutral-tone sandhi for special words (measure words, modal particles, etc.)
int NeuralSandhi(const std::string &word, const std::string &pos, std::vector<std::string> &finals);
// Tone sandhi for words containing tone 3
int ThreeSandhi(const std::string &word, std::vector<std::string> &finals);
// Apply all tone sandhi rules to a word
int ModifyTone(const std::string &word, const std::string &pos, std::vector<std::string> &finals);
// Handle erhua (儿化) pronunciation
std::vector<std::vector<std::string>> MergeErhua(const std::vector<std::string> &initials, const std::vector<std::string> &finals, const std::string &word, const std::string &pos);
private:
bool _initialed;
cppjieba::Jieba *_jieba;
std::vector<std::string> _punc;
std::vector<std::string> _punc_omit;
std::string _conf_file;
std::map<std::string, std::string> conf_map;
std::map<std::string, std::string> word_phone_map;
std::map<std::string, std::string> phone_id_map;
std::map<std::string, std::string> tone_id_map;
std::map<std::string, std::string> trand_simp_map;
std::string _jieba_dict_path;
std::string _jieba_hmm_path;
std::string _jieba_user_dict_path;
std::string _jieba_idf_path;
std::string _jieba_stop_word_path;
std::string _seperate_tone;
std::string _word2phone_path;
std::string _phone2id_path;
std::string _tone2id_path;
std::string _trand2simp_path;
std::vector<std::string> must_erhua;
std::vector<std::string> not_erhua;
std::vector<std::string> must_not_neural_tone_words;
std::vector<std::string> must_neural_tone_words;
};
}
#endif
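// Illustrative end-to-end sketch (hypothetical config path and variable names): the
// frontend is driven through GetSentenceIds, which normalizes, segments and converts
// a sentence into the phone ids consumed by the acoustic model.
//
//   ppspeech::FrontEngineInterface front("./front.conf");
//   std::vector<int> phone_ids;
//   std::vector<int> tone_ids;  // stays empty for fastspeech2-style models
//   front.GetSentenceIds("今天天气很好", phone_ids, tone_ids);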

@ -0,0 +1,462 @@
#include "front/text_normalize.h"
namespace ppspeech {
// Initialize digits_map and units_map
int TextNormalizer::InitMap() {
digits_map["0"] = "";
digits_map["1"] = "";
digits_map["2"] = "";
digits_map["3"] = "";
digits_map["4"] = "";
digits_map["5"] = "";
digits_map["6"] = "";
digits_map["7"] = "";
digits_map["8"] = "";
digits_map["9"] = "";
units_map[1] = "";
units_map[2] = "";
units_map[3] = "";
units_map[4] = "";
units_map[8] = "亿";
return 0;
}
// Replace a span of the sentence
int TextNormalizer::Replace(std::wstring &sentence, const int &pos, const int &len, const std::wstring &repstr) {
// erase the original span
sentence.erase(pos, len);
// insert the replacement
sentence.insert(pos, repstr);
return 0;
}
// Split the sentence by punctuation
int TextNormalizer::SplitByPunc(const std::wstring &sentence, std::vector<std::wstring> &sentence_part) {
std::wstring temp = sentence;
std::wregex reg(L"[:,;。?!,;?!]");
std::wsmatch match;
while (std::regex_search (temp, match, reg)) {
sentence_part.push_back(temp.substr(0, match.position(0) + match.length(0)));
Replace(temp, 0, match.position(0) + match.length(0), L"");
}
// if there is no trailing punctuation
if(temp != L"") {
sentence_part.push_back(temp);
}
return 0;
}
// Convert a number to Chinese text, e.g. 10200 --> 一万零二百
std::string TextNormalizer::CreateTextValue(const std::string &num_str, bool use_zero) {
std::string num_lstrip = std::string(absl::StripPrefix(num_str, "0")).data();
int len = num_lstrip.length();
if(len == 0) {
return "";
} else if (len == 1) {
if(use_zero && (len < num_str.length())) {
return digits_map["0"] + digits_map[num_lstrip];
} else {
return digits_map[num_lstrip];
}
} else {
int largest_unit = 0; // largest place-value unit
std::string first_part;
std::string second_part;
if (len > 1 and len <= 2) {
largest_unit = 1;
} else if (len > 2 and len <= 3) {
largest_unit = 2;
} else if (len > 3 and len <= 4) {
largest_unit = 3;
} else if (len > 4 and len <= 8) {
largest_unit = 4;
} else if (len > 8) {
largest_unit = 8;
}
first_part = num_str.substr(0, num_str.length() - largest_unit);
second_part = num_str.substr(num_str.length() - largest_unit);
return CreateTextValue(first_part, use_zero) + units_map[largest_unit] + CreateTextValue(second_part, use_zero);
}
}
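// Worked example (illustrative): CreateTextValue("305") picks largest_unit = 2 (百),
// splits the string into "3" and "05", and recurses:
//   CreateTextValue("3")  -> "三"
//   CreateTextValue("05") -> "零五"   (use_zero inserts the 零 for the skipped place)
// so the result is "三百零五".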
// Read digits one by one; usable directly for years, landline and mobile numbers
std::string TextNormalizer::SingleDigit2Text(const std::string &num_str, bool alt_one) {
std::string text = "";
if (alt_one) {
digits_map["1"] = "";
} else {
digits_map["1"] = "";
}
for (size_t i = 0; i < num_str.size(); i++) {
std::string num_int(1, num_str[i]);
if (digits_map.find(num_int) == digits_map.end()) {
LOG(ERROR) << "digits_map doesn't have key: " << num_int;
}
text += digits_map[num_int];
}
return text;
}
std::string TextNormalizer::SingleDigit2Text(const std::wstring &num, bool alt_one) {
std::string num_str = wstring2utf8string(num);
return SingleDigit2Text(num_str, alt_one);
}
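// Worked example (illustrative): SingleDigit2Text("2021") -> "二零二一", and with
// alt_one = true, SingleDigit2Text("110") -> "幺幺零" (the reading used for phone numbers).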
// Read the number as a whole; usable directly for months, days and the integer part of a value
std::string TextNormalizer::MultiDigit2Text(const std::string &num_str, bool alt_one, bool use_zero) {
LOG(INFO) << "aaaaaaaaaaaaaaaa: " << alt_one << use_zero;
if (alt_one) {
digits_map["1"] = "";
} else {
digits_map["1"] = "";
}
std::wstring result = utf8string2wstring(CreateTextValue(num_str, use_zero));
std::wstring result_0(1, result[0]);
std::wstring result_1(1, result[1]);
// 一十八 --> 十八
if ((result_0 == utf8string2wstring(digits_map["1"])) && (result_1 == utf8string2wstring(units_map[1]))) {
return wstring2utf8string(result.substr(1,result.length()));
} else {
return wstring2utf8string(result);
}
}
std::string TextNormalizer::MultiDigit2Text(const std::wstring &num, bool alt_one, bool use_zero) {
std::string num_str = wstring2utf8string(num);
return MultiDigit2Text(num_str, alt_one, use_zero);
}
// Convert a number, integer or decimal, to text
std::string TextNormalizer::Digits2Text(const std::string &num_str) {
std::string text;
std::vector<std::string> integer_decimal;
integer_decimal = absl::StrSplit(num_str, ".");
if(integer_decimal.size() == 1) { // integer
text = MultiDigit2Text(integer_decimal[0]);
} else if(integer_decimal.size() == 2) { // decimal
if(integer_decimal[0] == "") { // decimal with no integer part, e.g. .22
text = "点" + SingleDigit2Text(std::string(absl::StripSuffix(integer_decimal[1], "0")).data());
} else { // regular decimal, e.g. 12.34
text = MultiDigit2Text(integer_decimal[0]) + "点" + \
SingleDigit2Text(std::string(absl::StripSuffix(integer_decimal[1], "0")).data());
}
} else {
return "The value does not conform to the numeric format";
}
return text;
}
std::string TextNormalizer::Digits2Text(const std::wstring &num) {
std::string num_str = wstring2utf8string(num);
return Digits2Text(num_str);
}
// Date, e.g. 2021年8月18日 --> 二零二一年八月十八日
int TextNormalizer::ReData(std::wstring &sentence) {
std::wregex reg(L"(\\d{4}|\\d{2})年((0?[1-9]|1[0-2])月)?(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?");
std::wsmatch match;
std::string rep;
while (std::regex_search (sentence, match, reg)) {
rep = "";
rep += SingleDigit2Text(match[1]) + "年";
if(match[3] != L"") {
rep += MultiDigit2Text(match[3], false, false) + "月";
}
if(match[5] != L"") {
rep += MultiDigit2Text(match[5], false, false) + wstring2utf8string(match[9]);
}
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// XX-XX-XX or XX/XX/XX, e.g. 2021/08/18 --> 二零二一年八月十八日
int TextNormalizer::ReData2(std::wstring &sentence) {
std::wregex reg(L"(\\d{4})([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])");
std::wsmatch match;
std::string rep;
while (std::regex_search (sentence, match, reg)) {
rep = "";
rep += (SingleDigit2Text(match[1]) + "年");
rep += (MultiDigit2Text(match[3], false, false) + "月");
rep += (MultiDigit2Text(match[4], false, false) + "日");
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Time XX:XX:XX, e.g. 09:09:02 --> 九点零九分零二秒
int TextNormalizer::ReTime(std::wstring &sentence) {
std::wregex reg(L"([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?");
std::wsmatch match;
std::string rep;
while (std::regex_search (sentence, match, reg)) {
rep = "";
rep += (MultiDigit2Text(match[1], false, false) + "点");
if(absl::StartsWith(wstring2utf8string(match[2]), "0")) {
rep += "零";
}
rep += (MultiDigit2Text(match[2]) + "分");
if(absl::StartsWith(wstring2utf8string(match[4]), "0")) {
rep += "零";
}
rep += (MultiDigit2Text(match[4]) + "秒");
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Temperature, e.g. -24.3℃ --> 零下二十四点三度
int TextNormalizer::ReTemperature(std::wstring &sentence) {
std::wregex reg(L"(-?)(\\d+(\\.\\d+)?)(°C|℃|度|摄氏度)");
std::wsmatch match;
std::string rep;
std::string sign;
std::vector<std::string> integer_decimal;
std::string unit;
while (std::regex_search (sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
match[4] == L"摄氏度"? unit = "摄氏度" : unit = "";
rep = sign + Digits2Text(match[2]) + unit;
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Fraction, e.g. 1/3 --> 三分之一
int TextNormalizer::ReFrac(std::wstring &sentence) {
std::wregex reg(L"(-?)(\\d+)/(\\d+)");
std::wsmatch match;
std::string sign;
std::string rep;
while (std::regex_search (sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
rep = sign + MultiDigit2Text(match[3]) + "分之" + MultiDigit2Text(match[2]);
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Percentage, e.g. 45.5% --> 百分之四十五点五
int TextNormalizer::RePercentage(std::wstring &sentence) {
std::wregex reg(L"(-?)(\\d+(\\.\\d+)?)%");
std::wsmatch match;
std::string sign;
std::string rep;
std::vector<std::string> integer_decimal;
while (std::regex_search (sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
rep = sign + "百分之" + Digits2Text(match[2]);
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Mobile phone number, e.g. +86 18883862235 --> 八六幺八八八三八六二二三五
int TextNormalizer::ReMobilePhone(std::wstring &sentence) {
std::wregex reg(L"(\\d)?((\\+?86 ?)?1([38]\\d|5[0-35-9]|7[678]|9[89])\\d{8})(\\d)?");
std::wsmatch match;
std::string rep;
std::vector<std::string> country_phonenum;
while (std::regex_search (sentence, match, reg)) {
country_phonenum = absl::StrSplit(wstring2utf8string(match[0]), "+");
rep = "";
for(int i = 0; i < country_phonenum.size(); i++) {
LOG(INFO) << country_phonenum[i];
rep += SingleDigit2Text(country_phonenum[i], true);
}
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Landline number, e.g. 010-51093154 --> 零幺零五幺零九三幺五四
int TextNormalizer::RePhone(std::wstring &sentence) {
std::wregex reg(L"(\\d)?((0(10|2[1-3]|[3-9]\\d{2})-?)?[1-9]\\d{6,7})(\\d)?");
std::wsmatch match;
std::vector<std::string> zone_phonenum;
std::string rep;
while (std::regex_search (sentence, match, reg)) {
rep = "";
zone_phonenum = absl::StrSplit(wstring2utf8string(match[0]), "-");
for(int i = 0; i < zone_phonenum.size(); i ++) {
rep += SingleDigit2Text(zone_phonenum[i], true);
}
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Range, e.g. 60~90 --> 六十到九十
int TextNormalizer::ReRange(std::wstring &sentence) {
std::wregex reg(L"((-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+)))[-~]((-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+)))");
std::wsmatch match;
std::string rep;
std::string sign1;
std::string sign2;
while (std::regex_search (sentence, match, reg)) {
rep = "";
match[2] == L"-" ? sign1 = "" : sign1 = "";
if(match[6] != L"") {
rep += sign1 + Digits2Text(match[6]) + "";
} else {
rep += sign1 + Digits2Text(match[3]) + "";
}
match[9] == L"-" ? sign2 = "" : sign2 = "";
if(match[13] != L"") {
rep += sign2 + Digits2Text(match[13]);
} else {
rep += sign2 + Digits2Text(match[10]);
}
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Negative integer, e.g. -10 --> 负十
int TextNormalizer::ReInterger(std::wstring &sentence) {
std::wregex reg(L"(-)(\\d+)");
std::wsmatch match;
std::string rep;
while (std::regex_search (sentence, match, reg)) {
rep = "" + MultiDigit2Text(match[2]);
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Pure decimal
int TextNormalizer::ReDecimalNum(std::wstring &sentence) {
std::wregex reg(L"(-?)((\\d+)(\\.\\d+))|(\\.(\\d+))");
std::wsmatch match;
std::string sign;
std::string rep;
//std::vector<std::string> integer_decimal;
while (std::regex_search (sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
if(match[5] != L"") {
rep = sign + Digits2Text(match[5]);
} else {
rep = sign + Digits2Text(match[2]);
}
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Positive integer + measure word
int TextNormalizer::RePositiveQuantifiers(std::wstring &sentence) {
std::wstring common_quantifiers = L"(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲| \
||||||||||||||||||||||||||||||||||线|||||| \
|||||||||||||||||||(||)|||()||||||||||(||| \
|)|||||||||||||||||||||||||||||||||||||| \
||||||||||||||||||||||||||||||||||||(亿||| \
||)|(亿|||||||)|(亿||||||)|||)";
std::wregex reg(L"(\\d+)([多余几])?" + common_quantifiers);
std::wsmatch match;
std::string rep;
while (std::regex_search (sentence, match, reg)) {
rep = MultiDigit2Text(match[1]);
Replace(sentence, match.position(1), match.length(1), utf8string2wstring(rep));
}
return 0;
}
// Serial-number style digits, e.g. 89757 --> 八九七五七
int TextNormalizer::ReDefalutNum(std::wstring &sentence) {
std::wregex reg(L"\\d{3}\\d*");
std::wsmatch match;
while (std::regex_search (sentence, match, reg)) {
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(SingleDigit2Text(match[0])));
}
return 0;
}
int TextNormalizer::ReNumber(std::wstring &sentence) {
std::wregex reg(L"(-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+))");
std::wsmatch match;
std::string sign;
std::string rep;
while (std::regex_search (sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
if(match[5] != L"") {
rep = sign + Digits2Text(match[5]);
} else {
rep = sign + Digits2Text(match[2]);
}
Replace(sentence, match.position(0), match.length(0), utf8string2wstring(rep));
}
return 0;
}
// Apply all normalization rules, in order
int TextNormalizer::SentenceNormalize(std::wstring &sentence) {
ReData(sentence);
ReData2(sentence);
ReTime(sentence);
ReTemperature(sentence);
ReFrac(sentence);
RePercentage(sentence);
ReMobilePhone(sentence);
RePhone(sentence);
ReRange(sentence);
ReInterger(sentence);
ReDecimalNum(sentence);
RePositiveQuantifiers(sentence);
ReDefalutNum(sentence);
ReNumber(sentence);
return 0;
}
}
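// Illustrative usage sketch (hypothetical variable names): normalization operates on
// wide strings, converted with the helpers declared in base/type_conv.h.
//
//   ppspeech::TextNormalizer normalizer;
//   std::wstring ws = ppspeech::utf8string2wstring("今天是2023年5月1日,气温-3℃。");
//   normalizer.SentenceNormalize(ws);   // dates, temperatures and plain numbers become Chinese words
//   std::vector<std::wstring> parts;
//   normalizer.SplitByPunc(ws, parts);  // then split on sentence-internal punctuation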

@ -0,0 +1,62 @@
#ifndef PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H
#define PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H
#include <map>
#include <regex>
#include <string>
#include <codecvt>
#include <glog/logging.h>
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"
#include "base/type_conv.h"
namespace ppspeech {
class TextNormalizer {
public:
TextNormalizer() {
InitMap();
}
~TextNormalizer() {
}
int InitMap();
int Replace(std::wstring &sentence, const int &pos, const int &len, const std::wstring &repstr);
int SplitByPunc(const std::wstring &sentence, std::vector<std::wstring> &sentence_part);
std::string CreateTextValue(const std::string &num, bool use_zero=true);
std::string SingleDigit2Text(const std::string &num_str, bool alt_one = false);
std::string SingleDigit2Text(const std::wstring &num, bool alt_one = false);
std::string MultiDigit2Text(const std::string &num_str, bool alt_one = false, bool use_zero = true);
std::string MultiDigit2Text(const std::wstring &num, bool alt_one = false, bool use_zero = true);
std::string Digits2Text(const std::string &num_str);
std::string Digits2Text(const std::wstring &num);
int ReData(std::wstring &sentence);
int ReData2(std::wstring &sentence);
int ReTime(std::wstring &sentence);
int ReTemperature(std::wstring &sentence);
int ReFrac(std::wstring &sentence);
int RePercentage(std::wstring &sentence);
int ReMobilePhone(std::wstring &sentence);
int RePhone(std::wstring &sentence);
int ReRange(std::wstring &sentence);
int ReInterger(std::wstring &sentence);
int ReDecimalNum(std::wstring &sentence);
int RePositiveQuantifiers(std::wstring &sentence);
int ReDefalutNum(std::wstring &sentence);
int ReNumber(std::wstring &sentence);
int SentenceNormalize(std::wstring &sentence);
private:
std::map<std::string, std::string> digits_map;
std::map<int, std::string> units_map;
};
}
#endif

@ -0,0 +1,64 @@
cmake_minimum_required(VERSION 3.10)
project(tts_third_party_libs)
include(ExternalProject)
# gflags
ExternalProject_Add(gflags
GIT_REPOSITORY https://github.com/gflags/gflags.git
GIT_TAG v2.2.2
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_STATIC_LIBS=OFF
-DBUILD_SHARED_LIBS=ON
)
# glog
ExternalProject_Add(
glog
GIT_REPOSITORY https://github.com/google/glog.git
GIT_TAG v0.6.0
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
DEPENDS gflags
)
# abseil
ExternalProject_Add(
abseil
GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
GIT_TAG 20230125.1
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DABSL_PROPAGATE_CXX_STD=ON
)
# cppjieba (header-only)
ExternalProject_Add(
cppjieba
GIT_REPOSITORY https://github.com/yanyiwu/cppjieba.git
GIT_TAG v5.0.3
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
# limonp (header-only)
ExternalProject_Add(
limonp
GIT_REPOSITORY https://github.com/yanyiwu/limonp.git
GIT_TAG v0.6.6
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)