Merge branch 'develop' into align

pull/629/head
Hui Zhang 4 years ago
commit d05ae8eeb0

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

@ -272,8 +272,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
infer_model,
input_spec=[
paddle.static.InputSpec(
shape=[None, feat_dim, None],
dtype='float32'), # audio, [B,D,T]
shape=[None, None, feat_dim],
dtype='float32'), # audio, [B,T,D]
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
])

@ -15,7 +15,7 @@ from paddle import nn
from paddle.nn import functional as F
from deepspeech.modules.activation import brelu
from deepspeech.modules.mask import sequence_mask
from deepspeech.modules.mask import make_non_pad_mask
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
@ -111,8 +111,10 @@ class ConvBn(nn.Layer):
) // self.stride[1] + 1
# reset padding part to 0
masks = sequence_mask(x_len) #[B, T]
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply
masks = masks.type_as(x)
x = x.multiply(masks)
return x, x_len

@ -18,40 +18,12 @@ from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
__all__ = [
'sequence_mask', "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
"make_pad_mask", "make_non_pad_mask", "subsequent_mask",
"subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
"mask_finished_preds"
]
def sequence_mask(x_len, max_len=None, dtype='float32'):
    """Build a batch sequence mask from per-sequence lengths.

    Args:
        x_len ([paddle.Tensor]): sequence lengths, [B]
        max_len ([type], optional): max sequence length. Defaults to None,
            in which case ``x_len.max()`` is used.
        dtype (str, optional): mask data type. Defaults to 'float32'.

    Returns:
        paddle.Tensor: [B, Tmax]

    Examples:
        >>> sequence_mask([2, 4])
        [[1., 1., 0., 0.],
        [1., 1., 1., 1.]]
    """
    # (TODO: Hui Zhang): jit does not support Tensor.dim() and Tensor.ndim
    # assert x_len.dim() == 1, (x_len.dim(), x_len)
    max_len = max_len or x_len.max()
    # Append a trailing axis so the lengths can be compared against the
    # position vector below.
    x_len = paddle.unsqueeze(x_len, -1)
    row_vector = paddle.arange(max_len)
    # TODO(Hui Zhang): fix this bug
    #mask = row_vector < x_len
    # NOTE(review): the docstring example corresponds to `row_vector < x_len`;
    # `>` is used here as a workaround for a broadcasting problem reported by
    # the author -- confirm the resulting mask polarity before relying on it.
    mask = row_vector > x_len  # a bug: something goes wrong when broadcasting
    mask = paddle.cast(mask, dtype)
    return mask
def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
"""Make mask tensor containing indices of padded part.
See description of make_non_pad_mask.
@ -66,7 +38,8 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
[0, 0, 0, 1, 1],
[0, 0, 1, 1, 1]]
"""
assert lengths.dim() == 1
# (TODO: Hui Zhang): jit not support Tenosr.dim() and Tensor.ndim
# assert lengths.dim() == 1
batch_size = int(lengths.shape[0])
max_len = int(lengths.max())
seq_range = paddle.arange(0, max_len, dtype=paddle.int64)

@ -19,7 +19,7 @@ from paddle.nn import functional as F
from paddle.nn import initializer as I
from deepspeech.modules.activation import brelu
from deepspeech.modules.mask import sequence_mask
from deepspeech.modules.mask import make_non_pad_mask
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
@ -306,7 +306,9 @@ class RNNStack(nn.Layer):
"""
for i, rnn in enumerate(self.rnn_stacks):
x, x_len = rnn(x, x_len)
masks = sequence_mask(x_len) #[B, T]
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(-1) # [B, T, 1]
# TODO(Hui Zhang): not support bool multiply
masks = masks.type_as(x)
x = x.multiply(masks)
return x, x_len

Binary file not shown.

Before

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

@ -1,101 +0,0 @@
# ASR Text Backend
1. [Text Segmentation](text_front_end#text segmentation)
2. Text Corrector
3. Add Punctuation
4. Text Filter
## Text Corrector
* [pycorrector](https://github.com/shibing624/pycorrector)
本项目重点解决其中的谐音、混淆音、形似字错误、中文拼音全拼、语法错误带来的纠错任务。PS[网友源码解读](https://zhuanlan.zhihu.com/p/138981644)
* DeepCorrection [1](https://praneethbedapudi.medium.com/deepcorrection-1-sentence-segmentation-of-unpunctuated-text-a1dbc0db4e98) [2](https://praneethbedapudi.medium.com/deepcorrection2-automatic-punctuation-restoration-ac4a837d92d9) [3](https://praneethbedapudi.medium.com/deepcorrection-3-spell-correction-and-simple-grammar-correction-d033a52bc11d) [4](https://praneethbedapudi.medium.com/deepsegment-2-0-multilingual-text-segmentation-with-vector-alignment-fd76ce62194f)
### Question
中文文本纠错任务,常见错误类型包括:
- 谐音字词,如 配副眼睛-配副眼镜
- 混淆音字词,如 流浪织女-牛郎织女
- 字词顺序颠倒,如 伍迪艾伦-艾伦伍迪
- 字词补全,如 爱有天意-假如爱有天意
- 形似字错误,如 高梁-高粱
- 中文拼音全拼,如 xingfu-幸福
- 中文拼音缩写,如 sz-深圳
- 语法错误,如 想象难以-难以想象
当然,针对不同业务场景,这些问题并不一定全部存在。
比如输入法中需要处理前四种,搜索引擎需要处理所有类型,语音识别后文本纠错只需要处理前两种, 其中'形似字错误'主要针对五笔或者笔画手写输入等。
### Solution
#### 规则的解决思路
1. 中文纠错分为两步走,第一步是错误检测,第二步是错误纠正;
2. 错误检测部分先通过结巴中文分词器切词,由于句子中含有错别字,所以切词结果往往会有切分错误的情况,这样从字粒度和词粒度两方面检测错误, 整合这两种粒度的疑似错误结果,形成疑似错误位置候选集;
3. 错误纠正部分,是遍历所有的疑似错误位置,并使用音似、形似词典替换错误位置的词,然后通过语言模型计算句子困惑度,对所有候选集结果比较并排序,得到最优纠正词。
#### 深度模型的解决思路
1. 端到端的深度模型可以避免人工提取特征,减少人工工作量,RNN序列模型对文本任务拟合能力强,rnn_attention在英文文本纠错比赛中取得第一名成绩,证明应用效果不错;
2. CRF会计算全局最优输出节点的条件概率,对句子中特定错误类型的检测,会根据整句话判定该错误,阿里参赛2016中文语法纠错任务并取得第一名,证明应用效果不错;
3. Seq2Seq模型是使用Encoder-Decoder结构解决序列转换问题,是目前在序列转换任务中(如机器翻译、对话生成、文本摘要、图像描述)使用最广泛、效果最好的模型之一;
4. BERT/ELECTRA/ERNIE/MacBERT等预训练模型强大的语言表征能力,对NLP界带来翻天覆地的改变,海量的训练数据拟合的语言模型效果无与伦比,基于其MASK掩码的特征,可以简单改造预训练模型用于纠错,加上fine-tune,效果轻松达到最优。
### 规则检测方法
- kenlm:kenlm统计语言模型工具,规则方法,语言模型纠错,利用混淆集,扩展性强
#### 错误检测
- 字粒度:语言模型困惑度(ppl)检测,某字的似然概率值低于句子文本平均值,则判定该字是疑似错别字的概率大。
- 词粒度:切词后不在词典中的词是疑似错词的概率大。
#### 错误纠正
- 通过错误检测定位所有疑似错误后,取所有疑似错字的音似、形似候选词,
- 使用候选词替换,基于语言模型得到类似翻译模型的候选排序结果,得到最优纠正词。
#### 思考
1. 现在的处理手段,在词粒度的错误召回还不错,但错误纠正的准确率还有待提高,更多优质的纠错集及纠错词库会有提升。
2. 另外,现在的文本错误不再局限于字词粒度上的拼写错误,需要提高中文语法错误检测(CGED, Chinese Grammar Error Diagnosis)及纠正能力。
### Reference
* https://github.com/shibing624/pycorrector
* [基于文法模型的中文纠错系统](https://blog.csdn.net/mingzai624/article/details/82390382)
* [Norvigs spelling corrector](http://norvig.com/spell-correct.html)
* [Chinese Spelling Error Detection and Correction Based on Language Model, Pronunciation, and Shape[Yu, 2013]](http://www.aclweb.org/anthology/W/W14/W14-6835.pdf)
* [Chinese Spelling Checker Based on Statistical Machine Translation[Chiu, 2013]](http://www.aclweb.org/anthology/O/O13/O13-1005.pdf)
* [Chinese Word Spelling Correction Based on Rule Induction[yeh, 2014]](http://aclweb.org/anthology/W14-6822)
* [Neural Language Correction with Character-Based Attention[Ziang Xie, 2016]](https://arxiv.org/pdf/1603.09727.pdf)
* [Chinese Spelling Check System Based on Tri-gram Model[Qiang Huang, 2014]](http://www.anthology.aclweb.org/W/W14/W14-6827.pdf)
* [Neural Abstractive Text Summarization with Sequence-to-Sequence Models[Tian Shi, 2018]](https://arxiv.org/abs/1812.02303)
* [基于深度学习的中文文本自动校对研究与实现[杨宗霖, 2019]](https://github.com/shibing624/pycorrector/blob/master/docs/基于深度学习的中文文本自动校对研究与实现.pdf)
* [A Sequence to Sequence Learning for Chinese Grammatical Error Correction[Hongkai Ren, 2018]](https://link.springer.com/chapter/10.1007/978-3-319-99501-4_36)
* [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/pdf?id=r1xMH1BtvB)
* [Revisiting Pre-trained Models for Chinese Natural Language Processing](https://arxiv.org/abs/2004.13922)
## Add Punctuation
* DeepCorrection [1](https://praneethbedapudi.medium.com/deepcorrection-1-sentence-segmentation-of-unpunctuated-text-a1dbc0db4e98) [2](https://praneethbedapudi.medium.com/deepcorrection2-automatic-punctuation-restoration-ac4a837d92d9) [3](https://praneethbedapudi.medium.com/deepcorrection-3-spell-correction-and-simple-grammar-correction-d033a52bc11d) [4](https://praneethbedapudi.medium.com/deepsegment-2-0-multilingual-text-segmentation-with-vector-alignment-fd76ce62194f)
## Text Filter
* 敏感词(黄暴、涉政、违法违禁等)

@ -1,145 +0,0 @@
# Speech Synthesis
* [爱丁堡大学公开课](http://speech.zone/courses/speech-synthesis)
* ### 推荐书籍
1. Daniel Jurafsky and James H. Martin, Speech and language processing: An introduction to natural language processing, computational linguistics, and speech recognition. 这本书之前在学习语音识别的时候也经常翻阅。 推荐阅读章节: Ch 7 & Ch 8 (都读过啦~)
2. Xuedong Huang, Alex Acero, Hsiao-Wuen Hon, Spoken Language Processing: A guide to theory, algorithm, and system development, Prentice Hall, 2011 这本书的三位作者都是大佬,本书推荐阅读 Ch2, Ch5, Ch6, Part IV: Text-to-Speech Systems. 学习一下基础知识点,如信号处理等
3. Paul Taylor, Text-to-Speech Synthesis, Cambridge University Press, 2009. 比较系统地讲述了神经网络之前的语音合成系统。
### 语音合成
现代语音合成主要包含文本分析和语音合成
#### 文本分析
文本分析主要分为
- **断句** : 怎么判断一句句子结束了,单纯用句号来切分并不靠谱,比如 B.C., Dr.J.M.,’。。。’
- **文本归一化** : 根据上下文消除一些词的读法歧义,常见的有数字的读法:“In 1950, he went to” -> “nineteen fifty”, “There are 1950 sheep.” -> “one thousand nine hundred and fifty”, “The code number is 1950” -> “one nine five zero”.
- **分词** : 将句子分成一个个的词,对于中文这种没有空格作为天然分隔符的语言是需要分词单元的。
- **词性分析** : 将分好的词中的每个词进行标注,”动词,名词,形容词,…”
- **注音** : 有些词的读音在不同上下文中发音是不一样的,比如 live -> /l ih v/ or /l ay v/ 中文中也有多音字的现象,所以需要进行标注。
- **韵律分析** : 声调,重读,韵律边界
#### 语音合成方法
**波形拼接** : 将各种语音单元拼接起来,需要考虑目标代价(目标语音单元和候选的语音单元匹配度)和连接代价(相邻语音单元之间的流畅度)
**基于轨迹指导的拼接合成**
**统计参数合成** : 帧级建模包括时长模型(音素序列->帧级文本特征)和声学模型(帧级文本特征->帧级语音输出)。主要方法是基于HMM 的 SPSS (Statistical Parametric Speech Synthesis), 可以用的工具包 HTS。
**神经网络合成方法** : 目前许多商用场景下已经部署了基于神经网络的语音合成模型。目前基于神经网络的方法还不是纯端到端的,分为两个部分,输入文本类信息(音素,时长等)经过神经网络得到输出特征(LF0, UV, 谱特征, bap), 接着将这些特征放到声码器(vocoder) 中得到对应的语音波形。主流方法是 Tacotron, Tacotron2, 注意力机制, Transformer。正在朝着基于序列到序列的语音合成、纯端到端的语音合成方向发展。
**声码器**的总结如下:
| **模型类型** | **模型** | **合成语音质量** | **效率** |
| ------------ | ----------------- | ---------------- | ---------- |
| AR | WaveNet | 非常好 | 非常差 |
| AR | WaveRNN | 非常好 | 中等 |
| AR | Multiband WaveRNN | 非常好 | 中等 |
| AR | LPCNET | 非常好 | 挺好的 |
| Non-AR | Parallel WaveNet | 非常好 | 还不错 |
| Non-AR | WaveGlow | 非常好 | 还不错 |
| Non-AR | FlowWaveNet | 非常好 | 还不错 |
| GAN | ParallelWaveGAN | 非常好 | 挺好的 |
| GAN | MelGAN | 挺好的 | 非常好 |
| GAN | MB-MelGAN | 非常好 | 非常非常好 |
从上面表格中可以看到,基于神经网络的声码器效果都挺好的,主要需要优化的就是生成的速度。出现了利用 GAN 的声码器之后,推理速度也极大地提高了。
### 高阶话题
* 基于注意力机制的序列到序列的模型框架稳定性问题: 长句、连读、丢字、漏字、重复
* 小样本学习(few shots & one shot)
* 情感/表现力/可控性(句子内部细粒度控制,风格建模)
* 纯端到端
* 抗噪
* 语音转换
* 歌唱合成
### 语音合成评估
文本分析模块可以有比较客观的指标:precision, recall, fscore 之类的。
生成的语音质量评估方法有:和参考样例之间的距离度量(DTW)、谱包络(MCD)、F0 轮廓、V/UV Error、时长 (Duration RMSE)。
主观指标包括 MOS, CMOS, AB Best, MUSHRA。
### 语音合成数据集
数据质量非常重要
中文: 标贝(DB-1),女性说话人,1 万句,10.3 小时
英文: VCTK, LJSpeech, LibriSpeech, LibriTTS
### 非端到端的语音合成
目前非端到端的语音合成算法有两种,
1)**参数语音合成方法**,其中*声学模型*包括基于隐马尔可夫(HMM)的统计参数语音合成和基于神经网络(NN)的统计参数语音合成,而*声码器*包括基于源-滤波器的声码器和基于NN的声码器
2) **单元拼接语音合成方法** 简单地理解是有一个很大的语音库包含了许多词/音素的发音,用一些方法将各个单元拼接起来。
#### 声学特征
传统声学模型这里的声学特征主要包括 MGC-梅尔广义倒谱, MCEP-梅尔倒谱, LSP-线谱对,这些谱参数加上激励参数(如基频 F0)就是需要拟合的声学特征。而我们的音频通常都是一个个的采样点,谱参数+激励参数是可以还原到音频采样点的。
常用的工具Straight, World, SPTK, [HTS](http://hts.sp.nitech.ac.jp/), [Pysptk](https://github.com/r9y9/pysptk)。
#### 基于HMM的统计参数语音合成
HMM 应用到 TTS 这里和 ASR 还是有些区别的。主要参考的论文是 [An Introduction to HMM-Based Speech Synthesis](https://www.researchgate.net/publication/265398553_An_Introduction_to_HMM-Based_Speech_Synthesis):
#### 基于 NN 的参数语音合成
基于 NN 的参数语音合成主要依赖时长模型和声学模型。
### 风格化和个性化语音合成
风格化和个性化语音合成,难点有三个方面:
- 风格化: 需要合成丰富且可控的语音,包括语速、停顿、重音、情感等。
- 个性化: 要求我们利用多说话人建模技术及说话人自适应技术,在少量录音室或非录音室数据的条件下,为某一新说话人定制语音合成模型。
- 迁移学习: 在只有一种语言的训练数据集下让说话人说另一种语言或者让说话人学习另一说话人的风格。迁移学习使我们能够利用额外的数据进行知识迁移,进而完成一些特定任务。
建模和评估比较困难、数据集标注成本高,标注人员对风格问题容易产生分歧、模型缺乏控制合成语音风格的能力。
## Reference
* https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/09/26/TTS/
* https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/25/TTS2/
* https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/12/04/TTS6/

@ -1,8 +1,9 @@
# Aishell-1
## Deepspeech2
| Model | release | Config | Test set | CER |
| --- | --- | --- | --- | --- |
| DeepSpeech2 | 2.1 | conf/deepspeech2.yaml | test | 0.078671 |
| DeepSpeech2 | 2.0 | conf/deepspeech2.yaml | test | 0.078977 |
| DeepSpeech2 | 1.8.5 | - | test | 0.080447 |
| Model | release | Config | Test set | Loss | CER |
| --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
| DeepSpeech2 | 1.8.5 | - | test | - | 0.080447 |

@ -1,23 +0,0 @@
#! /usr/bin/env bash

# Average `avg_num` checkpoints from `ckpt_dir` into a single parameter file
# written to ${ckpt_dir}/avg_${avg_num}.pdparams.
# Requires MAIN_ROOT to be set (it locates utils/avg_model.py).

if [ $# -ne 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    # exit statuses must be in 0-255; use 1 instead of -1
    exit 1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint="${ckpt_dir}/avg_${average_num}.pdparams"

# Test the command directly instead of inspecting $? afterwards.
if ! python3 -u "${MAIN_ROOT}/utils/avg_model.py" \
    --dst_model "${decode_checkpoint}" \
    --ckpt_dir "${ckpt_dir}" \
    --num "${average_num}" \
    --val_best; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0

@ -24,7 +24,7 @@ data:
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: False
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
@ -76,7 +76,7 @@ model:
training:
n_epoch: 240
accum_grad: 2
global_grad_clip: 5.0
global_grad_clip: 3.0
optim: adam
optim_conf:
lr: 0.002

@ -15,6 +15,10 @@ fi
config_path=$1
ckpt_prefix=$2
# Checkpoint name: the last path component of ckpt_prefix.
ckpt_name=$(basename ${ckpt_prefix})
mkdir -p exp
# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
@ -25,11 +29,13 @@ ckpt_prefix=$2
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
batch_size=64
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
@ -42,11 +48,13 @@ done
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}

@ -0,0 +1,2 @@
data
exp

@ -0,0 +1,85 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
#A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys.
#The path of the CC-CEDICT file (e.g. cedict_ts.u8) is passed as the first command-line argument and the output JSON-lines path as the second.
#Lines starting with '#' (such as the header comments at the top of the CEDICT file) and lines starting with '%' are skipped by parse_line, so they do not need to be deleted by hand.
#Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply remove the call to remove_surnames() in main().
#This code was written by Franki Allegra in February 2020.
import json
import sys
# usage: bin ccedict dump.json
# Read the whole CC-CEDICT file given as the first command-line argument.
# The encoding is stated explicitly so that decoding does not depend on the
# process locale.
with open(sys.argv[1], 'rt', encoding='utf-8') as file:
    text = file.read()
# One element per source line; dict_lines is the list main() iterates over.
lines = text.split('\n')
dict_lines = list(lines)
def parse_line(line):
    """Parse one CC-CEDICT line and append the entry to ``list_of_dicts``.

    An entry line is split on '/': the part before the first '/' holds the
    traditional and simplified forms followed by the pinyin in square
    brackets, and the first '/'-delimited field after it is kept as the
    English gloss.

    Args:
        line (str): one line of the dictionary file, without the newline.

    Returns:
        0 when the line is skipped (blank line, '#' or '%' line, or no
        '/'-separated gloss); None when an entry was appended.
    """
    # Blank lines, '#' comments and '%' lines carry no entry. The shared
    # dict_lines list is deliberately left untouched: it is being iterated
    # by the caller, and removing from it would make the loop skip the line
    # that follows a blank one.
    if line == '' or line.startswith(('#', '%')):
        return 0

    fields = line.rstrip('/').split('/')
    if len(fields) <= 1:
        return 0

    english = fields[1]
    # "<traditional> <simplified> [<pinyin>] " -> head and pinyin parts
    char_and_pinyin = fields[0].split('[')
    characters = char_and_pinyin[0].split()
    pinyin = char_and_pinyin[1].rstrip().rstrip("]")

    list_of_dicts.append({
        'traditional': characters[0],
        'simplified': characters[1],
        'pinyin': pinyin,
        'english': english,
    })
def remove_surnames():
    """Drop surname entries that are followed by another entry for the same word.

    Walks ``list_of_dicts`` from the end to the start and removes, in place,
    every entry whose English gloss contains "surname " when the entry right
    after it has the same traditional form.
    """
    # Iterate backwards so that popping index x never shifts an index that
    # is still to be visited.
    for x in range(len(list_of_dicts) - 1, -1, -1):
        if "surname " not in list_of_dicts[x]['english']:
            continue
        # The last entry has no successor to compare with; without this
        # guard a trailing surname entry raises IndexError.
        if x + 1 >= len(list_of_dicts):
            continue
        if list_of_dicts[x]['traditional'] == list_of_dicts[x + 1][
                'traditional']:
            list_of_dicts.pop(x)
def main():
    """Convert the loaded dictionary lines into a JSON-lines file.

    Parses every element of ``dict_lines``, removes duplicate surname
    entries, then writes each collected entry as one JSON object per line
    to the path given as the second command-line argument.
    """
    # Turn each source line into a dictionary entry.
    print("Parsing dictionary . . .")
    for raw_line in dict_lines:
        parse_line(raw_line)

    # Optional clean-up step: drop surname entries from the data.
    print("Removing Surnames . . .")
    remove_surnames()

    print("Saving to database (this may take a few minutes) . . .")
    with open(sys.argv[2], 'wt') as out_file:
        for entry in list_of_dicts:
            out_file.write(json.dumps(entry) + "\n")

    print('Done!')
# Module-level accumulator: parse_line() appends entries to it,
# remove_surnames() prunes it and main() writes it out.
list_of_dicts = []
# Runs the whole conversion when the file is executed. main() has no return
# statement, so parsed_dict is always None.
parsed_dict = main()

@ -0,0 +1,10 @@
# Repository root: two directory levels above the current working directory.
export MAIN_ROOT=${PWD}/../../
# Put the repository root and its utils/ directory first on PATH.
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Prepend the repository root to Python's module search path.
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
# Prepend /usr/local/lib to the shared-library search path.
export LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH}

@ -0,0 +1,39 @@
#!/bin/bash

# CC-CEDICT download: https://www.mdbg.net/chinese/dictionary?page=cc-cedict
# The word dictionary of this website is based on CC-CEDICT.
# CC-CEDICT is a continuation of the CEDICT project started by Paul Denisowski in 1997 with the
# aim to provide a complete downloadable Chinese to English dictionary with pronunciation in pinyin for the Chinese characters.
# This website allows you to easily add new entries or correct existing entries in CC-CEDICT.
# Submitted entries will be checked and processed frequently and released for download in CEDICT format on this page.

set -e
source path.sh

# Stages: -1 downloads and unpacks the dictionary, 0 converts it to JSON lines.
stage=-1
stop_stage=100

# exit statuses must be in 0-255; use 1 instead of -1
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

cedict_url=https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip
cedict=cedict_1_0_ts_utf-8_mdbg.zip

mkdir -p data
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
    # Download only when the archive is not already present.
    test -f data/${cedict} || wget -O data/${cedict} ${cedict_url}
    # -o overwrites previously extracted files without prompting, so the
    # stage can be re-run non-interactively; -d extracts into data/.
    unzip -o -d data data/${cedict}
fi

mkdir -p exp
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    cp data/cedict_ts.u8 exp/cedict
    python3 local/parser.py exp/cedict exp/cedict.json
fi

@ -0,0 +1,2 @@
data
exp

@ -0,0 +1,5 @@
# Download Baker dataset
The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download the dataset.
Download URL https://test.data-baker.com/#/data/index/source.

@ -0,0 +1,53 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style
def extract_pinyin(source, target, use_jieba=False):
    """Generate pinyin transcriptions for the text lines of a label file.

    Only the even-numbered lines (0-based) of ``source`` are used; each must
    consist of a sentence id and the raw text separated by whitespace. Every
    ``#<digit>`` marker is removed from the text, which is then converted
    with ``lazy_pinyin`` and written to ``target`` as
    ``"<sentence_id> <syllables joined by spaces>"``.

    Args:
        source: path of the input label file (read as UTF-8).
        target: path of the output file (written as UTF-8).
        use_jieba: when True, segment the text with ``jieba.lcut`` before
            the pinyin conversion.
    """
    with open(source, 'rt', encoding='utf-8') as reader, \
            open(target, 'wt', encoding='utf-8') as writer:
        for line_no, line in enumerate(reader):
            # Odd lines are not used by this script.
            if line_no % 2 != 0:
                continue

            sentence_id, raw_text = line.strip().split()
            raw_text = re.sub(r'#\d', '', raw_text)
            if use_jieba:
                raw_text = jieba.lcut(raw_text)

            syllables = lazy_pinyin(
                raw_text,
                errors='ignore',
                style=Style.TONE3,
                neutral_tone_with_five=True)
            transcription = ' '.join(syllables)
            writer.write(f'{sentence_id} {transcription}\n')
if __name__ == "__main__":
    # Command-line entry point: two positional paths plus an optional
    # switch that turns on jieba word segmentation.
    cli = argparse.ArgumentParser(description="extract baker pinyin labels")
    positional_help = (
        ("input", "source file of baker's prosody label file"),
        ("output", "target file to write pinyin lables"),
    )
    for arg_name, arg_help in positional_help:
        cli.add_argument(arg_name, type=str, help=arg_help)
    cli.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")

    options = cli.parse_args()
    extract_pinyin(options.input, options.output, use_jieba=options.use_jieba)

@ -0,0 +1,37 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def extract_pinyin_lables(source, target):
    """Extract pinyin labels from Baker's prosody labeling.

    ``source`` is read as consecutive pairs of lines. From the first line
    of a pair only the leading whitespace-delimited token (the sentence id)
    is used; the second line, stripped, is the transcription. Each pair is
    written to ``target`` as ``"<sentence_id> <transcription>"``.

    Args:
        source: path of the label file (read as UTF-8).
        target: path of the output file (written as UTF-8).

    Raises:
        ValueError: if a line that should carry a sentence id is blank.
    """
    with open(source, 'rt', encoding='utf-8') as fin, \
            open(target, 'wt', encoding='utf-8') as fout:
        for i, line in enumerate(fin):
            if i % 2 == 0:
                # Only the id is needed. Splitting once keeps text that
                # itself contains whitespace from breaking the parse.
                fields = line.split(maxsplit=1)
                if not fields:
                    raise ValueError(
                        f"line {i + 1}: expected a sentence id, got a blank line")
                fout.write(f'{fields[0]} ')
            else:
                transcription = line.strip()
                fout.write(f'{transcription}\n')
if __name__ == "__main__":
    # Command-line entry point: takes the input and output paths as two
    # positional arguments.
    cli = argparse.ArgumentParser(description="extract baker pinyin labels")
    positional_help = (
        ("input", "source file of baker's prosody label file"),
        ("output", "target file to write pinyin lables"),
    )
    for arg_name, arg_help in positional_help:
        cli.add_argument(arg_name, type=str, help=arg_help)

    options = cli.parse_args()
    extract_pinyin_lables(options.input, options.output)

@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from typing import List, Union
from pathlib import Path
def erized(syllable: str) -> bool:
    """Whether the syllable contains erhua effect.

    The check inspects the second-to-last character, so the syllable is
    expected to end with a single tone character (e.g. a tone digit).

    Example
    --------
    huar1 -> True
    guanr1 -> True
    er2 -> False
    """
    # note: for pinyin, len(syllable) >=2 is always true
    # if not: there is something wrong in the data
    assert len(syllable) >= 2, f"inavlid syllable {syllable}"
    return syllable[:2] != "er" and syllable[-2] == 'r'


def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
    """Return the reference syllables with tone sandhi undone.

    Given a sequence of syllables from human annotation (reference), which
    makes sandhi explicit, and a sequence of syllables from some simple g2p
    program (generated), which does not consider sandhi, return the
    reference sequence while ignoring sandhi: wherever the two agree except
    that the reference has tone 2 and the generated syllable has tone 3,
    the generated syllable is used.

    An erized reference syllable is kept as is and consumes two generated
    syllables.

    Example
    --------
    ['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']

    Raises:
        AssertionError: if the generated sequence is not fully consumed.
        IndexError: if the generated sequence is too short.
    """
    i = 0  # cursor into reference
    j = 0  # cursor into generated; runs ahead of i after every erhua syllable
    # sandhi ignored in the result while other errors are not included
    result = []
    while i < len(reference):
        if erized(reference[i]):
            result.append(reference[i])
            i += 1
            j += 2
        # Compare against generated[j], not generated[i]: the two cursors
        # diverge once an erized syllable has been seen.
        elif reference[i][:-1] == generated[j][:-1] and reference[i][
                -1] == '2' and generated[j][-1] == '3':
            result.append(generated[j])
            i += 1
            j += 1
        else:
            result.append(reference[i])
            i += 1
            j += 1
    assert j == len(
        generated
    ), "length of transcriptions mismatch, There may be some characters that are ignored in the generated transcription."
    return result
def convert_transcriptions(reference: Union[str, Path], generated: Union[str, Path], output: Union[str, Path]):
    """Write the reference transcriptions with tone sandhi ignored.

    The reference and generated files are read in lockstep, one sentence
    per line in the form ``"<sentence_id> <syllables>"``. For each pair the
    result of ``ignore_sandhi`` is written to ``output`` under the
    reference's sentence id. If ``ignore_sandhi`` fails for a sentence, a
    message is printed and the reference transcription is written
    unchanged.

    Args:
        reference: path of the human-annotated transcription file.
        generated: path of the g2p-generated transcription file.
        output: path of the file to write.
    """
    # The encoding is explicit so the result does not depend on the locale.
    with open(reference, 'rt', encoding='utf-8') as f_ref, \
            open(generated, 'rt', encoding='utf-8') as f_gen, \
            open(output, 'wt', encoding='utf-8') as f_out:
        for ref, gen in zip(f_ref, f_gen):
            sentence_id, ref_transcription = ref.strip().split(' ', 1)
            _, gen_transcription = gen.strip().split(' ', 1)
            try:
                result = ignore_sandhi(ref_transcription.split(),
                                       gen_transcription.split())
                result = ' '.join(result)
            except Exception:
                # Deliberate best effort: any failure falls back to the
                # reference transcription.
                print(
                    f"sentence_id: {sentence_id} There is some annotation error in the reference or generated transcription. Use the reference."
                )
                result = ref_transcription
            f_out.write(f"{sentence_id} {result}\n")
if __name__ == "__main__":
    # Command-line entry point. All three paths are needed by
    # convert_transcriptions, so they are marked required: a missing flag
    # is reported by argparse instead of reaching open() as None.
    parser = argparse.ArgumentParser(
        description="reference transcription but ignore sandhi.")
    parser.add_argument(
        "--reference",
        type=str,
        required=True,
        help="path to the reference transcription of baker dataset.")
    parser.add_argument(
        "--generated",
        type=str,
        required=True,
        help="path to the generated transcription.")
    parser.add_argument(
        "--output", type=str, required=True, help="path to save result.")
    args = parser.parse_args()
    convert_transcriptions(args.reference, args.generated, args.output)

@ -0,0 +1,33 @@
#!/bin/bash

# Check the Baker archive in data_dir against a known MD5 sum and extract
# the prosody label file from it into exp_dir.
# Requires MAIN_ROOT to be set (it locates utils/parse_options.sh).

exp_dir="exp"
data_dir="data"

# exit statuses must be in 0-255; use 1 instead of -1
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

archive=${data_dir}/"BZNSYP.rar"
if [ ! -f "${archive}" ]; then
    echo "Baker Dataset not found! Download it first to the data_dir."
    exit 1
fi

MD5='c4350563bf7dc298f7dd364b2607be83'
# md5sum prints "<hash>  <file>"; keep the first field.
md5_result=$(md5sum "${archive}" | awk '{print $1}')
# Quoted so that an empty result is a mismatch rather than a syntax error.
if [ "${md5_result}" != "${MD5}" ]; then
    echo "MD5 mismatch! The Archive has been changed."
    exit 1
fi

label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
# The extracted file is picked up from the current directory and moved
# into exp_dir.
unrar e "${archive}" "${label_file}"
cp "${filename}" "${exp_dir}"
rm -f "${filename}"

if [ ! -f "${exp_dir}/${filename}" ];then
    echo "File extraction failed!"
    # A bare `exit` would return the status of the echo above (0).
    exit 1
fi

exit 0

@ -0,0 +1,8 @@
# Repository root: two directory levels above the current working directory.
export MAIN_ROOT=${PWD}/../../
# Put the repository root and its utils/ directory first on PATH.
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Prepend the repository root to Python's module search path.
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

@ -0,0 +1,33 @@
#!/usr/bin/env bash

source path.sh

# Stages: 0 extracts the prosody label file, 1 derives the pinyin files.
stage=-1
stop_stage=100

exp_dir=exp
data_dir=data

# exit statuses must be in 0-255; use 1 instead of -1
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

mkdir -p ${exp_dir}

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    echo "stage 0: Extracting Prosody Labeling"
    # Stop here when extraction fails; stage 1 needs the extracted file.
    bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir} || exit 1
fi

# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
filename="000001-010000.txt"

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "stage 1: Processing transcriptions..."
    python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/ref.pinyin

    python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/trans.pinyin
    python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/trans.jieba.pinyin
fi

echo "done"
exit 0

@ -2,7 +2,8 @@
## Deepspeech2
| Model | Config | Test set | WER |
| --- | --- | --- | --- |
| DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.073973 |
| DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 |
| Model | release | Config | Test set | Loss | WER |
| --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | test-clean | 15.184467315673828 | 0.072154 |
| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | test-clean | - | 0.073973 |
| DeepSpeech2 | 1.8.5 | - | test-clean | - | 0.074939 |

@ -41,7 +41,7 @@ training:
lr: 1e-3
lr_decay: 0.83
weight_decay: 1e-06
global_grad_clip: 3.0
global_grad_clip: 5.0
log_interval: 100
decoding:

@ -1,23 +0,0 @@
#! /usr/bin/env bash

# Average `avg_num` checkpoints from `ckpt_dir` into a single parameter file
# written to ${ckpt_dir}/avg_${avg_num}.pdparams.
# Requires MAIN_ROOT to be set (it locates utils/avg_model.py).

if [ $# -ne 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    # exit statuses must be in 0-255; use 1 instead of -1
    exit 1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint="${ckpt_dir}/avg_${average_num}.pdparams"

# Test the command directly instead of inspecting $? afterwards.
if ! python3 -u "${MAIN_ROOT}/utils/avg_model.py" \
    --dst_model "${decode_checkpoint}" \
    --ckpt_dir "${ckpt_dir}" \
    --num "${average_num}" \
    --val_best; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0

@ -4,7 +4,7 @@ source path.sh
stage=0
stop_stage=100
conf_path=conf/transformer.yaml
conf_path=conf/deepspeech2.yaml
avg_num=30
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

@ -14,7 +14,7 @@ data:
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80
@ -77,7 +77,7 @@ model:
training:
n_epoch: 120
accum_grad: 8
global_grad_clip: 5.0
global_grad_clip: 3.0
optim: adam
optim_conf:
lr: 0.004

@ -1,23 +0,0 @@
#! /usr/bin/env bash

# Average `avg_num` checkpoints from `ckpt_dir` into a single parameter file
# written to ${ckpt_dir}/avg_${avg_num}.pdparams.
# Requires MAIN_ROOT to be set (it locates utils/avg_model.py).

if [ $# -ne 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    # exit statuses must be in 0-255; use 1 instead of -1
    exit 1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint="${ckpt_dir}/avg_${average_num}.pdparams"

# Test the command directly instead of inspecting $? afterwards.
if ! python3 -u "${MAIN_ROOT}/utils/avg_model.py" \
    --dst_model "${decode_checkpoint}" \
    --ckpt_dir "${ckpt_dir}" \
    --num "${average_num}" \
    --val_best; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0

@ -1,23 +0,0 @@
#! /usr/bin/env bash

# Average `avg_num` checkpoints from `ckpt_dir` into a single parameter file
# written to ${ckpt_dir}/avg_${avg_num}.pdparams.
# Requires MAIN_ROOT to be set (it locates utils/avg_model.py).

if [ $# -ne 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    # exit statuses must be in 0-255; use 1 instead of -1
    exit 1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint="${ckpt_dir}/avg_${average_num}.pdparams"

# Test the command directly instead of inspecting $? afterwards.
if ! python3 -u "${MAIN_ROOT}/utils/avg_model.py" \
    --dst_model "${decode_checkpoint}" \
    --ckpt_dir "${ckpt_dir}" \
    --num "${average_num}" \
    --val_best; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0

@ -1,23 +0,0 @@
#! /usr/bin/env bash

# Average `avg_num` checkpoints from `ckpt_dir` into a single parameter file
# written to ${ckpt_dir}/avg_${avg_num}.pdparams.
# Requires MAIN_ROOT to be set (it locates utils/avg_model.py).

if [ $# -ne 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    # exit statuses must be in 0-255; use 1 instead of -1
    exit 1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint="${ckpt_dir}/avg_${average_num}.pdparams"

# Test the command directly instead of inspecting $? afterwards.
if ! python3 -u "${MAIN_ROOT}/utils/avg_model.py" \
    --dst_model "${decode_checkpoint}" \
    --ckpt_dir "${ckpt_dir}" \
    --num "${average_num}" \
    --val_best; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0

@ -18,7 +18,6 @@ import paddle
from deepspeech.modules.mask import make_non_pad_mask
from deepspeech.modules.mask import make_pad_mask
from deepspeech.modules.mask import sequence_mask
class TestU2Model(unittest.TestCase):
@ -36,16 +35,10 @@ class TestU2Model(unittest.TestCase):
[False, False, True, True, True],
])
def test_sequence_mask(self):
res = sequence_mask(self.lengths, dtype='bool')
self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
def test_make_non_pad_mask(self):
res = make_non_pad_mask(self.lengths)
res1 = sequence_mask(self.lengths, dtype='bool')
res2 = make_pad_mask(self.lengths).logical_not()
self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
self.assertSequenceEqual(res.numpy().tolist(), res1.numpy().tolist())
self.assertSequenceEqual(res.numpy().tolist(), res2.numpy().tolist())
def test_make_pad_mask(self):

@ -18,3 +18,7 @@ licence: MIT
* [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization.git)
commit: 9e92c7bf2d6b5a7974305406d8e240045beac51c
licence: MIT
* [phkit](https://github.com/KuangDD/phkit.git)
commit: b2100293c1e36da531d7f30bd52c9b955a649522
licence: None

@ -0,0 +1,155 @@
![phkit](phkit.png "phkit")
## phkit
phoneme toolkit: 拼音相关的文本处理工具箱,中文和英文的语音合成前端文本解决方案。
#### 安装
```
pip install -U phkit
```
#### 版本
v0.2.8
### pinyinkit
文本转拼音的模块,依赖 python-pinyin、jieba、phrase-pinyin-data 模块。
### chinese
适用于中文、英文和中英混合的音素,其中汉字拼音采用清华大学的音素,英文字符分字母和英文。
- 中文音素简介:
```
声母:
aa b c ch d ee f g h ii j k l m n oo p q r s sh t uu vv x z zh
韵母:
a ai an ang ao e ei en eng er i ia ian iang iao ie in ing iong iu ix iy iz o ong ou u ua uai uan uang ueng ui un uo v van ve vn ng uong
声调:
1 2 3 4 5
字母:
Aa Bb Cc Dd Ee Ff Gg Hh Ii Jj Kk Ll Mm Nn Oo Pp Qq Rr Ss Tt Uu Vv Ww Xx Yy Zz
英文:
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
标点:
! ? . , ; : " # ( )
注:!=!|?=?|.=.。|,=,,、|;=;|:=:|"="“|#=#   |(=([{{【<《|)=)]}}】>》
预留:
w y 0 6 7 8 9
w=%|y=$|0=0|6=6|7=7|8=8|9=9
其他:
_ ~ - *
```
#### symbol
音素标记。
中文音素,简单英文音素,简单中文音素。
#### sequence
转为序列的方法,文本转为音素列表,文本转为ID列表。
拼音变调,拼音转音素。
#### pinyin
转为拼音的方法,汉字转拼音,分离声调。
拼音为字母+数字形式,例如pin1。
#### phoneme
音素映射表。
不带声调拼音转为音素,声调转音素,英文字母转音素,标点转音素。
#### number
数字读法。
按数值大小读,一个一个数字读。
#### convert
文本转换。
全角半角转换,简体繁体转换。
#### style
拼音格式转换。
国标样式的拼音和字母数字的样式的拼音相互转换。
### english
from https://github.com/keithito/tacotron :
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
### 历史版本
#### v0.2.8
- 文本转拼音轻声用5表示音调。
- 文本转拼音确保文本和拼音一一对应,文本长度和拼音列表长度相同。
- 增加拼音格式转换,国标格式和字母数字格式相互转换。
#### v0.2.7
- 所有中文音素都能被映射到。
#### v0.2.5
- 修正拼音转音素的潜在bug。
#### v0.2.4
- 修正几个默认拼音。
#### v0.2.3
- 汉字转拼音轻量化。
- 词语拼音词典去除全都是默认拼音的词语。
#### v0.2.2
- 修正安装依赖报错问题。
#### v0.2.1
- 增加中文的 text_to_sequence 方法,可替换英文版本应对中文环境。
- 兼容 v0.1.0 之前版本,需要在 python3.7 版本以上,否则请改为从 phkit.chinese 导入模块。
#### v0.2.0
- 增加文本转拼音的模块,依赖 python-pinyin、jieba、phrase-pinyin-data 模块。
- 中文的音素方案移动到chinese模块。
#### v0.1.0
- 增加英文版本的音素方案,包括英文字母和英文音素。
- 增加简单的数字转中文的方法。
#### todo
```
文本正则化处理
数字读法
字符读法
常见规则读法
文本转拼音
pypinyin
国标和alnum转换
anything转音素
字符
英文
汉字
OOV
进阶:
分词
命名实体识别
依存句法分析
```

@ -0,0 +1,115 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2020/2/17
"""
![phkit](phkit.png "phkit")
## phkit
phoneme toolkit: 拼音相关的文本处理工具箱中文和英文的语音合成前端文本解决方案
#### 安装
```
pip install -U phkit
```
"""
__version__ = "0.2.8"
version_doc = """
#### 版本
v{}
""".format(__version__)
history_doc = """
### 历史版本
#### v0.2.8
- 文本转拼音轻声用5表示音调
- 文本转拼音确保文本和拼音一一对应文本长度和拼音列表长度相同
- 增加拼音格式转换国标格式和字母数字格式相互转换
#### v0.2.7
- 所有中文音素都能被映射到
#### v0.2.5
- 修正拼音转音素的潜在bug
#### v0.2.4
- 修正几个默认拼音
#### v0.2.3
- 汉字转拼音轻量化
- 词语拼音词典去除全都是默认拼音的词语
#### v0.2.2
- 修正安装依赖报错问题
#### v0.2.1
- 增加中文的text_to_sequence方法可替换英文版本应对中文环境
- 兼容v0.1.0之前版本需要在python3.7版本以上否则请改为从phkit.chinese导入模块
#### v0.2.0
- 增加文本转拼音的模块依赖python-pinyinjiebaphrase-pinyin-data模块
- 中文的音素方案移动到chinese模块
#### v0.1.0
- 增加英文版本的音素方案包括英文字母和英文音素
- 增加简单的数字转中文的方法
#### todo
```
文本正则化处理
数字读法
字符读法
常见规则读法
文本转拼音
pypinyin
国标和alnum转换
anything转音素
字符
英文
汉字
OOV
进阶:
分词
命名实体识别
依存句法分析
```
"""
from phkit.chinese import __doc__ as doc_chinese
from phkit.chinese.symbol import __doc__ as doc_symbol
from phkit.chinese.sequence import __doc__ as doc_sequence
from phkit.chinese.pinyin import __doc__ as doc_pinyin
from phkit.chinese.phoneme import __doc__ as doc_phoneme
from phkit.chinese.number import __doc__ as doc_number
from phkit.chinese.convert import __doc__ as doc_convert
from phkit.chinese.style import __doc__ as doc_style
from .english import __doc__ as doc_english
from .pinyinkit import __doc__ as doc_pinyinkit
readme_docs = [__doc__, version_doc,
doc_pinyinkit,
doc_chinese, doc_symbol, doc_sequence, doc_pinyin, doc_phoneme, doc_number, doc_convert, doc_style,
doc_english,
history_doc]
from .chinese import text_to_sequence as chinese_text_to_sequence, sequence_to_text as chinese_sequence_to_text
from .english import text_to_sequence as english_text_to_sequence, sequence_to_text as english_sequence_to_text
from .pinyinkit import lazy_pinyin
# 兼容0.1.0之前的版本python3.7以上版本支持。
from .chinese import convert, number, phoneme, sequence, symbol, style
from .chinese.style import guobiao2shengyundiao, shengyundiao2guobiao
from .chinese.convert import fan2jian, jian2fan, quan2ban, ban2quan
from .chinese.number import say_digit, say_decimal, say_number
from .chinese.pinyin import text2pinyin, split_pinyin
from .chinese.sequence import text2sequence, text2phoneme, pinyin2phoneme, phoneme2sequence, sequence2phoneme
from .chinese.sequence import symbol_chinese, ph2id_dict, id2ph_dict
if __name__ == "__main__":
print(__file__)

@ -0,0 +1,79 @@
"""
### chinese
适用于中文英文和中英混合的音素其中汉字拼音采用清华大学的音素英文字符分字母和英文
- 中文音素简介
```
声母
aa b c ch d ee f g h ii j k l m n oo p q r s sh t uu vv x z zh
韵母
a ai an ang ao e ei en eng er i ia ian iang iao ie in ing iong iu ix iy iz o ong ou u ua uai uan uang ueng ui un uo v van ve vn ng uong
声调
1 2 3 4 5
字母
Aa Bb Cc Dd Ee Ff Gg Hh Ii Jj Kk Ll Mm Nn Oo Pp Qq Rr Ss Tt Uu Vv Ww Xx Yy Zz
英文
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
标点
! ? . , ; : " # ( )
!=!|?=?|.=.|,=,|;=;|:=:|"="|#=#  \t|(=([{{【<《|)=)]}}】>》
预留
w y 0 6 7 8 9
w=%|y=$|0=0|6=6|7=7|8=8|9=9
其他
_ ~ - *
```
"""
from .convert import fan2jian, jian2fan, quan2ban, ban2quan
from .number import say_digit, say_decimal, say_number
from .pinyin import text2pinyin, split_pinyin
from .sequence import text2sequence, text2phoneme, pinyin2phoneme, phoneme2sequence, sequence2phoneme, change_diao
from .sequence import symbol_chinese, ph2id_dict, id2ph_dict
from .symbol import symbol_chinese as symbols
from .phoneme import shengyun2ph_dict
def text_to_sequence(src, cleaner_names=None, **kwargs):
    """Convert a pinyin string or a text string into a list of IDs.

    Text example: 卡尔普陪外孙玩滑梯
    Pinyin example: ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1 .

    :param src: str, pinyin or plain text.
    :param cleaner_names: processing mode. ``"pinyin"`` treats ``src`` as
        whitespace-separated pinyin tokens; any other value hands ``src`` to
        ``text2sequence``.
    :param kwargs: accepted but not used.
    :return: list, the ID sequence.
    """
    if cleaner_names != "pinyin":
        return text2sequence(src)

    # Alphanumeric tokens are passed on as str; every other token is wrapped
    # in a 1-tuple before being handed to pinyin2phoneme.
    tokens = [tok if tok.isalnum() else (tok,) for tok in src.split()]
    return phoneme2sequence(change_diao(pinyin2phoneme(tokens)))
def sequence_to_text(src):
    """Map an ID sequence back to phonemes and join them with single spaces."""
    return " ".join(sequence2phoneme(src))
if __name__ == "__main__":
print(__file__)
text = "ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1 . "
out = text_to_sequence(text)
print(out)
out = sequence_to_text(out)
print(out)

@ -0,0 +1,51 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2020/2/17
"""
#### convert
文本转换
全角半角转换简体繁体转换
"""
from .hanziconv import HanziConv
hc = HanziConv()
# 繁体转简体
fan2jian = hc.toSimplified
# 简体转繁体
jian2fan = hc.toTraditional
# Half-width -> full-width code point table: code points 33..126 are shifted
# by 65248, and the space (32) maps to code point 12288.
ban2quan_dict = dict(zip(range(33, 127), range(33 + 65248, 127 + 65248)))
ban2quan_dict[32] = 12288

# Full-width -> half-width table: the inverse of the table above.
quan2ban_dict = dict((full, half) for half, full in ban2quan_dict.items())


def ban2quan(text: str):
    """Convert half-width characters in ``text`` to their full-width forms.

    :param text: input string.
    :return: ``text`` translated through ``ban2quan_dict``; characters that
        are not in the table are left unchanged.
    """
    converted = text.translate(ban2quan_dict)
    return converted


def quan2ban(text: str):
    """Convert full-width characters in ``text`` to their half-width forms.

    :param text: input string.
    :return: ``text`` translated through ``quan2ban_dict``; characters that
        are not in the table are left unchanged.
    """
    converted = text.translate(quan2ban_dict)
    return converted
if __name__ == "__main__":
assert ban2quan("aA1 ,:$。、") == "aA1 ,:$。、"
assert quan2ban("aA1 ,:$。、") == "aA1 ,:$。、"
assert jian2fan("中国语言") == "中國語言"
assert fan2jian("中國語言") == "中国语言"

@ -0,0 +1,99 @@
# Copyright 2014 Bernard Yue
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
__doc__ = """
Hanzi Converter 繁簡轉換器 | 繁简转换器
This module provides functions converting chinese text between simplified and
traditional characters. It returns unicode representation of the text.
Class HanziConv is the main entry point of the module, you can import the
class by doing:
>>> from hanziconv import HanziConv
"""
import os
from zhon import cedict
class HanziConv:
    """Converter between simplified and traditional hanzi (漢字).

    The two character maps come from ``zhon.cedict`` and are used as parallel
    sequences: a character found at some index of one map is replaced by the
    character at the same index of the other map.
    """
    __traditional_charmap = cedict.traditional
    __simplified_charmap = cedict.simplified

    @classmethod
    def __convert(cls, text, toTraditional=True):
        """Convert ``text`` character by character between the two scripts.

        :param text: data to convert; ``bytes`` input is decoded as UTF-8
        :param toTraditional: True -- convert to traditional text
                              False -- convert to simplified text
        :returns: converted ``text``; characters missing from the source map
                  are copied through unchanged
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        if toTraditional:
            source = cls.__simplified_charmap
            target = cls.__traditional_charmap
        else:
            source = cls.__traditional_charmap
            target = cls.__simplified_charmap

        pieces = []
        for char in text:
            position = source.find(char)
            pieces.append(char if position == -1 else target[position])
        return ''.join(pieces)

    @classmethod
    def toSimplified(cls, text):
        """Convert ``text``, assumed to be traditional, to simplified characters.

        :param text: text to convert
        :returns: converted string

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toSimplified('繁簡轉換器'))
        繁简转换器
        """
        return cls.__convert(text, toTraditional=False)

    @classmethod
    def toTraditional(cls, text):
        """Convert ``text``, assumed to be simplified, to traditional characters.

        :param text: text to convert
        :returns: converted string

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toTraditional('繁简转换器'))
        繁簡轉換器
        """
        return cls.__convert(text, toTraditional=True)

    @classmethod
    def same(cls, text1, text2):
        """Tell whether two texts are equal once both are simplified.

        :param text1: string to compare to ``text2``
        :param text2: string to compare to ``text1``
        :returns: **True** -- the simplified forms of both texts are equal,
                  **False** -- otherwise

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
        True
        """
        return cls.toSimplified(text1) == cls.toSimplified(text2)

@ -0,0 +1,90 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2020/2/16
"""
#### number
数字读法
按数值大小读一个一个数字读
"""
import re
# Hanzi digits, indexed by their numeric value 0-9.
_number_cn = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
# Unit of each of the 16 supported digit positions, most significant first.
_number_level = ['千', '百', '十', '万', '千', '百', '十', '亿', '千', '百', '十', '万', '千', '百', '十', '个']
_zero = _number_cn[0]
# A leading "一十" is read as plain "十" (10 -> 十, 12 -> 十二).
_ten_re = re.compile(r'^一十')
# Units that close a four-digit group.
_grade_level = {'万', '亿', '个'}
# Splits text into digit runs (captured) and everything in between.
_number_group_re = re.compile(r"([0-9]+)")


def say_digit(num: str) -> str:
    """Read a digit string one digit at a time, e.g. 123 -> 一二三.

    Args:
        num (str): string made of decimal digits.

    Returns:
        str: the hanzi digits, in the same order.
    """
    return ''.join(_number_cn[int(zi)] for zi in num)


def say_number(num: str):
    """Read an integer string by magnitude, e.g. 314 -> 三百一十四.

    Args:
        num (str): string accepted by ``int()``.

    Returns:
        str: the hanzi reading. Values with more than 16 digits are outside
        the unit table and ``num`` is returned unchanged.
    """
    x = str(int(num))
    if x == '0':
        return _number_cn[0]
    elif len(x) > 16:
        return num

    # Index into _number_level of the unit that belongs to the first digit.
    offset = len(_number_level) - len(x)
    outs = []
    for idx, zi in enumerate(x):
        a = _number_cn[int(zi)]
        b = _number_level[offset + idx]
        if a != _zero:
            outs.append(a)
            outs.append(b)
        elif b in _grade_level:
            if outs[-1] != _zero:
                outs.append(b)
            elif b == '万' and idx >= 3 and x[idx - 3:idx + 1] == '0000':
                # The whole 万 group is zero: it must not produce a 万 of its
                # own (100000000 is 一亿, not 一亿万). The pending 零 is kept
                # so that a later non-zero digit is still read with it
                # (100001234 -> 一亿零一千二百三十四).
                pass
            else:
                # A run of zeros ends on a group boundary: the group unit
                # takes the place of the pending 零.
                outs[-1] = b
        elif outs[-1] != _zero:
            # Consecutive zeros inside a group are read as a single 零.
            outs.append(a)

    # The last item is always the 个 unit, which is not spoken.
    out = ''.join(outs[:-1])
    return _ten_re.sub('十', out)


def say_decimal(num: str):
    """Read a decimal string such as 3.14 -> 三点一四.

    The integer part is read by magnitude and the fractional part digit by
    digit. ``num`` must contain exactly one ``.``.
    """
    z, x = num.split('.')
    return say_number(z) + '点' + say_digit(x)


def convert_number(text):
    """Replace every run of ASCII digits in ``text`` with its hanzi reading.

    Runs of up to 9 digits are read by magnitude, longer runs digit by digit;
    all other text is copied through unchanged.
    """
    outs = []
    for elem in _number_group_re.split(text):
        if elem.isdigit():
            if len(elem) <= 9:
                outs.append(say_number(elem))
            else:
                outs.append(say_digit(elem))
        else:
            outs.append(elem)
    return ''.join(outs)
if __name__ == "__main__":
print(__file__)
assert say_number("1234567890123456") == "一千二百三十四万五千六百七十八亿九千零一十二万三千四百五十六"
assert say_digit("123456") == "一二三四五六"
assert say_decimal("3.14") == "三点一四"
assert convert_number("hello314.1592and2718281828") == "hello三百一十四.一千五百九十二and二七一八二八一八二八"

@ -0,0 +1,480 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2020/2/16
"""
#### phoneme
音素映射表
不带声调拼音转为音素声调转音素英文字母转音素标点转音素
"""
# 拼音转音素映射表420
shengyun2ph_dict = {
'a': 'aa a',
'ai': 'aa ai',
'an': 'aa an',
'ang': 'aa ang',
'ao': 'aa ao',
'ba': 'b a',
'bai': 'b ai',
'ban': 'b an',
'bang': 'b ang',
'bao': 'b ao',
'bei': 'b ei',
'ben': 'b en',
'beng': 'b eng',
'bi': 'b i',
'bian': 'b ian',
'biao': 'b iao',
'bie': 'b ie',
'bin': 'b in',
'bing': 'b ing',
'bo': 'b o',
'bu': 'b u',
'ca': 'c a',
'cai': 'c ai',
'can': 'c an',
'cang': 'c ang',
'cao': 'c ao',
'ce': 'c e',
'cen': 'c en',
'ceng': 'c eng',
'ci': 'c iy',
'cong': 'c ong',
'cou': 'c ou',
'cu': 'c u',
'cuan': 'c uan',
'cui': 'c ui',
'cun': 'c un',
'cuo': 'c uo',
'cha': 'ch a',
'chai': 'ch ai',
'chan': 'ch an',
'chang': 'ch ang',
'chao': 'ch ao',
'che': 'ch e',
'chen': 'ch en',
'cheng': 'ch eng',
'chi': 'ch ix',
'chong': 'ch ong',
'chou': 'ch ou',
'chu': 'ch u',
'chuai': 'ch uai',
'chuan': 'ch uan',
'chuang': 'ch uang',
'chui': 'ch ui',
'chun': 'ch un',
'chuo': 'ch uo',
'da': 'd a',
'dai': 'd ai',
'dan': 'd an',
'dang': 'd ang',
'dao': 'd ao',
'de': 'd e',
'dei': 'd ei',
'deng': 'd eng',
'di': 'd i',
'dia': 'd ia',
'dian': 'd ian',
'diao': 'd iao',
'die': 'd ie',
'ding': 'd ing',
'diu': 'd iu',
'dong': 'd ong',
'dou': 'd ou',
'du': 'd u',
'duan': 'd uan',
'dui': 'd ui',
'dun': 'd un',
'duo': 'd uo',
'e': 'ee e',
'ei': 'ee ei',
'en': 'ee en',
'er': 'ee er',
'fa': 'f a',
'fan': 'f an',
'fang': 'f ang',
'fei': 'f ei',
'fen': 'f en',
'feng': 'f eng',
'fo': 'f o',
'fou': 'f ou',
'fu': 'f u',
'ga': 'g a',
'gai': 'g ai',
'gan': 'g an',
'gang': 'g ang',
'gao': 'g ao',
'ge': 'g e',
'gei': 'g ei',
'gen': 'g en',
'geng': 'g eng',
'gong': 'g ong',
'gou': 'g ou',
'gu': 'g u',
'gua': 'g ua',
'guai': 'g uai',
'guan': 'g uan',
'guang': 'g uang',
'gui': 'g ui',
'gun': 'g un',
'guo': 'g uo',
'ha': 'h a',
'hai': 'h ai',
'han': 'h an',
'hang': 'h ang',
'hao': 'h ao',
'he': 'h e',
'hei': 'h ei',
'hen': 'h en',
'heng': 'h eng',
'hong': 'h ong',
'hou': 'h ou',
'hu': 'h u',
'hua': 'h ua',
'huai': 'h uai',
'huan': 'h uan',
'huang': 'h uang',
'hui': 'h ui',
'hun': 'h un',
'huo': 'h uo',
'yi': 'ii i',
'ya': 'ii ia',
'yan': 'ii ian',
'yang': 'ii iang',
'yao': 'ii iao',
'ye': 'ii ie',
'yin': 'ii in',
'ying': 'ii ing',
'yong': 'ii iong',
'you': 'ii iu',
'ji': 'j i',
'jia': 'j ia',
'jian': 'j ian',
'jiang': 'j iang',
'jiao': 'j iao',
'jie': 'j ie',
'jin': 'j in',
'jing': 'j ing',
'jiong': 'j iong',
'jiu': 'j iu',
'ju': 'j v',
'juan': 'j van',
'jue': 'j ve',
'jun': 'j vn',
'ka': 'k a',
'kai': 'k ai',
'kan': 'k an',
'kang': 'k ang',
'kao': 'k ao',
'ke': 'k e',
'ken': 'k en',
'keng': 'k eng',
'kong': 'k ong',
'kou': 'k ou',
'ku': 'k u',
'kua': 'k ua',
'kuai': 'k uai',
'kuan': 'k uan',
'kuang': 'k uang',
'kui': 'k ui',
'kun': 'k un',
'kuo': 'k uo',
'la': 'l a',
'lai': 'l ai',
'lan': 'l an',
'lang': 'l ang',
'lao': 'l ao',
'le': 'l e',
'lei': 'l ei',
'leng': 'l eng',
'li': 'l i',
'lia': 'l ia',
'lian': 'l ian',
'liang': 'l iang',
'liao': 'l iao',
'lie': 'l ie',
'lin': 'l in',
'ling': 'l ing',
'liu': 'l iu',
'lo': 'l o',
'long': 'l ong',
'lou': 'l ou',
'lu': 'l u',
'luan': 'l uan',
'lun': 'l un',
'luo': 'l uo',
'lv': 'l v',
'lve': 'l ve',
'ma': 'm a',
'mai': 'm ai',
'man': 'm an',
'mang': 'm ang',
'mao': 'm ao',
'me': 'm e',
'mei': 'm ei',
'men': 'm en',
'meng': 'm eng',
'mi': 'm i',
'mian': 'm ian',
'miao': 'm iao',
'mie': 'm ie',
'min': 'm in',
'ming': 'm ing',
'miu': 'm iu',
'mo': 'm o',
'mou': 'm ou',
'mu': 'm u',
'na': 'n a',
'nai': 'n ai',
'nan': 'n an',
'nang': 'n ang',
'nao': 'n ao',
'ne': 'n e',
'nei': 'n ei',
'nen': 'n en',
'neng': 'n eng',
'ni': 'n i',
'nian': 'n ian',
'niang': 'n iang',
'niao': 'n iao',
'nie': 'n ie',
'nin': 'n in',
'ning': 'n ing',
'niu': 'n iu',
'nong': 'n ong',
'nu': 'n u',
'nuan': 'n uan',
'nuo': 'n uo',
'nv': 'n v',
'nve': 'n ve',
'o': 'oo o',
'ou': 'oo ou',
'pa': 'p a',
'pai': 'p ai',
'pan': 'p an',
'pang': 'p ang',
'pao': 'p ao',
'pei': 'p ei',
'pen': 'p en',
'peng': 'p eng',
'pi': 'p i',
'pian': 'p ian',
'piao': 'p iao',
'pie': 'p ie',
'pin': 'p in',
'ping': 'p ing',
'po': 'p o',
'pou': 'p ou',
'pu': 'p u',
'qi': 'q i',
'qia': 'q ia',
'qian': 'q ian',
'qiang': 'q iang',
'qiao': 'q iao',
'qie': 'q ie',
'qin': 'q in',
'qing': 'q ing',
'qiong': 'q iong',
'qiu': 'q iu',
'qu': 'q v',
'quan': 'q van',
'que': 'q ve',
'qun': 'q vn',
'ran': 'r an',
'rang': 'r ang',
'rao': 'r ao',
're': 'r e',
'ren': 'r en',
'reng': 'r eng',
'ri': 'r iz',
'rong': 'r ong',
'rou': 'r ou',
'ru': 'r u',
'ruan': 'r uan',
'rui': 'r ui',
'run': 'r un',
'ruo': 'r uo',
'sa': 's a',
'sai': 's ai',
'san': 's an',
'sang': 's ang',
'sao': 's ao',
'se': 's e',
'sen': 's en',
'seng': 's eng',
'si': 's iy',
'song': 's ong',
'sou': 's ou',
'su': 's u',
'suan': 's uan',
'sui': 's ui',
'sun': 's un',
'suo': 's uo',
'sha': 'sh a',
'shai': 'sh ai',
'shan': 'sh an',
'shang': 'sh ang',
'shao': 'sh ao',
'she': 'sh e',
'shei': 'sh ei',
'shen': 'sh en',
'sheng': 'sh eng',
'shi': 'sh ix',
'shou': 'sh ou',
'shu': 'sh u',
'shua': 'sh ua',
'shuai': 'sh uai',
'shuan': 'sh uan',
'shuang': 'sh uang',
'shui': 'sh ui',
'shun': 'sh un',
'shuo': 'sh uo',
'ta': 't a',
'tai': 't ai',
'tan': 't an',
'tang': 't ang',
'tao': 't ao',
'te': 't e',
'teng': 't eng',
'ti': 't i',
'tian': 't ian',
'tiao': 't iao',
'tie': 't ie',
'ting': 't ing',
'tong': 't ong',
'tou': 't ou',
'tu': 't u',
'tuan': 't uan',
'tui': 't ui',
'tun': 't un',
'tuo': 't uo',
'wu': 'uu u',
'wa': 'uu ua',
'wai': 'uu uai',
'wan': 'uu uan',
'wang': 'uu uang',
'weng': 'uu ueng',
'wei': 'uu ui',
'wen': 'uu un',
'wo': 'uu uo',
'yu': 'vv v',
'yuan': 'vv van',
'yue': 'vv ve',
'yun': 'vv vn',
'xi': 'x i',
'xia': 'x ia',
'xian': 'x ian',
'xiang': 'x iang',
'xiao': 'x iao',
'xie': 'x ie',
'xin': 'x in',
'xing': 'x ing',
'xiong': 'x iong',
'xiu': 'x iu',
'xu': 'x v',
'xuan': 'x van',
'xue': 'x ve',
'xun': 'x vn',
'za': 'z a',
'zai': 'z ai',
'zan': 'z an',
'zang': 'z ang',
'zao': 'z ao',
'ze': 'z e',
'zei': 'z ei',
'zen': 'z en',
'zeng': 'z eng',
'zi': 'z iy',
'zong': 'z ong',
'zou': 'z ou',
'zu': 'z u',
'zuan': 'z uan',
'zui': 'z ui',
'zun': 'z un',
'zuo': 'z uo',
'zha': 'zh a',
'zhai': 'zh ai',
'zhan': 'zh an',
'zhang': 'zh ang',
'zhao': 'zh ao',
'zhe': 'zh e',
'zhei': 'zh ei',
'zhen': 'zh en',
'zheng': 'zh eng',
'zhi': 'zh ix',
'zhong': 'zh ong',
'zhou': 'zh ou',
'zhu': 'zh u',
'zhua': 'zh ua',
'zhuai': 'zh uai',
'zhuan': 'zh uan',
'zhuang': 'zh uang',
'zhui': 'zh ui',
'zhun': 'zh un',
'zhuo': 'zh uo',
'cei': 'c ei',
'chua': 'ch ua',
'den': 'd en',
'din': 'd in',
'eng': 'ee eng',
'ng': 'ee ng',
'fiao': 'f iao',
'yo': 'ii o',
'kei': 'k ei',
'len': 'l en',
'nia': 'n ia',
'nou': 'n ou',
'nun': 'n un',
'rua': 'r ua',
'tei': 't ei',
'wong': 'uu uong',
'n': 'n ng'
}
diao2ph_dict = {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5'}
# 字母音素26
_alphabet = 'Aa Bb Cc Dd Ee Ff Gg Hh Ii Jj Kk Ll Mm Nn Oo Pp Qq Rr Ss Tt Uu Vv Ww Xx Yy Zz'.split()
# 字母26
_upper = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
_lower = list('abcdefghijklmnopqrstuvwxyz')
upper2ph_dict = dict(zip(_upper, _alphabet))
lower2ph_dict = dict(zip(_lower, _upper))
# 标点9
_biaodian = '! ? . , ; : " # ( )'.split()
# 注:!=!|?=?|.=.。|,=,,、|;=;|:=:|"="“”'|#=  \t|(=([{{【<《|)=)]}}】>》
biao2ph_dict = {
'!': '!', '': '!',
'?': '?', '': '?',
'.': '.', '': '.',
',': ',', '': ',', '': ',',
';': ';', '': ';',
':': ':', '': ':',
'"': '"', '': '"', '': '"', "'": '"', '': '"', '': '"',
'#': '#', '': '#', ' ': '#', ' ': '#', '\t': '#',
'(': '(', '': '(', '[': '(', '': '(', '{': '(', '': '(', '': '(', '<': '(', '': '(',
')': ')', '': ')', ']': ')', '': ')', '}': ')', '': ')', '': ')', '>': ')', '': ')'
}
# 其他7
_other = 'w y 0 6 7 8 9'.split()
other2ph_dict = {
'%': 'w',
'$': 'y',
'0': '0',
'6': '6',
'7': '7',
'8': '8',
'9': '9'
}
char2ph_dict = {**upper2ph_dict, **lower2ph_dict, **biao2ph_dict, **other2ph_dict}
if __name__ == "__main__":
print(__file__)

@ -0,0 +1,11 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2020/2/17
"""
#### pinyin
转为拼音的方法汉字转拼音分离声调
拼音为字母+数字形式例如pin1
"""
from ..pinyinkit import text2pinyin, split_pinyin

@ -0,0 +1,153 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2020/2/16
"""
#### sequence
转为序列的方法文本转为音素列表文本转为ID列表
拼音变调拼音转音素
"""
from .phoneme import shengyun2ph_dict, diao2ph_dict, char2ph_dict
from .pinyin import text2pinyin, split_pinyin
from .symbol import _chain, _eos, _pad, symbol_chinese
from .convert import fan2jian, quan2ban
from .number import convert_number
import re
# Matches runs of ASCII letters; the capturing group makes re.split keep the
# matched runs in its result.
_en_re = re.compile(r"([a-zA-Z]+)")
# Every phoneme that the mapping tables can emit ...
phs = ({w for p in shengyun2ph_dict.values() for w in p.split()}
       | set(diao2ph_dict.values()) | set(char2ph_dict.values()))
# ... must be one of the symbols in symbol_chinese.
assert bool(phs - set(symbol_chinese)) is False
# Phoneme <-> ID lookup tables; an ID is the position in symbol_chinese.
ph2id_dict = {p: i for i, p in enumerate(symbol_chinese)}
id2ph_dict = {i: p for i, p in enumerate(symbol_chinese)}
# The sizes differ only if symbol_chinese contains a duplicated symbol.
assert len(ph2id_dict) == len(id2ph_dict)
def text2phoneme(text):
    """Convert text to a phoneme list using the Chinese phoneme scheme.

    Per the scheme this module implements: Chinese is converted to pinyin and
    then to phonemes (consonant, vowel, tone); all-upper-case English is read
    letter by letter, other English is read as English; punctuation is mapped
    to phonemes.

    :param text: str, the text to convert.
    :return: list, the phoneme list.
    """
    normalized = normalize_english(normalize_chinese(text))

    def keep_as_tuple(chunk):
        # Error handler given to text2pinyin: wraps its argument in a 1-tuple.
        return (chunk,)

    pinyins = text2pinyin(normalized, errors=keep_as_tuple)
    return change_diao(pinyin2phoneme(pinyins))
def text2sequence(text):
    """Convert text to an ID sequence.

    :param text: the text to convert.
    :return: the IDs of the phonemes produced by ``text2phoneme``.
    """
    return phoneme2sequence(text2phoneme(text))
def pinyin2phoneme(src):
    """
    Convert pinyin, or other characters, to phonemes.

    :param src: list; pinyin items are given as str, anything else as tuple.
    :return: list of phonemes. Each item that produced phonemes is followed by
        ``_chain``; the list ends with ``_eos`` and ``_pad``.
    """
    out = []
    for py in src:
        if type(py) is str:
            # Split the syllable into its toneless part and its tone.
            fuyuan, diao = split_pinyin(py)
            if fuyuan in shengyun2ph_dict and diao in diao2ph_dict:
                phs = shengyun2ph_dict[fuyuan].split()
                phs.append(diao2ph_dict[diao])
            else:
                # Not a known syllable/tone pair: map it character by character.
                phs = py_errors(py)
        else:
            # Non-str item: map each element through the character table.
            phs = []
            for w in py:
                ph = py_errors(w)
                phs.extend(ph)
        # Items that mapped to nothing add neither phonemes nor a separator.
        if phs:
            out.extend(phs)
            out.append(_chain)
    out.append(_eos)
    out.append(_pad)
    return out
def change_diao(src):
    """Apply the tone change for consecutive third tones.

    The list is scanned from the end. A '3' found exactly four positions
    before the most recently kept '3' is rewritten to '2'; any other '3' is
    kept and becomes the new reference point.

    :param src: list, the phoneme list.
    :return: list, a new phoneme list with the tone change applied.
    """
    flipped = list(reversed(src))
    anchor = None  # position, in the reversed list, of the last '3' kept
    for pos, ph in enumerate(flipped):
        if ph != '3':
            continue
        if anchor is not None and pos - anchor == 4:
            flipped[pos] = '2'
        else:
            anchor = pos
    flipped.reverse()
    return flipped
def phoneme2sequence(src):
    """Map phonemes to their IDs, dropping any phoneme without an ID."""
    return [ph2id_dict[ph] for ph in src if ph in ph2id_dict]
def sequence2phoneme(src):
    """Map IDs back to their phonemes, dropping any unknown ID."""
    return [id2ph_dict[idx] for idx in src if idx in id2ph_dict]
def py_errors(text):
    """Map each element of ``text`` through the character table.

    Elements that are not in ``char2ph_dict`` are dropped.
    """
    return [char2ph_dict[ch] for ch in text if ch in char2ph_dict]
def normalize_chinese(text):
    """Normalize text: full-width to half-width, traditional to simplified,
    then digit runs to their hanzi reading, in that order."""
    half_width = quan2ban(text)
    simplified = fan2jian(half_width)
    return convert_number(simplified)
def normalize_english(text):
    """Lower-case every chunk of ``text`` that is not entirely upper-case.

    The text is split on runs of ASCII letters; chunks for which
    ``str.isupper`` is true are kept as they are, all others are lower-cased.
    """
    return "".join(
        chunk if chunk.isupper() else chunk.lower()
        for chunk in _en_re.split(text)
    )
if __name__ == "__main__":
print(__file__)

@ -0,0 +1,339 @@
# author: kuangdd
# date: 2021/5/8
"""
#### style
拼音格式转换
国标样式的拼音和字母数字的样式的拼音相互转换
"""
from pathlib import Path
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(Path(__file__).stem)
# Size note: 2100 = 420 syllables * 5 tone variants (the unmarked form is written as tone 5).
guobiao2shengyundiao_dict = {
'a': 'a5', 'ā': 'a1', 'á': 'a2', 'ǎ': 'a3', 'à': 'a4', 'ai': 'ai5', 'āi': 'ai1', 'ái': 'ai2', 'ǎi': 'ai3',
'ài': 'ai4', 'an': 'an5', 'ān': 'an1', 'án': 'an2', 'ǎn': 'an3', 'àn': 'an4', 'ang': 'ang5', 'āng': 'ang1',
'áng': 'ang2', 'ǎng': 'ang3', 'àng': 'ang4', 'ao': 'ao5', 'āo': 'ao1', 'áo': 'ao2', 'ǎo': 'ao3', 'ào': 'ao4',
'ba': 'ba5', '': 'ba1', '': 'ba2', '': 'ba3', '': 'ba4', 'bai': 'bai5', 'bāi': 'bai1', 'bái': 'bai2',
'bǎi': 'bai3', 'bài': 'bai4', 'ban': 'ban5', 'bān': 'ban1', 'bán': 'ban2', 'bǎn': 'ban3', 'bàn': 'ban4',
'bang': 'bang5', 'bāng': 'bang1', 'báng': 'bang2', 'bǎng': 'bang3', 'bàng': 'bang4', 'bao': 'bao5', 'bāo': 'bao1',
'báo': 'bao2', 'bǎo': 'bao3', 'bào': 'bao4', 'bei': 'bei5', 'bēi': 'bei1', 'béi': 'bei2', 'běi': 'bei3',
'bèi': 'bei4', 'ben': 'ben5', 'bēn': 'ben1', 'bén': 'ben2', 'běn': 'ben3', 'bèn': 'ben4', 'beng': 'beng5',
'bēng': 'beng1', 'béng': 'beng2', 'běng': 'beng3', 'bèng': 'beng4', 'bi': 'bi5', '': 'bi1', '': 'bi2',
'': 'bi3', '': 'bi4', 'bian': 'bian5', 'biān': 'bian1', 'bián': 'bian2', 'biǎn': 'bian3', 'biàn': 'bian4',
'biao': 'biao5', 'biāo': 'biao1', 'biáo': 'biao2', 'biǎo': 'biao3', 'biào': 'biao4', 'bie': 'bie5', 'biē': 'bie1',
'bié': 'bie2', 'biě': 'bie3', 'biè': 'bie4', 'bin': 'bin5', 'bīn': 'bin1', 'bín': 'bin2', 'bǐn': 'bin3',
'bìn': 'bin4', 'bing': 'bing5', 'bīng': 'bing1', 'bíng': 'bing2', 'bǐng': 'bing3', 'bìng': 'bing4', 'bo': 'bo5',
'': 'bo1', '': 'bo2', '': 'bo3', '': 'bo4', 'bu': 'bu5', '': 'bu1', '': 'bu2', '': 'bu3', '': 'bu4',
'ca': 'ca5', '': 'ca1', '': 'ca2', '': 'ca3', '': 'ca4', 'cai': 'cai5', 'cāi': 'cai1', 'cái': 'cai2',
'cǎi': 'cai3', 'cài': 'cai4', 'can': 'can5', 'cān': 'can1', 'cán': 'can2', 'cǎn': 'can3', 'càn': 'can4',
'cang': 'cang5', 'cāng': 'cang1', 'cáng': 'cang2', 'cǎng': 'cang3', 'càng': 'cang4', 'cao': 'cao5', 'cāo': 'cao1',
'cáo': 'cao2', 'cǎo': 'cao3', 'cào': 'cao4', 'ce': 'ce5', '': 'ce1', '': 'ce2', '': 'ce3', '': 'ce4',
'cen': 'cen5', 'cēn': 'cen1', 'cén': 'cen2', 'cěn': 'cen3', 'cèn': 'cen4', 'ceng': 'ceng5', 'cēng': 'ceng1',
'céng': 'ceng2', 'cěng': 'ceng3', 'cèng': 'ceng4', 'cha': 'cha5', 'chā': 'cha1', 'chá': 'cha2', 'chǎ': 'cha3',
'chà': 'cha4', 'chai': 'chai5', 'chāi': 'chai1', 'chái': 'chai2', 'chǎi': 'chai3', 'chài': 'chai4', 'chan': 'chan5',
'chān': 'chan1', 'chán': 'chan2', 'chǎn': 'chan3', 'chàn': 'chan4', 'chang': 'chang5', 'chāng': 'chang1',
'cháng': 'chang2', 'chǎng': 'chang3', 'chàng': 'chang4', 'chao': 'chao5', 'chāo': 'chao1', 'cháo': 'chao2',
'chǎo': 'chao3', 'chào': 'chao4', 'che': 'che5', 'chē': 'che1', 'ché': 'che2', 'chě': 'che3', 'chè': 'che4',
'chen': 'chen5', 'chēn': 'chen1', 'chén': 'chen2', 'chěn': 'chen3', 'chèn': 'chen4', 'cheng': 'cheng5',
'chēng': 'cheng1', 'chéng': 'cheng2', 'chěng': 'cheng3', 'chèng': 'cheng4', 'chi': 'chi5', 'chī': 'chi1',
'chí': 'chi2', 'chǐ': 'chi3', 'chì': 'chi4', 'chong': 'chong5', 'chōng': 'chong1', 'chóng': 'chong2',
'chǒng': 'chong3', 'chòng': 'chong4', 'chou': 'chou5', 'chōu': 'chou1', 'chóu': 'chou2', 'chǒu': 'chou3',
'chòu': 'chou4', 'chu': 'chu5', 'chū': 'chu1', 'chú': 'chu2', 'chǔ': 'chu3', 'chù': 'chu4', 'chuai': 'chuai5',
'chuāi': 'chuai1', 'chuái': 'chuai2', 'chuǎi': 'chuai3', 'chuài': 'chuai4', 'chuan': 'chuan5', 'chuān': 'chuan1',
'chuán': 'chuan2', 'chuǎn': 'chuan3', 'chuàn': 'chuan4', 'chuang': 'chuang5', 'chuāng': 'chuang1',
'chuáng': 'chuang2', 'chuǎng': 'chuang3', 'chuàng': 'chuang4', 'chui': 'chui5', 'chuī': 'chui1', 'chuí': 'chui2',
'chuǐ': 'chui3', 'chuì': 'chui4', 'chun': 'chun5', 'chūn': 'chun1', 'chún': 'chun2', 'chǔn': 'chun3',
'chùn': 'chun4', 'chuo': 'chuo5', 'chuō': 'chuo1', 'chuó': 'chuo2', 'chuǒ': 'chuo3', 'chuò': 'chuo4', 'ci': 'ci5',
'': 'ci1', '': 'ci2', '': 'ci3', '': 'ci4', 'cong': 'cong5', 'cōng': 'cong1', 'cóng': 'cong2',
'cǒng': 'cong3', 'còng': 'cong4', 'cou': 'cou5', 'cōu': 'cou1', 'cóu': 'cou2', 'cǒu': 'cou3', 'còu': 'cou4',
'cu': 'cu5', '': 'cu1', '': 'cu2', '': 'cu3', '': 'cu4', 'cuan': 'cuan5', 'cuān': 'cuan1', 'cuán': 'cuan2',
'cuǎn': 'cuan3', 'cuàn': 'cuan4', 'cui': 'cui5', 'cuī': 'cui1', 'cuí': 'cui2', 'cuǐ': 'cui3', 'cuì': 'cui4',
'cun': 'cun5', 'cūn': 'cun1', 'cún': 'cun2', 'cǔn': 'cun3', 'cùn': 'cun4', 'cuo': 'cuo5', 'cuō': 'cuo1',
'cuó': 'cuo2', 'cuǒ': 'cuo3', 'cuò': 'cuo4', 'da': 'da5', '': 'da1', '': 'da2', '': 'da3', '': 'da4',
'dai': 'dai5', 'dāi': 'dai1', 'dái': 'dai2', 'dǎi': 'dai3', 'dài': 'dai4', 'dan': 'dan5', 'dān': 'dan1',
'dán': 'dan2', 'dǎn': 'dan3', 'dàn': 'dan4', 'dang': 'dang5', 'dāng': 'dang1', 'dáng': 'dang2', 'dǎng': 'dang3',
'dàng': 'dang4', 'dao': 'dao5', 'dāo': 'dao1', 'dáo': 'dao2', 'dǎo': 'dao3', 'dào': 'dao4', 'de': 'de5',
'': 'de1', '': 'de2', '': 'de3', '': 'de4', 'dei': 'dei5', 'dēi': 'dei1', 'déi': 'dei2', 'děi': 'dei3',
'dèi': 'dei4', 'den': 'den5', 'dēn': 'den1', 'dén': 'den2', 'děn': 'den3', 'dèn': 'den4', 'deng': 'deng5',
'dēng': 'deng1', 'déng': 'deng2', 'děng': 'deng3', 'dèng': 'deng4', 'di': 'di5', '': 'di1', '': 'di2',
'': 'di3', '': 'di4', 'dia': 'dia5', 'diā': 'dia1', 'diá': 'dia2', 'diǎ': 'dia3', 'dià': 'dia4',
'dian': 'dian5', 'diān': 'dian1', 'dián': 'dian2', 'diǎn': 'dian3', 'diàn': 'dian4', 'diao': 'diao5',
'diāo': 'diao1', 'diáo': 'diao2', 'diǎo': 'diao3', 'diào': 'diao4', 'die': 'die5', 'diē': 'die1', 'dié': 'die2',
'diě': 'die3', 'diè': 'die4', 'ding': 'ding5', 'dīng': 'ding1', 'díng': 'ding2', 'dǐng': 'ding3', 'dìng': 'ding4',
'diu': 'diu5', 'diū': 'diu1', 'diú': 'diu2', 'diǔ': 'diu3', 'diù': 'diu4', 'dong': 'dong5', 'dōng': 'dong1',
'dóng': 'dong2', 'dǒng': 'dong3', 'dòng': 'dong4', 'dou': 'dou5', 'dōu': 'dou1', 'dóu': 'dou2', 'dǒu': 'dou3',
'dòu': 'dou4', 'du': 'du5', '': 'du1', '': 'du2', '': 'du3', '': 'du4', 'duan': 'duan5', 'duān': 'duan1',
'duán': 'duan2', 'duǎn': 'duan3', 'duàn': 'duan4', 'dui': 'dui5', 'duī': 'dui1', 'duí': 'dui2', 'duǐ': 'dui3',
'duì': 'dui4', 'dun': 'dun5', 'dūn': 'dun1', 'dún': 'dun2', 'dǔn': 'dun3', 'dùn': 'dun4', 'duo': 'duo5',
'duō': 'duo1', 'duó': 'duo2', 'duǒ': 'duo3', 'duò': 'duo4', 'e': 'e5', 'ē': 'e1', 'é': 'e2', 'ě': 'e3', 'è': 'e4',
'ei': 'ei5', 'ēi': 'ei1', 'éi': 'ei2', 'ěi': 'ei3', 'èi': 'ei4', 'en': 'en5', 'ēn': 'en1', 'én': 'en2', 'ěn': 'en3',
'èn': 'en4', 'eng': 'eng5', 'ēng': 'eng1', 'éng': 'eng2', 'ěng': 'eng3', 'èng': 'eng4', 'er': 'er5', 'ēr': 'er1',
'ér': 'er2', 'ěr': 'er3', 'èr': 'er4', 'fa': 'fa5', '': 'fa1', '': 'fa2', '': 'fa3', '': 'fa4',
'fan': 'fan5', 'fān': 'fan1', 'fán': 'fan2', 'fǎn': 'fan3', 'fàn': 'fan4', 'fang': 'fang5', 'fāng': 'fang1',
'fáng': 'fang2', 'fǎng': 'fang3', 'fàng': 'fang4', 'fei': 'fei5', 'fēi': 'fei1', 'féi': 'fei2', 'fěi': 'fei3',
'fèi': 'fei4', 'fen': 'fen5', 'fēn': 'fen1', 'fén': 'fen2', 'fěn': 'fen3', 'fèn': 'fen4', 'feng': 'feng5',
'fēng': 'feng1', 'féng': 'feng2', 'fěng': 'feng3', 'fèng': 'feng4', 'fo': 'fo5', '': 'fo1', '': 'fo2',
'': 'fo3', '': 'fo4', 'fou': 'fou5', 'fōu': 'fou1', 'fóu': 'fou2', 'fǒu': 'fou3', 'fòu': 'fou4', 'fu': 'fu5',
'': 'fu1', '': 'fu2', '': 'fu3', '': 'fu4', 'ga': 'ga5', '': 'ga1', '': 'ga2', '': 'ga3', '': 'ga4',
'gai': 'gai5', 'gāi': 'gai1', 'gái': 'gai2', 'gǎi': 'gai3', 'gài': 'gai4', 'gan': 'gan5', 'gān': 'gan1',
'gán': 'gan2', 'gǎn': 'gan3', 'gàn': 'gan4', 'gang': 'gang5', 'gāng': 'gang1', 'gáng': 'gang2', 'gǎng': 'gang3',
'gàng': 'gang4', 'gao': 'gao5', 'gāo': 'gao1', 'gáo': 'gao2', 'gǎo': 'gao3', 'gào': 'gao4', 'ge': 'ge5',
'': 'ge1', '': 'ge2', '': 'ge3', '': 'ge4', 'gei': 'gei5', 'gēi': 'gei1', 'géi': 'gei2', 'gěi': 'gei3',
'gèi': 'gei4', 'gen': 'gen5', 'gēn': 'gen1', 'gén': 'gen2', 'gěn': 'gen3', 'gèn': 'gen4', 'geng': 'geng5',
'gēng': 'geng1', 'géng': 'geng2', 'gěng': 'geng3', 'gèng': 'geng4', 'gong': 'gong5', 'gōng': 'gong1',
'góng': 'gong2', 'gǒng': 'gong3', 'gòng': 'gong4', 'gou': 'gou5', 'gōu': 'gou1', 'góu': 'gou2', 'gǒu': 'gou3',
'gòu': 'gou4', 'gu': 'gu5', '': 'gu1', '': 'gu2', '': 'gu3', '': 'gu4', 'gua': 'gua5', 'guā': 'gua1',
'guá': 'gua2', 'guǎ': 'gua3', 'guà': 'gua4', 'guai': 'guai5', 'guāi': 'guai1', 'guái': 'guai2', 'guǎi': 'guai3',
'guài': 'guai4', 'guan': 'guan5', 'guān': 'guan1', 'guán': 'guan2', 'guǎn': 'guan3', 'guàn': 'guan4',
'guang': 'guang5', 'guāng': 'guang1', 'guáng': 'guang2', 'guǎng': 'guang3', 'guàng': 'guang4', 'gui': 'gui5',
'guī': 'gui1', 'guí': 'gui2', 'guǐ': 'gui3', 'guì': 'gui4', 'gun': 'gun5', 'gūn': 'gun1', 'gún': 'gun2',
'gǔn': 'gun3', 'gùn': 'gun4', 'guo': 'guo5', 'guō': 'guo1', 'guó': 'guo2', 'guǒ': 'guo3', 'guò': 'guo4',
'ha': 'ha5', '': 'ha1', '': 'ha2', '': 'ha3', '': 'ha4', 'hai': 'hai5', 'hāi': 'hai1', 'hái': 'hai2',
'hǎi': 'hai3', 'hài': 'hai4', 'han': 'han5', 'hān': 'han1', 'hán': 'han2', 'hǎn': 'han3', 'hàn': 'han4',
'hang': 'hang5', 'hāng': 'hang1', 'háng': 'hang2', 'hǎng': 'hang3', 'hàng': 'hang4', 'hao': 'hao5', 'hāo': 'hao1',
'háo': 'hao2', 'hǎo': 'hao3', 'hào': 'hao4', 'he': 'he5', '': 'he1', '': 'he2', '': 'he3', '': 'he4',
'hei': 'hei5', 'hēi': 'hei1', 'héi': 'hei2', 'hěi': 'hei3', 'hèi': 'hei4', 'hen': 'hen5', 'hēn': 'hen1',
'hén': 'hen2', 'hěn': 'hen3', 'hèn': 'hen4', 'heng': 'heng5', 'hēng': 'heng1', 'héng': 'heng2', 'hěng': 'heng3',
'hèng': 'heng4', 'hong': 'hong5', 'hōng': 'hong1', 'hóng': 'hong2', 'hǒng': 'hong3', 'hòng': 'hong4', 'hou': 'hou5',
'hōu': 'hou1', 'hóu': 'hou2', 'hǒu': 'hou3', 'hòu': 'hou4', 'hu': 'hu5', '': 'hu1', '': 'hu2', '': 'hu3',
'': 'hu4', 'hua': 'hua5', 'huā': 'hua1', 'huá': 'hua2', 'huǎ': 'hua3', 'huà': 'hua4', 'huai': 'huai5',
'huāi': 'huai1', 'huái': 'huai2', 'huǎi': 'huai3', 'huài': 'huai4', 'huan': 'huan5', 'huān': 'huan1',
'huán': 'huan2', 'huǎn': 'huan3', 'huàn': 'huan4', 'huang': 'huang5', 'huāng': 'huang1', 'huáng': 'huang2',
'huǎng': 'huang3', 'huàng': 'huang4', 'hui': 'hui5', 'huī': 'hui1', 'huí': 'hui2', 'huǐ': 'hui3', 'huì': 'hui4',
'hun': 'hun5', 'hūn': 'hun1', 'hún': 'hun2', 'hǔn': 'hun3', 'hùn': 'hun4', 'huo': 'huo5', 'huō': 'huo1',
'huó': 'huo2', 'huǒ': 'huo3', 'huò': 'huo4', 'ji': 'ji5', '': 'ji1', '': 'ji2', '': 'ji3', '': 'ji4',
'jia': 'jia5', 'jiā': 'jia1', 'jiá': 'jia2', 'jiǎ': 'jia3', 'jià': 'jia4', 'jian': 'jian5', 'jiān': 'jian1',
'jián': 'jian2', 'jiǎn': 'jian3', 'jiàn': 'jian4', 'jiang': 'jiang5', 'jiāng': 'jiang1', 'jiáng': 'jiang2',
'jiǎng': 'jiang3', 'jiàng': 'jiang4', 'jiao': 'jiao5', 'jiāo': 'jiao1', 'jiáo': 'jiao2', 'jiǎo': 'jiao3',
'jiào': 'jiao4', 'jie': 'jie5', 'jiē': 'jie1', 'jié': 'jie2', 'jiě': 'jie3', 'jiè': 'jie4', 'jin': 'jin5',
'jīn': 'jin1', 'jín': 'jin2', 'jǐn': 'jin3', 'jìn': 'jin4', 'jing': 'jing5', 'jīng': 'jing1', 'jíng': 'jing2',
'jǐng': 'jing3', 'jìng': 'jing4', 'jiong': 'jiong5', 'jiōng': 'jiong1', 'jióng': 'jiong2', 'jiǒng': 'jiong3',
'jiòng': 'jiong4', 'jiu': 'jiu5', 'jiū': 'jiu1', 'jiú': 'jiu2', 'jiǔ': 'jiu3', 'jiù': 'jiu4', 'ju': 'ju5',
'': 'ju1', '': 'ju2', '': 'ju3', '': 'ju4', 'juan': 'juan5', 'juān': 'juan1', 'juán': 'juan2',
'juǎn': 'juan3', 'juàn': 'juan4', 'jue': 'jue5', 'juē': 'jue1', 'jué': 'jue2', 'juě': 'jue3', 'juè': 'jue4',
'jun': 'jun5', 'jūn': 'jun1', 'jún': 'jun2', 'jǔn': 'jun3', 'jùn': 'jun4', 'ka': 'ka5', '': 'ka1', '': 'ka2',
'': 'ka3', '': 'ka4', 'kai': 'kai5', 'kāi': 'kai1', 'kái': 'kai2', 'kǎi': 'kai3', 'kài': 'kai4', 'kan': 'kan5',
'kān': 'kan1', 'kán': 'kan2', 'kǎn': 'kan3', 'kàn': 'kan4', 'kang': 'kang5', 'kāng': 'kang1', 'káng': 'kang2',
'kǎng': 'kang3', 'kàng': 'kang4', 'kao': 'kao5', 'kāo': 'kao1', 'káo': 'kao2', 'kǎo': 'kao3', 'kào': 'kao4',
'ke': 'ke5', '': 'ke1', '': 'ke2', '': 'ke3', '': 'ke4', 'ken': 'ken5', 'kēn': 'ken1', 'kén': 'ken2',
'kěn': 'ken3', 'kèn': 'ken4', 'keng': 'keng5', 'kēng': 'keng1', 'kéng': 'keng2', 'kěng': 'keng3', 'kèng': 'keng4',
'kong': 'kong5', 'kōng': 'kong1', 'kóng': 'kong2', 'kǒng': 'kong3', 'kòng': 'kong4', 'kou': 'kou5', 'kōu': 'kou1',
'kóu': 'kou2', 'kǒu': 'kou3', 'kòu': 'kou4', 'ku': 'ku5', '': 'ku1', '': 'ku2', '': 'ku3', '': 'ku4',
'kua': 'kua5', 'kuā': 'kua1', 'kuá': 'kua2', 'kuǎ': 'kua3', 'kuà': 'kua4', 'kuai': 'kuai5', 'kuāi': 'kuai1',
'kuái': 'kuai2', 'kuǎi': 'kuai3', 'kuài': 'kuai4', 'kuan': 'kuan5', 'kuān': 'kuan1', 'kuán': 'kuan2',
'kuǎn': 'kuan3', 'kuàn': 'kuan4', 'kuang': 'kuang5', 'kuāng': 'kuang1', 'kuáng': 'kuang2', 'kuǎng': 'kuang3',
'kuàng': 'kuang4', 'kui': 'kui5', 'kuī': 'kui1', 'kuí': 'kui2', 'kuǐ': 'kui3', 'kuì': 'kui4', 'kun': 'kun5',
'kūn': 'kun1', 'kún': 'kun2', 'kǔn': 'kun3', 'kùn': 'kun4', 'kuo': 'kuo5', 'kuō': 'kuo1', 'kuó': 'kuo2',
'kuǒ': 'kuo3', 'kuò': 'kuo4', 'la': 'la5', '': 'la1', '': 'la2', '': 'la3', '': 'la4', 'lai': 'lai5',
'lāi': 'lai1', 'lái': 'lai2', 'lǎi': 'lai3', 'lài': 'lai4', 'lan': 'lan5', 'lān': 'lan1', 'lán': 'lan2',
'lǎn': 'lan3', 'làn': 'lan4', 'lang': 'lang5', 'lāng': 'lang1', 'láng': 'lang2', 'lǎng': 'lang3', 'làng': 'lang4',
'lao': 'lao5', 'lāo': 'lao1', 'láo': 'lao2', 'lǎo': 'lao3', 'lào': 'lao4', 'le': 'le5', '': 'le1', '': 'le2',
'': 'le3', '': 'le4', 'lei': 'lei5', 'lēi': 'lei1', 'léi': 'lei2', 'lěi': 'lei3', 'lèi': 'lei4',
'leng': 'leng5', 'lēng': 'leng1', 'léng': 'leng2', 'lěng': 'leng3', 'lèng': 'leng4', 'li': 'li5', '': 'li1',
'': 'li2', '': 'li3', '': 'li4', 'lia': 'lia5', 'liā': 'lia1', 'liá': 'lia2', 'liǎ': 'lia3', 'lià': 'lia4',
'lian': 'lian5', 'liān': 'lian1', 'lián': 'lian2', 'liǎn': 'lian3', 'liàn': 'lian4', 'liang': 'liang5',
'liāng': 'liang1', 'liáng': 'liang2', 'liǎng': 'liang3', 'liàng': 'liang4', 'liao': 'liao5', 'liāo': 'liao1',
'liáo': 'liao2', 'liǎo': 'liao3', 'liào': 'liao4', 'lie': 'lie5', 'liē': 'lie1', 'lié': 'lie2', 'liě': 'lie3',
'liè': 'lie4', 'lin': 'lin5', 'līn': 'lin1', 'lín': 'lin2', 'lǐn': 'lin3', 'lìn': 'lin4', 'ling': 'ling5',
'līng': 'ling1', 'líng': 'ling2', 'lǐng': 'ling3', 'lìng': 'ling4', 'liu': 'liu5', 'liū': 'liu1', 'liú': 'liu2',
'liǔ': 'liu3', 'liù': 'liu4', 'lo': 'lo5', '': 'lo1', '': 'lo2', '': 'lo3', '': 'lo4', 'long': 'long5',
'lōng': 'long1', 'lóng': 'long2', 'lǒng': 'long3', 'lòng': 'long4', 'lou': 'lou5', 'lōu': 'lou1', 'lóu': 'lou2',
'lǒu': 'lou3', 'lòu': 'lou4', 'lu': 'lu5', '': 'lu1', '': 'lu2', '': 'lu3', '': 'lu4', 'luan': 'luan5',
'luān': 'luan1', 'luán': 'luan2', 'luǎn': 'luan3', 'luàn': 'luan4', 'lun': 'lun5', 'lūn': 'lun1', 'lún': 'lun2',
'lǔn': 'lun3', 'lùn': 'lun4', 'luo': 'luo5', 'luō': 'luo1', 'luó': 'luo2', 'luǒ': 'luo3', 'luò': 'luo4',
'': 'lv5', '': 'lv1', '': 'lv2', '': 'lv3', '': 'lv4', 'lüe': 'lve5', 'lüē': 'lve1', 'lüé': 'lve2',
'lüě': 'lve3', 'lüè': 'lve4', 'ma': 'ma5', '': 'ma1', '': 'ma2', '': 'ma3', '': 'ma4', 'mai': 'mai5',
'māi': 'mai1', 'mái': 'mai2', 'mǎi': 'mai3', 'mài': 'mai4', 'man': 'man5', 'mān': 'man1', 'mán': 'man2',
'mǎn': 'man3', 'màn': 'man4', 'mang': 'mang5', 'māng': 'mang1', 'máng': 'mang2', 'mǎng': 'mang3', 'màng': 'mang4',
'mao': 'mao5', 'māo': 'mao1', 'máo': 'mao2', 'mǎo': 'mao3', 'mào': 'mao4', 'me': 'me5', '': 'me1', '': 'me2',
'': 'me3', '': 'me4', 'mei': 'mei5', 'mēi': 'mei1', 'méi': 'mei2', 'měi': 'mei3', 'mèi': 'mei4', 'men': 'men5',
'mēn': 'men1', 'mén': 'men2', 'měn': 'men3', 'mèn': 'men4', 'meng': 'meng5', 'mēng': 'meng1', 'méng': 'meng2',
'měng': 'meng3', 'mèng': 'meng4', 'mi': 'mi5', '': 'mi1', '': 'mi2', '': 'mi3', '': 'mi4', 'mian': 'mian5',
'miān': 'mian1', 'mián': 'mian2', 'miǎn': 'mian3', 'miàn': 'mian4', 'miao': 'miao5', 'miāo': 'miao1',
'miáo': 'miao2', 'miǎo': 'miao3', 'miào': 'miao4', 'mie': 'mie5', 'miē': 'mie1', 'mié': 'mie2', 'miě': 'mie3',
'miè': 'mie4', 'min': 'min5', 'mīn': 'min1', 'mín': 'min2', 'mǐn': 'min3', 'mìn': 'min4', 'ming': 'ming5',
'mīng': 'ming1', 'míng': 'ming2', 'mǐng': 'ming3', 'mìng': 'ming4', 'miu': 'miu5', 'miū': 'miu1', 'miú': 'miu2',
'miǔ': 'miu3', 'miù': 'miu4', 'mo': 'mo5', '': 'mo1', '': 'mo2', '': 'mo3', '': 'mo4', 'mou': 'mou5',
'mōu': 'mou1', 'móu': 'mou2', 'mǒu': 'mou3', 'mòu': 'mou4', 'mu': 'mu5', '': 'mu1', '': 'mu2', '': 'mu3',
'': 'mu4', 'na': 'na5', '': 'na1', '': 'na2', '': 'na3', '': 'na4', 'nai': 'nai5', 'nāi': 'nai1',
'nái': 'nai2', 'nǎi': 'nai3', 'nài': 'nai4', 'nan': 'nan5', 'nān': 'nan1', 'nán': 'nan2', 'nǎn': 'nan3',
'nàn': 'nan4', 'nang': 'nang5', 'nāng': 'nang1', 'náng': 'nang2', 'nǎng': 'nang3', 'nàng': 'nang4', 'nao': 'nao5',
'nāo': 'nao1', 'náo': 'nao2', 'nǎo': 'nao3', 'nào': 'nao4', 'ne': 'ne5', '': 'ne1', '': 'ne2', '': 'ne3',
'': 'ne4', 'nei': 'nei5', 'nēi': 'nei1', 'néi': 'nei2', 'něi': 'nei3', 'nèi': 'nei4', 'nen': 'nen5',
'nēn': 'nen1', 'nén': 'nen2', 'něn': 'nen3', 'nèn': 'nen4', 'neng': 'neng5', 'nēng': 'neng1', 'néng': 'neng2',
'něng': 'neng3', 'nèng': 'neng4', 'ni': 'ni5', '': 'ni1', '': 'ni2', '': 'ni3', '': 'ni4', 'nian': 'nian5',
'niān': 'nian1', 'nián': 'nian2', 'niǎn': 'nian3', 'niàn': 'nian4', 'niang': 'niang5', 'niāng': 'niang1',
'niáng': 'niang2', 'niǎng': 'niang3', 'niàng': 'niang4', 'niao': 'niao5', 'niāo': 'niao1', 'niáo': 'niao2',
'niǎo': 'niao3', 'niào': 'niao4', 'nie': 'nie5', 'niē': 'nie1', 'nié': 'nie2', 'niě': 'nie3', 'niè': 'nie4',
'nin': 'nin5', 'nīn': 'nin1', 'nín': 'nin2', 'nǐn': 'nin3', 'nìn': 'nin4', 'ning': 'ning5', 'nīng': 'ning1',
'níng': 'ning2', 'nǐng': 'ning3', 'nìng': 'ning4', 'niu': 'niu5', 'niū': 'niu1', 'niú': 'niu2', 'niǔ': 'niu3',
'niù': 'niu4', 'nong': 'nong5', 'nōng': 'nong1', 'nóng': 'nong2', 'nǒng': 'nong3', 'nòng': 'nong4', 'nou': 'nou5',
'nōu': 'nou1', 'nóu': 'nou2', 'nǒu': 'nou3', 'nòu': 'nou4', 'nu': 'nu5', '': 'nu1', '': 'nu2', '': 'nu3',
'': 'nu4', 'nuan': 'nuan5', 'nuān': 'nuan1', 'nuán': 'nuan2', 'nuǎn': 'nuan3', 'nuàn': 'nuan4', 'nuo': 'nuo5',
'nuō': 'nuo1', 'nuó': 'nuo2', 'nuǒ': 'nuo3', 'nuò': 'nuo4', '': 'nv5', '': 'nv1', '': 'nv2', '': 'nv3',
'': 'nv4', 'nüe': 'nve5', 'nüē': 'nve1', 'nüé': 'nve2', 'nüě': 'nve3', 'nüè': 'nve4', 'o': 'o5', 'ō': 'o1',
'ó': 'o2', 'ǒ': 'o3', 'ò': 'o4', 'ou': 'ou5', 'ōu': 'ou1', 'óu': 'ou2', 'ǒu': 'ou3', 'òu': 'ou4', 'pa': 'pa5',
'': 'pa1', '': 'pa2', '': 'pa3', '': 'pa4', 'pai': 'pai5', 'pāi': 'pai1', 'pái': 'pai2', 'pǎi': 'pai3',
'pài': 'pai4', 'pan': 'pan5', 'pān': 'pan1', 'pán': 'pan2', 'pǎn': 'pan3', 'pàn': 'pan4', 'pang': 'pang5',
'pāng': 'pang1', 'páng': 'pang2', 'pǎng': 'pang3', 'pàng': 'pang4', 'pao': 'pao5', 'pāo': 'pao1', 'páo': 'pao2',
'pǎo': 'pao3', 'pào': 'pao4', 'pei': 'pei5', 'pēi': 'pei1', 'péi': 'pei2', 'pěi': 'pei3', 'pèi': 'pei4',
'pen': 'pen5', 'pēn': 'pen1', 'pén': 'pen2', 'pěn': 'pen3', 'pèn': 'pen4', 'peng': 'peng5', 'pēng': 'peng1',
'péng': 'peng2', 'pěng': 'peng3', 'pèng': 'peng4', 'pi': 'pi5', '': 'pi1', '': 'pi2', '': 'pi3', '': 'pi4',
'pian': 'pian5', 'piān': 'pian1', 'pián': 'pian2', 'piǎn': 'pian3', 'piàn': 'pian4', 'piao': 'piao5',
'piāo': 'piao1', 'piáo': 'piao2', 'piǎo': 'piao3', 'piào': 'piao4', 'pie': 'pie5', 'piē': 'pie1', 'pié': 'pie2',
'piě': 'pie3', 'piè': 'pie4', 'pin': 'pin5', 'pīn': 'pin1', 'pín': 'pin2', 'pǐn': 'pin3', 'pìn': 'pin4',
'ping': 'ping5', 'pīng': 'ping1', 'píng': 'ping2', 'pǐng': 'ping3', 'pìng': 'ping4', 'po': 'po5', '': 'po1',
'': 'po2', '': 'po3', '': 'po4', 'pou': 'pou5', 'pōu': 'pou1', 'póu': 'pou2', 'pǒu': 'pou3', 'pòu': 'pou4',
'pu': 'pu5', '': 'pu1', '': 'pu2', '': 'pu3', '': 'pu4', 'qi': 'qi5', '': 'qi1', '': 'qi2', '': 'qi3',
'': 'qi4', 'qia': 'qia5', 'qiā': 'qia1', 'qiá': 'qia2', 'qiǎ': 'qia3', 'qià': 'qia4', 'qian': 'qian5',
'qiān': 'qian1', 'qián': 'qian2', 'qiǎn': 'qian3', 'qiàn': 'qian4', 'qiang': 'qiang5', 'qiāng': 'qiang1',
'qiáng': 'qiang2', 'qiǎng': 'qiang3', 'qiàng': 'qiang4', 'qiao': 'qiao5', 'qiāo': 'qiao1', 'qiáo': 'qiao2',
'qiǎo': 'qiao3', 'qiào': 'qiao4', 'qie': 'qie5', 'qiē': 'qie1', 'qié': 'qie2', 'qiě': 'qie3', 'qiè': 'qie4',
'qin': 'qin5', 'qīn': 'qin1', 'qín': 'qin2', 'qǐn': 'qin3', 'qìn': 'qin4', 'qing': 'qing5', 'qīng': 'qing1',
'qíng': 'qing2', 'qǐng': 'qing3', 'qìng': 'qing4', 'qiong': 'qiong5', 'qiōng': 'qiong1', 'qióng': 'qiong2',
'qiǒng': 'qiong3', 'qiòng': 'qiong4', 'qiu': 'qiu5', 'qiū': 'qiu1', 'qiú': 'qiu2', 'qiǔ': 'qiu3', 'qiù': 'qiu4',
'qu': 'qu5', '': 'qu1', '': 'qu2', '': 'qu3', '': 'qu4', 'quan': 'quan5', 'quān': 'quan1', 'quán': 'quan2',
'quǎn': 'quan3', 'quàn': 'quan4', 'que': 'que5', 'quē': 'que1', 'qué': 'que2', 'quě': 'que3', 'què': 'que4',
'qun': 'qun5', 'qūn': 'qun1', 'qún': 'qun2', 'qǔn': 'qun3', 'qùn': 'qun4', 'ran': 'ran5', 'rān': 'ran1',
'rán': 'ran2', 'rǎn': 'ran3', 'ràn': 'ran4', 'rang': 'rang5', 'rāng': 'rang1', 'ráng': 'rang2', 'rǎng': 'rang3',
'ràng': 'rang4', 'rao': 'rao5', 'rāo': 'rao1', 'ráo': 'rao2', 'rǎo': 'rao3', 'rào': 'rao4', 're': 're5',
'': 're1', '': 're2', '': 're3', '': 're4', 'ren': 'ren5', 'rēn': 'ren1', 'rén': 'ren2', 'rěn': 'ren3',
'rèn': 'ren4', 'reng': 'reng5', 'rēng': 'reng1', 'réng': 'reng2', 'rěng': 'reng3', 'rèng': 'reng4', 'ri': 'ri5',
'': 'ri1', '': 'ri2', '': 'ri3', '': 'ri4', 'rong': 'rong5', 'rōng': 'rong1', 'róng': 'rong2',
'rǒng': 'rong3', 'ròng': 'rong4', 'rou': 'rou5', 'rōu': 'rou1', 'róu': 'rou2', 'rǒu': 'rou3', 'ròu': 'rou4',
'ru': 'ru5', '': 'ru1', '': 'ru2', '': 'ru3', '': 'ru4', 'ruan': 'ruan5', 'ruān': 'ruan1', 'ruán': 'ruan2',
'ruǎn': 'ruan3', 'ruàn': 'ruan4', 'rui': 'rui5', 'ruī': 'rui1', 'ruí': 'rui2', 'ruǐ': 'rui3', 'ruì': 'rui4',
'run': 'run5', 'rūn': 'run1', 'rún': 'run2', 'rǔn': 'run3', 'rùn': 'run4', 'ruo': 'ruo5', 'ruō': 'ruo1',
'ruó': 'ruo2', 'ruǒ': 'ruo3', 'ruò': 'ruo4', 'sa': 'sa5', '': 'sa1', '': 'sa2', '': 'sa3', '': 'sa4',
'sai': 'sai5', 'sāi': 'sai1', 'sái': 'sai2', 'sǎi': 'sai3', 'sài': 'sai4', 'san': 'san5', 'sān': 'san1',
'sán': 'san2', 'sǎn': 'san3', 'sàn': 'san4', 'sang': 'sang5', 'sāng': 'sang1', 'sáng': 'sang2', 'sǎng': 'sang3',
'sàng': 'sang4', 'sao': 'sao5', 'sāo': 'sao1', 'sáo': 'sao2', 'sǎo': 'sao3', 'sào': 'sao4', 'se': 'se5',
'': 'se1', '': 'se2', '': 'se3', '': 'se4', 'sen': 'sen5', 'sēn': 'sen1', 'sén': 'sen2', 'sěn': 'sen3',
'sèn': 'sen4', 'seng': 'seng5', 'sēng': 'seng1', 'séng': 'seng2', 'sěng': 'seng3', 'sèng': 'seng4', 'sha': 'sha5',
'shā': 'sha1', 'shá': 'sha2', 'shǎ': 'sha3', 'shà': 'sha4', 'shai': 'shai5', 'shāi': 'shai1', 'shái': 'shai2',
'shǎi': 'shai3', 'shài': 'shai4', 'shan': 'shan5', 'shān': 'shan1', 'shán': 'shan2', 'shǎn': 'shan3',
'shàn': 'shan4', 'shang': 'shang5', 'shāng': 'shang1', 'sháng': 'shang2', 'shǎng': 'shang3', 'shàng': 'shang4',
'shao': 'shao5', 'shāo': 'shao1', 'sháo': 'shao2', 'shǎo': 'shao3', 'shào': 'shao4', 'she': 'she5', 'shē': 'she1',
'shé': 'she2', 'shě': 'she3', 'shè': 'she4', 'shei': 'shei5', 'shēi': 'shei1', 'shéi': 'shei2', 'shěi': 'shei3',
'shèi': 'shei4', 'shen': 'shen5', 'shēn': 'shen1', 'shén': 'shen2', 'shěn': 'shen3', 'shèn': 'shen4',
'sheng': 'sheng5', 'shēng': 'sheng1', 'shéng': 'sheng2', 'shěng': 'sheng3', 'shèng': 'sheng4', 'shi': 'shi5',
'shī': 'shi1', 'shí': 'shi2', 'shǐ': 'shi3', 'shì': 'shi4', 'shou': 'shou5', 'shōu': 'shou1', 'shóu': 'shou2',
'shǒu': 'shou3', 'shòu': 'shou4', 'shu': 'shu5', 'shū': 'shu1', 'shú': 'shu2', 'shǔ': 'shu3', 'shù': 'shu4',
'shua': 'shua5', 'shuā': 'shua1', 'shuá': 'shua2', 'shuǎ': 'shua3', 'shuà': 'shua4', 'shuai': 'shuai5',
'shuāi': 'shuai1', 'shuái': 'shuai2', 'shuǎi': 'shuai3', 'shuài': 'shuai4', 'shuan': 'shuan5', 'shuān': 'shuan1',
'shuán': 'shuan2', 'shuǎn': 'shuan3', 'shuàn': 'shuan4', 'shuang': 'shuang5', 'shuāng': 'shuang1',
'shuáng': 'shuang2', 'shuǎng': 'shuang3', 'shuàng': 'shuang4', 'shui': 'shui5', 'shuī': 'shui1', 'shuí': 'shui2',
'shuǐ': 'shui3', 'shuì': 'shui4', 'shun': 'shun5', 'shūn': 'shun1', 'shún': 'shun2', 'shǔn': 'shun3',
'shùn': 'shun4', 'shuo': 'shuo5', 'shuō': 'shuo1', 'shuó': 'shuo2', 'shuǒ': 'shuo3', 'shuò': 'shuo4', 'si': 'si5',
'': 'si1', '': 'si2', '': 'si3', '': 'si4', 'song': 'song5', 'sōng': 'song1', 'sóng': 'song2',
'sǒng': 'song3', 'sòng': 'song4', 'sou': 'sou5', 'sōu': 'sou1', 'sóu': 'sou2', 'sǒu': 'sou3', 'sòu': 'sou4',
'su': 'su5', '': 'su1', '': 'su2', '': 'su3', '': 'su4', 'suan': 'suan5', 'suān': 'suan1', 'suán': 'suan2',
'suǎn': 'suan3', 'suàn': 'suan4', 'sui': 'sui5', 'suī': 'sui1', 'suí': 'sui2', 'suǐ': 'sui3', 'suì': 'sui4',
'sun': 'sun5', 'sūn': 'sun1', 'sún': 'sun2', 'sǔn': 'sun3', 'sùn': 'sun4', 'suo': 'suo5', 'suō': 'suo1',
'suó': 'suo2', 'suǒ': 'suo3', 'suò': 'suo4', 'ta': 'ta5', '': 'ta1', '': 'ta2', '': 'ta3', '': 'ta4',
'tai': 'tai5', 'tāi': 'tai1', 'tái': 'tai2', 'tǎi': 'tai3', 'tài': 'tai4', 'tan': 'tan5', 'tān': 'tan1',
'tán': 'tan2', 'tǎn': 'tan3', 'tàn': 'tan4', 'tang': 'tang5', 'tāng': 'tang1', 'táng': 'tang2', 'tǎng': 'tang3',
'tàng': 'tang4', 'tao': 'tao5', 'tāo': 'tao1', 'táo': 'tao2', 'tǎo': 'tao3', 'tào': 'tao4', 'te': 'te5',
'': 'te1', '': 'te2', '': 'te3', '': 'te4', 'teng': 'teng5', 'tēng': 'teng1', 'téng': 'teng2',
'těng': 'teng3', 'tèng': 'teng4', 'ti': 'ti5', '': 'ti1', '': 'ti2', '': 'ti3', '': 'ti4', 'tian': 'tian5',
'tiān': 'tian1', 'tián': 'tian2', 'tiǎn': 'tian3', 'tiàn': 'tian4', 'tiao': 'tiao5', 'tiāo': 'tiao1',
'tiáo': 'tiao2', 'tiǎo': 'tiao3', 'tiào': 'tiao4', 'tie': 'tie5', 'tiē': 'tie1', 'tié': 'tie2', 'tiě': 'tie3',
'tiè': 'tie4', 'ting': 'ting5', 'tīng': 'ting1', 'tíng': 'ting2', 'tǐng': 'ting3', 'tìng': 'ting4', 'tong': 'tong5',
'tōng': 'tong1', 'tóng': 'tong2', 'tǒng': 'tong3', 'tòng': 'tong4', 'tou': 'tou5', 'tōu': 'tou1', 'tóu': 'tou2',
'tǒu': 'tou3', 'tòu': 'tou4', 'tu': 'tu5', '': 'tu1', '': 'tu2', '': 'tu3', '': 'tu4', 'tuan': 'tuan5',
'tuān': 'tuan1', 'tuán': 'tuan2', 'tuǎn': 'tuan3', 'tuàn': 'tuan4', 'tui': 'tui5', 'tuī': 'tui1', 'tuí': 'tui2',
'tuǐ': 'tui3', 'tuì': 'tui4', 'tun': 'tun5', 'tūn': 'tun1', 'tún': 'tun2', 'tǔn': 'tun3', 'tùn': 'tun4',
'tuo': 'tuo5', 'tuō': 'tuo1', 'tuó': 'tuo2', 'tuǒ': 'tuo3', 'tuò': 'tuo4', 'wa': 'wa5', '': 'wa1', '': 'wa2',
'': 'wa3', '': 'wa4', 'wai': 'wai5', 'wāi': 'wai1', 'wái': 'wai2', 'wǎi': 'wai3', 'wài': 'wai4', 'wan': 'wan5',
'wān': 'wan1', 'wán': 'wan2', 'wǎn': 'wan3', 'wàn': 'wan4', 'wang': 'wang5', 'wāng': 'wang1', 'wáng': 'wang2',
'wǎng': 'wang3', 'wàng': 'wang4', 'wei': 'wei5', 'wēi': 'wei1', 'wéi': 'wei2', 'wěi': 'wei3', 'wèi': 'wei4',
'wen': 'wen5', 'wēn': 'wen1', 'wén': 'wen2', 'wěn': 'wen3', 'wèn': 'wen4', 'weng': 'weng5', 'wēng': 'weng1',
'wéng': 'weng2', 'wěng': 'weng3', 'wèng': 'weng4', 'wo': 'wo5', '': 'wo1', '': 'wo2', '': 'wo3', '': 'wo4',
'wu': 'wu5', '': 'wu1', '': 'wu2', '': 'wu3', '': 'wu4', 'xi': 'xi5', '': 'xi1', '': 'xi2', '': 'xi3',
'': 'xi4', 'xia': 'xia5', 'xiā': 'xia1', 'xiá': 'xia2', 'xiǎ': 'xia3', 'xià': 'xia4', 'xian': 'xian5',
'xiān': 'xian1', 'xián': 'xian2', 'xiǎn': 'xian3', 'xiàn': 'xian4', 'xiang': 'xiang5', 'xiāng': 'xiang1',
'xiáng': 'xiang2', 'xiǎng': 'xiang3', 'xiàng': 'xiang4', 'xiao': 'xiao5', 'xiāo': 'xiao1', 'xiáo': 'xiao2',
'xiǎo': 'xiao3', 'xiào': 'xiao4', 'xie': 'xie5', 'xiē': 'xie1', 'xié': 'xie2', 'xiě': 'xie3', 'xiè': 'xie4',
'xin': 'xin5', 'xīn': 'xin1', 'xín': 'xin2', 'xǐn': 'xin3', 'xìn': 'xin4', 'xing': 'xing5', 'xīng': 'xing1',
'xíng': 'xing2', 'xǐng': 'xing3', 'xìng': 'xing4', 'xiong': 'xiong5', 'xiōng': 'xiong1', 'xióng': 'xiong2',
'xiǒng': 'xiong3', 'xiòng': 'xiong4', 'xiu': 'xiu5', 'xiū': 'xiu1', 'xiú': 'xiu2', 'xiǔ': 'xiu3', 'xiù': 'xiu4',
'xu': 'xu5', '': 'xu1', '': 'xu2', '': 'xu3', '': 'xu4', 'xuan': 'xuan5', 'xuān': 'xuan1', 'xuán': 'xuan2',
'xuǎn': 'xuan3', 'xuàn': 'xuan4', 'xue': 'xue5', 'xuē': 'xue1', 'xué': 'xue2', 'xuě': 'xue3', 'xuè': 'xue4',
'xun': 'xun5', 'xūn': 'xun1', 'xún': 'xun2', 'xǔn': 'xun3', 'xùn': 'xun4', 'ya': 'ya5', '': 'ya1', '': 'ya2',
'': 'ya3', '': 'ya4', 'yan': 'yan5', 'yān': 'yan1', 'yán': 'yan2', 'yǎn': 'yan3', 'yàn': 'yan4',
'yang': 'yang5', 'yāng': 'yang1', 'yáng': 'yang2', 'yǎng': 'yang3', 'yàng': 'yang4', 'yao': 'yao5', 'yāo': 'yao1',
'yáo': 'yao2', 'yǎo': 'yao3', 'yào': 'yao4', 'ye': 'ye5', '': 'ye1', '': 'ye2', '': 'ye3', '': 'ye4',
'yi': 'yi5', '': 'yi1', '': 'yi2', '': 'yi3', '': 'yi4', 'yin': 'yin5', 'yīn': 'yin1', 'yín': 'yin2',
'yǐn': 'yin3', 'yìn': 'yin4', 'ying': 'ying5', 'yīng': 'ying1', 'yíng': 'ying2', 'yǐng': 'ying3', 'yìng': 'ying4',
'yo': 'yo5', '': 'yo1', '': 'yo2', '': 'yo3', '': 'yo4', 'yong': 'yong5', 'yōng': 'yong1', 'yóng': 'yong2',
'yǒng': 'yong3', 'yòng': 'yong4', 'you': 'you5', 'yōu': 'you1', 'yóu': 'you2', 'yǒu': 'you3', 'yòu': 'you4',
'yu': 'yu5', '': 'yu1', '': 'yu2', '': 'yu3', '': 'yu4', 'yuan': 'yuan5', 'yuān': 'yuan1', 'yuán': 'yuan2',
'yuǎn': 'yuan3', 'yuàn': 'yuan4', 'yue': 'yue5', 'yuē': 'yue1', 'yué': 'yue2', 'yuě': 'yue3', 'yuè': 'yue4',
'yun': 'yun5', 'yūn': 'yun1', 'yún': 'yun2', 'yǔn': 'yun3', 'yùn': 'yun4', 'za': 'za5', '': 'za1', '': 'za2',
'': 'za3', '': 'za4', 'zai': 'zai5', 'zāi': 'zai1', 'zái': 'zai2', 'zǎi': 'zai3', 'zài': 'zai4', 'zan': 'zan5',
'zān': 'zan1', 'zán': 'zan2', 'zǎn': 'zan3', 'zàn': 'zan4', 'zang': 'zang5', 'zāng': 'zang1', 'záng': 'zang2',
'zǎng': 'zang3', 'zàng': 'zang4', 'zao': 'zao5', 'zāo': 'zao1', 'záo': 'zao2', 'zǎo': 'zao3', 'zào': 'zao4',
'ze': 'ze5', '': 'ze1', '': 'ze2', '': 'ze3', '': 'ze4', 'zei': 'zei5', 'zēi': 'zei1', 'zéi': 'zei2',
'zěi': 'zei3', 'zèi': 'zei4', 'zen': 'zen5', 'zēn': 'zen1', 'zén': 'zen2', 'zěn': 'zen3', 'zèn': 'zen4',
'zeng': 'zeng5', 'zēng': 'zeng1', 'zéng': 'zeng2', 'zěng': 'zeng3', 'zèng': 'zeng4', 'zha': 'zha5', 'zhā': 'zha1',
'zhá': 'zha2', 'zhǎ': 'zha3', 'zhà': 'zha4', 'zhai': 'zhai5', 'zhāi': 'zhai1', 'zhái': 'zhai2', 'zhǎi': 'zhai3',
'zhài': 'zhai4', 'zhan': 'zhan5', 'zhān': 'zhan1', 'zhán': 'zhan2', 'zhǎn': 'zhan3', 'zhàn': 'zhan4',
'zhang': 'zhang5', 'zhāng': 'zhang1', 'zháng': 'zhang2', 'zhǎng': 'zhang3', 'zhàng': 'zhang4', 'zhao': 'zhao5',
'zhāo': 'zhao1', 'zháo': 'zhao2', 'zhǎo': 'zhao3', 'zhào': 'zhao4', 'zhe': 'zhe5', 'zhē': 'zhe1', 'zhé': 'zhe2',
'zhě': 'zhe3', 'zhè': 'zhe4', 'zhen': 'zhen5', 'zhēn': 'zhen1', 'zhén': 'zhen2', 'zhěn': 'zhen3', 'zhèn': 'zhen4',
'zheng': 'zheng5', 'zhēng': 'zheng1', 'zhéng': 'zheng2', 'zhěng': 'zheng3', 'zhèng': 'zheng4', 'zhi': 'zhi5',
'zhī': 'zhi1', 'zhí': 'zhi2', 'zhǐ': 'zhi3', 'zhì': 'zhi4', 'zhong': 'zhong5', 'zhōng': 'zhong1', 'zhóng': 'zhong2',
'zhǒng': 'zhong3', 'zhòng': 'zhong4', 'zhou': 'zhou5', 'zhōu': 'zhou1', 'zhóu': 'zhou2', 'zhǒu': 'zhou3',
'zhòu': 'zhou4', 'zhu': 'zhu5', 'zhū': 'zhu1', 'zhú': 'zhu2', 'zhǔ': 'zhu3', 'zhù': 'zhu4', 'zhua': 'zhua5',
'zhuā': 'zhua1', 'zhuá': 'zhua2', 'zhuǎ': 'zhua3', 'zhuà': 'zhua4', 'zhuai': 'zhuai5', 'zhuāi': 'zhuai1',
'zhuái': 'zhuai2', 'zhuǎi': 'zhuai3', 'zhuài': 'zhuai4', 'zhuan': 'zhuan5', 'zhuān': 'zhuan1', 'zhuán': 'zhuan2',
'zhuǎn': 'zhuan3', 'zhuàn': 'zhuan4', 'zhuang': 'zhuang5', 'zhuāng': 'zhuang1', 'zhuáng': 'zhuang2',
'zhuǎng': 'zhuang3', 'zhuàng': 'zhuang4', 'zhui': 'zhui5', 'zhuī': 'zhui1', 'zhuí': 'zhui2', 'zhuǐ': 'zhui3',
'zhuì': 'zhui4', 'zhun': 'zhun5', 'zhūn': 'zhun1', 'zhún': 'zhun2', 'zhǔn': 'zhun3', 'zhùn': 'zhun4',
'zhuo': 'zhuo5', 'zhuō': 'zhuo1', 'zhuó': 'zhuo2', 'zhuǒ': 'zhuo3', 'zhuò': 'zhuo4', 'zi': 'zi5', '': 'zi1',
'': 'zi2', '': 'zi3', '': 'zi4', 'zong': 'zong5', 'zōng': 'zong1', 'zóng': 'zong2', 'zǒng': 'zong3',
'zòng': 'zong4', 'zou': 'zou5', 'zōu': 'zou1', 'zóu': 'zou2', 'zǒu': 'zou3', 'zòu': 'zou4', 'zu': 'zu5',
'': 'zu1', '': 'zu2', '': 'zu3', '': 'zu4', 'zuan': 'zuan5', 'zuān': 'zuan1', 'zuán': 'zuan2',
'zuǎn': 'zuan3', 'zuàn': 'zuan4', 'zui': 'zui5', 'zuī': 'zui1', 'zuí': 'zui2', 'zuǐ': 'zui3', 'zuì': 'zui4',
'zun': 'zun5', 'zūn': 'zun1', 'zún': 'zun2', 'zǔn': 'zun3', 'zùn': 'zun4', 'zuo': 'zuo5', 'zuō': 'zuo1',
'zuó': 'zuo2', 'zuǒ': 'zuo3', 'zuò': 'zuo4', 'zhei': 'zhei5', 'zhēi': 'zhei1', 'zhéi': 'zhei2', 'zhěi': 'zhei3',
'zhèi': 'zhei4', 'kei': 'kei5', 'kēi': 'kei1', 'kéi': 'kei2', 'kěi': 'kei3', 'kèi': 'kei4', 'tei': 'tei5',
'tēi': 'tei1', 'téi': 'tei2', 'těi': 'tei3', 'tèi': 'tei4', 'len': 'len5', 'lēn': 'len1', 'lén': 'len2',
'lěn': 'len3', 'lèn': 'len4', 'nun': 'nun5', 'nūn': 'nun1', 'nún': 'nun2', 'nǔn': 'nun3', 'nùn': 'nun4',
'nia': 'nia5', 'niā': 'nia1', 'niá': 'nia2', 'niǎ': 'nia3', 'nià': 'nia4', 'rua': 'rua5', 'ruā': 'rua1',
'ruá': 'rua2', 'ruǎ': 'rua3', 'ruà': 'rua4', 'fiao': 'fiao5', 'fiāo': 'fiao1', 'fiáo': 'fiao2', 'fiǎo': 'fiao3',
'fiào': 'fiao4', 'cei': 'cei5', 'cēi': 'cei1', 'céi': 'cei2', 'cěi': 'cei3', 'cèi': 'cei4', 'wong': 'wong5',
'wōng': 'wong1', 'wóng': 'wong2', 'wǒng': 'wong3', 'wòng': 'wong4', 'din': 'din5', 'dīn': 'din1', 'dín': 'din2',
'dǐn': 'din3', 'dìn': 'din4', 'chua': 'chua5', 'chuā': 'chua1', 'chuá': 'chua2', 'chuǎ': 'chua3', 'chuà': 'chua4',
'n': 'n5', 'n1': 'n1', 'ń': 'n2', 'ň': 'n3', 'ǹ': 'n4', 'ng': 'ng5', 'ng1': 'ng1', 'ńg': 'ng2', 'ňg': 'ng3',
'ǹg': 'ng4'}
# Reverse lookup table (letters-plus-tone-digit style -> tone-mark style),
# built by swapping the keys and values of guobiao2shengyundiao_dict. If
# several keys share the same value, the entry that comes last wins.
shengyundiao2guobiao_dict = {v: k for k, v in guobiao2shengyundiao_dict.items()}
def guobiao2shengyundiao(pinyin_list):
    """Convert tone-marked (national-standard) pinyin into numbered-tone pinyin.

    Every item of ``pinyin_list`` is looked up in ``guobiao2shengyundiao_dict``;
    an item without an entry produces ``None`` at the same position.
    """
    return [guobiao2shengyundiao_dict.get(syllable) for syllable in pinyin_list]
def shengyundiao2guobiao(pinyin_list):
    """Convert numbered-tone pinyin into tone-marked (national-standard) pinyin.

    Every item of ``pinyin_list`` is looked up in ``shengyundiao2guobiao_dict``;
    an item without an entry produces ``None`` at the same position.
    """
    return [shengyundiao2guobiao_dict.get(syllable) for syllable in pinyin_list]
if __name__ == "__main__":
    logger.info(__file__)
    # Round-trip smoke test: numbered-tone -> tone-marked -> numbered-tone.
    out = shengyundiao2guobiao('ni2 hao3 a5'.split())
    # The first item must be the tone-marked form of 'ni2'; an empty string
    # here could never convert back to 'ni2' in the assertion below.
    assert out == ['ní', 'hǎo', 'a']
    out = guobiao2shengyundiao(out)
    assert out == ['ni2', 'hao3', 'a5']

@ -0,0 +1,78 @@
#!usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2020/2/16
"""
#### symbol
音素标记
中文音素简单英文音素简单中文音素
"""
_pad = '_' # padding symbol
_eos = '~' # end-of-sequence symbol
_chain = '-' # connector placed between pronunciation units
# Presumably the out-of-vocabulary marker (inferred from the name); it is not
# part of any symbol table defined below.
_oov = '*'
# Chinese phoneme inventory
# Initials (27)
_shengmu = [
'aa', 'b', 'c', 'ch', 'd', 'ee', 'f', 'g', 'h', 'ii', 'j', 'k', 'l', 'm', 'n', 'oo', 'p', 'q', 'r', 's', 'sh',
't', 'uu', 'vv', 'x', 'z', 'zh'
]
# Finals (41)
_yunmu = [
'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing',
'iong', 'iu', 'ix', 'iy', 'iz', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'ueng', 'ui', 'un', 'uo', 'v',
'van', 've', 'vn', 'ng', 'uong'
]
# Tones (5)
_shengdiao = ['1', '2', '3', '4', '5']
# Letter pairs, upper case followed by lower case (26)
_alphabet = 'Aa Bb Cc Dd Ee Ff Gg Hh Ii Jj Kk Ll Mm Nn Oo Pp Qq Rr Ss Tt Uu Vv Ww Xx Yy Zz'.split()
# English upper-case letters (26)
_english = 'A B C D E F G H I J K L M N O P Q R S T U V W X Y Z'.split()
# Punctuation marks (10)
_biaodian = '! ? . , ; : " # ( )'.split()
# Note (canonical mark = characters folded into it):
# !=!|?=?|.=.。|,=,,、|;=;|:=:|"="“|#= \t|(=([{{【<《|)=)]}}】>》
# Other (7)
_other = 'w y 0 6 7 8 9'.split()
# Upper-case letters (26)
_upper = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
# Lower-case letters (26)
_lower = list('abcdefghijklmnopqrstuvwxyz')
# Punctuation characters, including the space (12)
_punctuation = list('!\'"(),-.:;? ')
# Digits (10)
_digit = list('0123456789')
# Letters and punctuation (64)
# For English: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'"(),-.:;?\s
_character_en = _upper + _lower + _punctuation
# Letters, digits and punctuation (74)
# For English or Chinese: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'"(),-.:;?\s0123456789
_character_cn = _upper + _lower + _punctuation + _digit
# Chinese phoneme symbols (145)
# Supports Chinese, English and mixed Chinese-English text; per the original
# note, Chinese text is converted to the Tsinghua University standard phoneme
# representation.
# The position of a symbol in this list is significant, so do not reorder it.
symbol_chinese = [_pad, _eos, _chain] + _shengmu + _yunmu + _shengdiao + _alphabet + _english + _biaodian + _other
# Simple English symbols (66)
# Supports English text.
# ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'"(),-.:;?\s
symbol_english_simple = [_pad, _eos] + _upper + _lower + _punctuation
# Simple Chinese symbols (76)
# Supports English and Chinese text; Chinese text is converted to pinyin strings.
# ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'"(),-.:;?\s0123456789
symbol_chinese_simple = [_pad, _eos] + _upper + _lower + _punctuation + _digit

@ -0,0 +1,19 @@
Copyright (c) 2017 Keith Ito
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

@ -0,0 +1,116 @@
"""
### english
from https://github.com/keithito/tacotron "
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
"""
import re
import random
from . import cleaners
from .symbols import symbols
# Lookup tables between symbols and numeric IDs; a symbol's ID is its index in
# ``symbols`` (for a repeated symbol the highest index wins in _symbol_to_id).
_id_to_symbol = dict(enumerate(symbols))
_symbol_to_id = {sym: idx for idx, sym in _id_to_symbol.items()}
# Pattern splitting text into: text before the first "{...}", the brace
# contents, and everything after the closing brace.
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
def get_arpabet(word, dictionary):
    """Return the brace-wrapped ARPAbet form of *word* if the dictionary knows it.

    ``dictionary.lookup(word)`` is consulted. When it returns ``None`` the word
    is handed back unchanged; otherwise the first pronunciation is returned
    wrapped in curly braces.
    """
    pronunciations = dictionary.lookup(word)
    if pronunciations is None:
        return word
    return "{" + pronunciations[0] + "}"
def text_to_sequence(text, cleaner_names, dictionary=None, p_arpabet=1.0):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through
      dictionary: optional object with a ``lookup(word)`` method; when given, the
        brace-free remainder of the text is split on single spaces and each word
        may be replaced by its ARPAbet pronunciation
      p_arpabet: a word is passed to the dictionary only when
        ``random.random() < p_arpabet``

    Returns:
      List of integers corresponding to the symbols in the text. An empty list
      is returned when nothing is produced (for example for empty text).
    '''
    sequence = []
    space = _symbols_to_sequence(' ')
    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            clean_text = _clean_text(text, cleaner_names)
            if dictionary is not None:
                words = [get_arpabet(w, dictionary)
                         if random.random() < p_arpabet else w
                         for w in clean_text.split(" ")]
                for token in words:
                    if token.startswith("{"):
                        sequence += _arpabet_to_sequence(token[1:-1])
                    else:
                        sequence += _symbols_to_sequence(token)
                    # Words are re-joined with a space; the final one is
                    # stripped after the loop.
                    sequence += space
            else:
                sequence += _symbols_to_sequence(clean_text)
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)
    # Remove a trailing space. Guard against an empty result (or a symbol table
    # without a space) so indexing cannot raise IndexError.
    if sequence and space and sequence[-1] == space[0]:
        sequence = sequence[:-1]
    return sequence
def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string.

    IDs missing from the symbol table are skipped. Symbols stored with an "@"
    prefix are written back inside curly braces, and adjacent brace groups are
    merged with a single space between them.
    '''
    pieces = []
    for symbol_id in sequence:
        if symbol_id not in _id_to_symbol:
            continue
        symbol = _id_to_symbol[symbol_id]
        # Enclose ARPAbet back in curly braces:
        if len(symbol) > 1 and symbol[0] == '@':
            symbol = '{%s}' % symbol[1:]
        pieces.append(symbol)
    return ''.join(pieces).replace('}{', ' ')
def _clean_text(text, cleaner_names):
    '''Runs *text* through each named cleaner, in order, and returns the result.

    Args:
      text: string to clean
      cleaner_names: iterable of attribute names looked up on the ``cleaners`` module

    Raises:
      Exception: if a name does not resolve to a cleaner.
    '''
    for name in cleaner_names:
        # Pass a default so an unknown name reaches the explicit error below
        # instead of escaping from getattr() as a bare AttributeError.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
def _symbols_to_sequence(symbols):
    '''Maps each symbol to its ID, skipping symbols rejected by ``_should_keep_symbol``.'''
    ids = []
    for sym in symbols:
        if _should_keep_symbol(sym):
            ids.append(_symbol_to_id[sym])
    return ids
def _arpabet_to_sequence(text):
    '''Converts whitespace-separated ARPAbet symbols to IDs via their "@"-prefixed table entries.'''
    prefixed = ['@' + phoneme for phoneme in text.split()]
    return _symbols_to_sequence(prefixed)
def _should_keep_symbol(s):
    '''Returns True when *s* is in the symbol table and is neither '_' nor '~'.'''
    # Compare by value: ``is not`` against a string literal tests object
    # identity, which only works by accident of string interning and triggers
    # a SyntaxWarning on recent Python versions.
    return s in _symbol_to_id and s != '_' and s != '~'

@ -0,0 +1,91 @@
'''
### english
from https://github.com/keithito/tacotron "
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''
import re
from unidecode import unidecode
from .numbers import normalize_numbers
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations.
# Each pattern is a word boundary, the abbreviation, then a literal period,
# matched case-insensitively, so the period is consumed by the replacement.
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
def expand_abbreviations(text):
    '''Replaces every abbreviation listed in ``_abbreviations`` with its expansion.'''
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
def expand_numbers(text):
    '''Returns *text* after passing it through ``normalize_numbers``.'''
    normalized = normalize_numbers(text)
    return normalized
def lowercase(text):
    '''Returns *text* converted to lower case.'''
    lowered = text.lower()
    return lowered
def collapse_whitespace(text):
    '''Replaces every run of whitespace in *text* with a single space.'''
    return _whitespace_re.sub(' ', text)
def convert_to_ascii(text):
    '''Returns *text* transliterated by ``unidecode``.'''
    transliterated = unidecode(text)
    return transliterated
def basic_cleaners(text):
    '''Basic pipeline: lowercase, then collapse whitespace; no transliteration.'''
    return collapse_whitespace(lowercase(text))
def transliteration_cleaners(text):
    '''Pipeline for non-English text: transliterate to ASCII, lowercase, collapse whitespace.'''
    for step in (convert_to_ascii, lowercase, collapse_whitespace):
        text = step(text)
    return text
def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.

    Steps run in this order: ASCII transliteration, lowercasing, number
    expansion, abbreviation expansion, whitespace collapsing.
    '''
    pipeline = (
        convert_to_ascii,
        lowercase,
        expand_numbers,
        expand_abbreviations,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text

File diff suppressed because it is too large Load Diff

@ -0,0 +1,65 @@
""" from https://github.com/keithito/tacotron """
import re
# Phoneme symbols accepted in a pronunciation. Many appear both bare and with
# a trailing 0/1/2 digit (presumably the stress level — confirm against the
# CMUdict documentation).
valid_symbols = [
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
]
# Set form of the list above, used for membership tests.
_valid_symbol_set = set(valid_symbols)
class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        '''Loads entries from a path (opened as latin-1) or from an iterable of lines.

        With ``keep_ambiguous=False`` only words that have exactly one
        pronunciation are kept.
        '''
        if not isinstance(file_or_path, str):
            loaded = _parse_cmudict(file_or_path)
        else:
            with open(file_or_path, encoding='latin-1') as handle:
                loaded = _parse_cmudict(handle)
        if not keep_ambiguous:
            loaded = {
                headword: variants
                for headword, variants in loaded.items()
                if len(variants) == 1
            }
        self._entries = loaded

    def __len__(self):
        '''Returns the number of distinct words stored.'''
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word, or None if unknown.'''
        key = word.upper()
        return self._entries.get(key)
# Matches the "(N)" suffix that marks an alternative pronunciation of a word.
_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    '''Parses CMUdict-formatted lines into ``{word: [pronunciation, ...]}``.

    Only lines whose first character is an upper-case ASCII letter or an
    apostrophe are considered. The "(N)" alternative marker is removed from the
    headword so that all variants collect under one key. Lines without a
    pronunciation, or whose pronunciation contains an unknown symbol, are
    skipped.

    Args:
      file: iterable of text lines (for example an open file)

    Returns:
      dict mapping each word to the list of its pronunciation strings
    '''
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
            # Separate the headword from its phonemes on the first run of
            # whitespace. This keeps the whole phoneme string together whether
            # the separator is one space or two.
            parts = line.split(None, 1)
            if len(parts) < 2:
                continue  # headword with nothing after it
            word = re.sub(_alt_re, '', parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
def _get_pronunciation(s):
    '''Returns the stripped, space-joined pronunciation, or None if any symbol is unknown.'''
    phonemes = s.strip().split(' ')
    if any(phoneme not in _valid_symbol_set for phoneme in phonemes):
        return None
    return ' '.join(phonemes)

@ -0,0 +1,71 @@
""" from https://github.com/keithito/tacotron """
import inflect
import re
# Shared inflect engine used to spell numbers out as words.
_inflect = inflect.engine()
# A digit, one or more digits/commas, then a digit (e.g. "1,234"; also matches
# plain runs of three or more digits).
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Digits, a period, digits (e.g. "3.14").
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# A pound sign followed by digits/commas ending in a digit; group 1 is the amount.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# A dollar sign followed by digits/periods/commas ending in a digit; group 1 is the amount.
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
# Digits immediately followed by an ordinal suffix.
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any run of digits.
_number_re = re.compile(r'[0-9]+')
def _remove_commas(m):
    '''Returns group 1 of the match with every comma removed.'''
    digits_with_commas = m.group(1)
    return digits_with_commas.replace(',', '')
def _expand_decimal_point(m):
    '''Returns group 1 of the match with each "." spelled out as " point ".'''
    number = m.group(1)
    return number.replace('.', ' point ')
def _expand_dollars(m):
    '''Spells out a dollar amount captured in group 1 of the match.

    The amount is split on "."; the part before is read as whole dollars and
    the part after as cents, each pluralised unless equal to 1. An amount with
    more than one "." is returned as-is followed by " dollars"; an amount of
    zero becomes "zero dollars".
    '''
    amount = m.group(1)
    pieces = amount.split('.')
    if len(pieces) > 2:
        return amount + ' dollars'  # Unexpected format
    dollars = int(pieces[0]) if pieces[0] else 0
    cents = int(pieces[1]) if len(pieces) > 1 and pieces[1] else 0
    dollar_text = '%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars')
    cent_text = '%s %s' % (cents, 'cent' if cents == 1 else 'cents')
    if dollars and cents:
        return '%s, %s' % (dollar_text, cent_text)
    if dollars:
        return dollar_text
    if cents:
        return cent_text
    return 'zero dollars'
def _expand_ordinal(m):
    '''Returns the whole match (digits plus suffix) converted by ``_inflect.number_to_words``.'''
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
def _expand_number(m):
    '''Spells out the matched integer, with special handling for 1001-2999.

    Inside that range: 2000 is "two thousand"; 2001-2009 are "two thousand"
    plus the last two digits; multiples of 100 are "<n> hundred"; everything
    else is converted in two-digit groups with "oh" for zero. Outside the
    range the number is converted directly, without "and".
    '''
    num = int(m.group(0))
    if not 1000 < num < 3000:
        return _inflect.number_to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if 2000 < num < 2010:
        return 'two thousand ' + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + ' hundred'
    grouped = _inflect.number_to_words(num, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
def normalize_numbers(text):
    '''Rewrites numeric expressions in *text* as words.

    The substitutions run in a fixed order: comma removal, pounds, dollars,
    decimal points, ordinals, then any remaining digit runs.
    '''
    rewrites = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in rewrites:
        text = pattern.sub(replacement, text)
    return text

@ -0,0 +1,21 @@
""" from https://github.com/keithito/tacotron """
'''
Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
from . import cmudict
# Character groups that make up the single-character part of the symbol table.
_punctuation = '!\'",.:;? '
_math = '#%&*+-/[]()'
_special = '_@©°½—₩€$'
_accented = 'áçéêëñöøćž'
_numbers = '0123456789'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as
# uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]
# Export all symbols: every single character above, in the order concatenated,
# followed by the "@"-prefixed ARPAbet symbols.
symbols = list(_punctuation + _math + _special + _accented + _numbers + _letters) + _arpabet

@ -0,0 +1,50 @@
"""
### pinyinkit
文本转拼音的模块依赖python-pinyinjiebaphrase-pinyin-data模块
"""
import re
from pypinyin import lazy_pinyin, Style
# Kept for compatibility with versions before 0.1.0.
# Tone 5 is the neutral tone.
# Captures a single trailing tone digit (1-5) at the end of a pinyin string.
_diao_re = re.compile(r"([12345]$)")
def text2pinyin(text, errors=None, **kwargs):
    """
    Convert Chinese text into a list of pinyin strings.

    :param text: str, Chinese text
    :param errors: function, handler for characters that cannot be converted;
        ``default_errors`` is used when omitted
    :param kwargs: forwarded unchanged to ``lazy_pinyin``
    :return: list, pinyin list
    """
    handler = default_errors if errors is None else errors
    return lazy_pinyin(text, style=Style.TONE3, errors=handler, strict=True, neutral_tone_with_five=True, **kwargs)
def default_errors(x):
    """Default handler for unconvertible text: return its items as a list."""
    return [item for item in x]
def split_pinyin(py):
    """
    Split one pinyin string into its toneless part and its tone digit.

    :param py: str, pinyin string
    :return: list, ``[toneless_pinyin, tone]``; the tone is "5" when the string
        does not end in a tone digit
    """
    pieces = _diao_re.split(py)
    if len(pieces) != 1:
        return [pieces[0], pieces[1]]
    return [py, "5"]
if __name__ == "__main__":
    print(__file__)
    # Smoke tests: plain Chinese text, then text mixed with characters that
    # are passed through the error handler one by one.
    assert text2pinyin("拼音") == ['pin1', 'yin1']
    assert text2pinyin("汉字,a1") == ['han4', 'zi4', ',', 'a', '1']

@ -0,0 +1,4 @@
jieba
inflect
unidecode
tqdm

@ -0,0 +1,44 @@
#!usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2019/12/1
"""
local
"""
import logging
logging.basicConfig(level=logging.INFO)
def run_text2phoneme():
    """Print the phoneme list and the ID sequence phkit produces for a sample sentence."""
    from phkit.chinese.sequence import text2phoneme, text2sequence
    sample = "汉字转音素TTS《Text to speech》。"
    # Other sentences to try: "岂有此理", "我的儿子玩会儿"
    phonemes = text2phoneme(sample)
    print(phonemes)
    # Recorded output:
    # ['h', 'an', '4', '-', 'z', 'iy', '4', '-', 'zh', 'uan', '3', '-', 'ii', 'in', '1', '-', 's', 'u', '4', '-', ',',
    # 'Tt', 'Tt', 'Ss', ':', '(', 'T', 'E', 'X', 'T', '#', 'T', 'O', '#', 'S', 'P', 'E', 'E', 'C', 'H', ')', '.', '-',
    # '~', '_']
    ids = text2sequence(sample)
    print(ids)
    # Recorded output:
    # [11, 32, 76, 2, 28, 51, 76, 2, 29, 59, 75, 2, 12, 46, 73, 2, 22, 56, 76, 2, 133, 97, 97, 96, 135, 138, 123, 108,
    # 127, 123, 137, 123, 118, 137, 122, 119, 108, 108, 106, 111, 139, 132, 2, 1, 0]
def run_english():
    """Convert a sample English phrase to an ID sequence and back, printing both."""
    from phkit.english import text_to_sequence, sequence_to_text
    from phkit.english.cmudict import CMUDict
    pronouncing_dict = CMUDict('phkit/english/cmu_dictionary')
    ids = text_to_sequence("text to speech", cleaner_names=["english_cleaners"], dictionary=pronouncing_dict)
    print(ids)
    print(sequence_to_text(ids))
if __name__ == "__main__":
    print(__file__)
    # Run both demos: the Chinese one first, then the English one.
    run_text2phoneme()
    run_english()

@ -0,0 +1,86 @@
#!usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2019/12/15
"""
语音处理工具箱
生成whl格式安装包python setup.py bdist_wheel
直接上传pypipython setup.py sdist upload
用twine上传pypi
生成安装包python setup.py sdist
上传安装包twine upload dist/phkit-0.0.3.tar.gz
注意需要在home目录下建立.pypirc配置文件文件内容格式
[distutils]
index-servers=pypi
[pypi]
repository = https://upload.pypi.org/legacy/
username: admin
password: admin
"""
from setuptools import setup, find_packages
import os
import logging
logging.basicConfig(level=logging.INFO)
# Logger named after the last path component of __name__, with any extension removed.
logger = logging.getLogger(os.path.splitext(os.path.basename(__name__))[0])
# Runtime dependencies; the same list is passed to setup(install_requires=...).
install_requires = ['jieba>=0.42.1', 'tqdm', 'inflect', 'unidecode']
# Alias of the very same list object, not a copy.
requires = install_requires
def create_readme():
    """Write every item of ``phkit.readme_docs`` to README.md and return the joined text."""
    from phkit import readme_docs
    written = []
    with open("README.md", "wt", encoding="utf8") as readme:
        for section in readme_docs:
            readme.write(section)
            written.append(section)
    return "".join(written)
def pip_install():
    """Best-effort installation of every required package with pip.

    Each distinct requirement is installed once by running pip as a module of
    the current interpreter. A failure is logged and does not abort the loop.
    """
    import subprocess
    import sys

    seen = set()
    for pkg in install_requires + requires:
        # ``requires`` aliases ``install_requires``, so skip repeats.
        if pkg in seen:
            continue
        seen.add(pkg)
        try:
            # Pass an argument list (no shell): in a shell command line the
            # ">" of a spec such as "jieba>=0.42.1" is an output redirection.
            status = subprocess.call([sys.executable, "-m", "pip", "install", pkg])
        except Exception:
            status = -1
        # A non-zero exit status is how pip reports failure; it raises nothing.
        if status != 0:
            logger.info("pip install {} failed".format(pkg))
# Side effects at import time: install dependencies, then regenerate README.md
# (its text doubles as the long description).
pip_install()
phkit_doc = create_readme()
# Imported only after pip_install() has run.
from phkit import __version__ as phkit_version
setup(
    name="phkit",
    version=phkit_version,
    author="kuangdd",
    author_email="kuangdd@foxmail.com",
    description="phoneme toolkit",
    long_description=phkit_doc,
    long_description_content_type="text/markdown",
    url="https://github.com/KuangDD/phkit",
    packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
    install_requires=install_requires, # minimum dependencies the project needs to run
    python_requires='>=3.5', # supported Python versions
    # NOTE(review): setuptools documents package_data keys as package names;
    # 'txt' and 'md' look like file types — confirm these entries take effect.
    package_data={
        'txt': ['requirements.txt'],
        'md': ['**/*.md', '*.md'],
    }, # package data, usually data closely tied to the package implementation
    classifiers=[
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Build Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        "Operating System :: OS Independent",
    ],
)
if __name__ == "__main__":
    print(__file__)

@ -0,0 +1,61 @@
#!usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2020/2/18
"""
"""
def test_phkit():
    """Check phkit's Chinese phoneme and ID-sequence conversions against fixed expectations."""
    from phkit import text2phoneme, text2sequence, symbol_chinese
    from phkit import chinese_sequence_to_text, chinese_text_to_sequence
    # NOTE(review): target_ph expects ',' and ':' phonemes, but no comma or
    # colon is visible in this literal — confirm the text was not altered.
    text = "汉字转音素TTS《Text to speech》。"
    target_ph = ['h', 'an', '4', '-', 'z', 'iy', '4', '-', 'zh', 'uan', '3', '-', 'ii', 'in', '1', '-', 's', 'u', '4',
                 '-', ',', '-',
                 'Tt', 'Tt', 'Ss', '-', ':', '-', '(', '-', 'T', 'E', 'X', 'T', '-', '#', '-', 'T', 'O', '-', '#', '-',
                 'S', 'P', 'E', 'E', 'C', 'H', '-', ')', '-', '.', '-', '~', '_']
    result = text2phoneme(text)
    assert result == target_ph
    target_seq = [11, 32, 74, 2, 28, 51, 74, 2, 29, 59, 73, 2, 12, 46, 71, 2, 22, 56, 74, 2, 131, 2, 95, 95, 94, 2, 133,
                  2, 136, 2, 121,
                  106, 125, 121, 2, 135, 2, 121, 116, 2, 135, 2, 120, 117, 106, 106, 104, 109, 2, 137, 2, 130, 2, 1, 0]
    # Both entry points are expected to produce the same ID sequence.
    result = text2sequence(text)
    assert result == target_seq
    result = chinese_text_to_sequence(text)
    assert result == target_seq
    # Converting the IDs back should give the phonemes joined by single spaces.
    target_ph = ' '.join(target_ph)
    result = chinese_sequence_to_text(result)
    assert result == target_ph
    assert len(symbol_chinese) == 145
    text = "岂有此理"
    target = ['q', 'i', '2', '-', 'ii', 'iu', '3', '-', 'c', 'iy', '2', '-', 'l', 'i', '3', '-', '~', '_']
    result = text2phoneme(text)
    assert result == target
    text = "我的儿子玩会儿"
    target = ['uu', 'uo', '3', '-', 'd', 'e', '5', '-', 'ee', 'er', '2', '-', 'z', 'iy', '5', '-', 'uu', 'uan', '2',
              '-', 'h', 'ui', '4', '-', 'ee', 'er', '5', '-', '~', '_']
    result = text2phoneme(text)
    assert result == target
def test_convert():
    """Check the width (ban2quan/quan2ban) and script (jian2fan/fan2jian) converters."""
    from phkit import ban2quan, quan2ban, jian2fan, fan2jian
    # NOTE(review): the input and expected strings of the two width checks
    # are identical as shown — confirm the literals were not normalized.
    assert ban2quan("aA1 ,:$。、") == "aA1 ,:$。、"
    assert quan2ban("aA1 ,:$。、") == "aA1 ,:$。、"
    assert jian2fan("中国语言") == "中國語言"
    assert fan2jian("中國語言") == "中国语言"
    print(fan2jian("中國語言"))
    print(jian2fan("中国语言"))
if __name__ == "__main__":
    print(__file__)
    # Allow running the checks directly, without a test runner.
    test_phkit()
    test_convert()

@ -16,6 +16,7 @@ from pypinyin.converter import DefaultConverter
from pypinyin.seg import mmseg
from pypinyin.seg import simpleseg
from pypinyin.utils import (_replace_tone2_style_dict_to_default)
import jieba
TStyle = Style
TErrors = Union[Callable[[Text], Text], Text]
@ -139,7 +140,8 @@ class Pinyin():
:param hans: 分词前的字符串
:return: ``None`` or ``list``
"""
pass
outs = jieba.lcut(hans) # 默认用jieba分词从语义角度分词。
return outs
def post_seg(self, hans: Text, seg_data: List[Text],
**kwargs: Any) -> Optional[List[Text]]:

@ -10,3 +10,4 @@ Sphinx
tox
twine
wheel>=0.21
jieba

@ -17,7 +17,7 @@ packages = [
'pypinyin.style',
]
requirements = []
requirements = ["jieba"]
if sys.version_info[:2] < (3, 4):
requirements.append('enum34')
if sys.version_info[:2] < (3, 5):

@ -5,12 +5,6 @@ script: tox
matrix:
include:
- python: 2.7
env: TOXENV=py27
- python: 3.4
env: TOXENV=py34
- python: 3.5
env: TOXENV=py35
- python: 3.6
env: TOXENV=py36
- python: 3.6

@ -1,14 +0,0 @@
=======
Credits
=======
Author and Maintainer
---------------------
* Thomas Roten <https://github.com/tsroten>
Contributors
------------
None yet. Why not be the first?

@ -1,88 +0,0 @@
Changes
=======
v0.1.0 (2013-05-05)
-------------------
* Initial release
v0.1.1 (2013-05-05)
-------------------
* Adds zhon.cedict package to setup.py
v0.2.0 (2013-05-07)
-------------------
* Allows for mapping between simplified and traditional.
* Adds logging to build_string().
* Adds constants for numbered Pinyin and accented Pinyin.
v0.2.1 (2013-05-07)
-------------------
* Fixes typo in README.rst.
v.1.0.0 (2014-01-25)
--------------------
* Complete rewrite that refactors code, renames constants, and improves Pinyin
support.
v.1.1.0 (2014-01-28)
--------------------
* Adds ``zhon.pinyin.punctuation`` constant.
* Adds ``zhon.pinyin.accented_syllable``, ``zhon.pinyin.accented_word``, and
``zhon.pinyin.accented_sentence`` constants.
* Adds ``zhon.pinyin.numbered_syllable``, ``zhon.pinyin.numbered_word``, and
``zhon.pinyin.numbered_sentence`` constants.
* Fixes some README.rst typos.
* Clarifies information regarding Traditional and Simplified character
constants in README.rst.
* Adds constant short names to README.rst.
v.1.1.1 (2014-01-29)
--------------------
* Adds documentation.
* Adds ``zhon.cedict.all`` constant.
* Removes duplicate code ranges from ``zhon.hanzi.characters``.
* Makes ``zhon.hanzi.non_stops`` a string containing all non-stops instead of
a string containing code ranges.
* Removes duplicate letters in ``zhon.pinyin.consonants``.
* Refactors Pinyin vowels/consonant code.
* Removes the Latin alpha from ``zhon.pinyin.vowels``. Fixes #16.
* Adds ``cjk_ideographs`` alias for ``zhon.hanzi.characters``.
* Fixes various typos.
* Removes numbers from Pinyin word constants. Fixes #15.
* Adds lowercase and uppercase constants to ``zhon.pinyin``.
* Fixes a bug with ``zhon.pinyin.sentence``.
* Adds ``sent`` alias for ``zhon.pinyin.sentence``.
v.1.1.2 (2014-01-31)
--------------------
* Fixes bug with ``zhon.cedict.all``.
v.1.1.3 (2014-02-12)
--------------------
* Adds Ideographic number zero to ``zhon.hanzi.characters``. Fixes #17.
* Fixes r-suffix bug. Fixes #18.
v.1.1.4 (2015-01-25)
--------------------
* Removes duplicate module declarations in documentation.
* Moves tests inside zhon package.
* Adds travis config file.
* Adds Python 3.4 tests to travis and tox.
* Fixes flake8 warnings.
* Adds distutil fallback import statement to setup.py.
* Adds missing hanzi punctuation. Fixes #19.
v.1.1.5 (2016-05-23)
--------------------
* Add missing Zhuyin characters. Fixes #23.

@ -1,107 +0,0 @@
============
Contributing
============
Contributions are welcome, and they are greatly appreciated! Every
little bit helps, and credit will always be given.
You can contribute in many ways:
Types of Contributions
----------------------
Report Bugs
~~~~~~~~~~~
Report bugs at https://github.com/tsroten/zhon/issues.
If you are reporting a bug, please include:
* Your operating system name and version.
* Any details about your local setup that might be helpful in troubleshooting.
* Detailed steps to reproduce the bug.
Fix Bugs
~~~~~~~~
Look through the GitHub issues for bugs. Anything tagged with "bug"
is open to whoever wants to implement it.
Implement Features
~~~~~~~~~~~~~~~~~~
Look through the GitHub issues for features. Anything tagged with "feature"
is open to whoever wants to implement it.
Write Documentation
~~~~~~~~~~~~~~~~~~~
Zhon could always use more documentation, whether as part of the
official Zhon docs, in docstrings, or even on the web in blog posts,
articles, and such.
Submit Feedback
~~~~~~~~~~~~~~~
The best way to send feedback is to file an issue at https://github.com/tsroten/zhon/issues.
If you are proposing a feature:
* Explain in detail how it would work.
* Keep the scope as narrow as possible, to make it easier to implement.
* Remember that this is a volunteer-driven project, and that contributions
are welcome :)
Get Started!
------------
Ready to contribute? Here's how to set up `zhon` for local development.
1. Fork the `zhon` repo on GitHub.
2. Clone your fork locally::
$ git clone git@github.com:your_name_here/zhon.git
3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
$ mkvirtualenv zhon
$ cd zhon/
$ python setup.py develop
4. Create a branch for local development::
$ git checkout -b name-of-your-bugfix-or-feature
Now you can make your changes locally.
5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox::
$ flake8 zhon
$ python setup.py test
$ tox
To get flake8 and tox, just pip install them into your virtualenv.
You can ignore the flake8 errors regarding `zhon.cedict` files. Rather than include hundreds of newline characters in each file, we are ignoring those errors.
6. Commit your changes and push your branch to GitHub::
$ git add .
$ git commit -m "Your detailed description of your changes."
$ git push origin name-of-your-bugfix-or-feature
7. Submit a pull request through the GitHub website.
Pull Request Guidelines
-----------------------
Before you submit a pull request, check that it meets these guidelines:
1. The pull request should include tests.
2. If the pull request adds functionality, the docs should be updated. Put
your new functionality into a function with a docstring, and add the
feature to the list in README.rst.
3. The pull request should work for Python 2.7, 3.3, and 3.4. Check
https://travis-ci.org/tsroten/zhon/pull_requests
and make sure that the tests pass for all supported Python versions.
4. If you want to receive credit, add your name to `AUTHORS.rst`.

@ -1,7 +0,0 @@
Copyright (c) 2013-2014 Thomas Roten
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

@ -61,4 +61,3 @@ Getting Started
* `Install Zhon <http://zhon.readthedocs.org/en/latest/#installation>`_
* Read `Zhon's introduction <http://zhon.readthedocs.org/en/latest/#using-zhon>`_
* Learn from the `API documentation <http://zhon.readthedocs.org/en/latest/#zhon-hanzi>`_
* `Contribute <https://github.com/tsroten/zhon/blob/develop/CONTRIBUTING.rst>`_ documentation, code, or feedback

@ -37,9 +37,6 @@ setup(
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Linguistic',

@ -1,4 +1,4 @@
PYTHON:= python3.8
PYTHON:= python3.7
.PHONY: all clean
all: virtualenv kenlm.done sox.done soxbindings.done

@ -9,7 +9,7 @@ ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
python3 -u ${MAIN_ROOT}/utils/avg_model.py \
avg_model.py \
--dst_model ${decode_checkpoint} \
--ckpt_dir ${ckpt_dir} \
--num ${average_num} \

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -70,8 +71,8 @@ def main():
fout.write(UNK + '\n') # <unk> must be 1
if args.unit_type == 'spm':
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# --model_prefix=${bpemodel} --input_sentence_size=100000000
import sentencepiece as spm

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#

Loading…
Cancel
Save