diff --git a/deepspeech/decoders/swig/setup.py b/deepspeech/decoders/swig/setup.py index 86af475af..3da5ce8bf 100644 --- a/deepspeech/decoders/swig/setup.py +++ b/deepspeech/decoders/swig/setup.py @@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \ FILES += glob.glob('openfst-1.6.3/src/lib/*.cc') FILES = [ - fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc') - or fn.endswith('unittest.cc')) + fn for fn in FILES + if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith( + 'unittest.cc')) ] LIBS = ['stdc++'] diff --git a/examples/thchs30/README.md b/examples/thchs30/README.md new file mode 100644 index 000000000..da56fffc8 --- /dev/null +++ b/examples/thchs30/README.md @@ -0,0 +1,42 @@ +# THCHS-30 数据集强制对齐实验 +----- +本实验对 THCHS-30 中文数据集用 [Montreal-Forced-Aligner](https://montreal-forced-aligner.readthedocs.io/en/latest/index.html) 进行强制对齐。 +THCHS-30 的文本标注数据分为: + 1. 汉字级别(word),该数据集用空格对词进行了划分,我们在使用时将不同的字按空格划分 + 2. 音节级别(syllable),即汉语中的一个拼音 + 3. 音素级别(phone),一个拼音由多个音素组成,汉语的声母韵母可以理解为音素,不同的数据集有各自的音素标准,THCHS-30 数据集与标贝 BZNSYP 数据集的音素标准略有不同 + + 数据 A11_0 文本示例如下: +``` +绿 是 阳春 烟 景 大块 文章 的 底色 四月 的 林 峦 更是 绿 得 鲜活 秀媚 诗意 盎然↩ +lv4 shi4 yang2 chun1 yan1 jing3 da4 kuai4 wen2 zhang1 de5 di3 se4 si4 yue4 de5 lin2 luan2 geng4 shi4 lv4 de5 xian1 huo2 xiu4 mei4 shi1 yi4 ang4 ran2↩ +l v4 sh ix4 ii iang2 ch un1 ii ian1 j ing3 d a4 k uai4 uu un2 zh ang1 d e5 d i3 s e4 s iy4 vv ve4 d e5 l in2 l uan2 g eng4 sh ix4 l v4 d e5 x ian1 h uo2 x iu4 m ei4 sh ix1 ii i4 aa ang4 r an2 +``` +## 开始实验 +--- +在本项目的 根目录/tools 执行 +``` +make +``` +下载 MFA 的可执行包(也会同时下载本项目所需的其他工具) +执行如下命令: +``` +cd a0 +./run.sh +``` +应用程序会自动下载 THCHS-30 数据集,处理成 MFA 所需的文件格式并开始训练,您可以修改 `run.sh` 中的参数 `LEXICON_NAME` 来决定您需要强制对齐的级别(word、syllable 和 phone) +## MFA 所使用的字典 +--- +MFA 字典的格式请参考: [MFA 官方文档 Dictionary format](https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html) +phone.lexicon 直接使用的是 `THCHS-30/data_thchs30/lm_phone/lexicon.txt` +word.lexicon 考虑到了中文的多音字,使用**带概率的字典**, 
生成规则请参考 `local/gen_word2phone.py` +`syllable.lexicon` 获取自 [DNSun/thchs30-pinyin2tone](https://github.com/DNSun/thchs30-pinyin2tone) +## 对齐结果 +--- +我们提供了三种级别 MFA 训练好的对齐结果、模型和字典(`syllable.lexicon` 在 `data/dict` 中,`phone.lexicon` 和 `word.lexicon` 运行数据预处理代码后会自动从原始数据集复制或生成) + +**phone 级别:** [phone.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/phone/phone.lexicon)、[对齐结果](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/phone/thchs30_alignment.tar.gz)、[模型](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/phone/thchs30_model.zip) +**syllable 级别:** [syllable.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/syllable.lexicon)、[对齐结果](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/thchs30_alignment.tar.gz)、[模型](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/thchs30_model.zip) +**word 级别:** [word.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/word.lexicon)、[对齐结果](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/thchs30_alignment.tar.gz)、[模型](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/thchs30_model.zip) + +随后,您可以参考 [MFA 官方文档 Align using pretrained models](https://montreal-forced-aligner.readthedocs.io/en/stable/aligning.html#align-using-pretrained-models) 使用我们给您提供好的模型直接对自己的数据集进行强制对齐,注意,您需要使用和模型对应的 lexicon 文件,当文本是汉字时,您需要用空格把不同的**汉字**(而不是词语)分开 diff --git a/examples/thchs30/a0/data/dict/syllable.lexicon b/examples/thchs30/a0/data/dict/syllable.lexicon new file mode 100644 index 000000000..e1da4e04c --- /dev/null +++ b/examples/thchs30/a0/data/dict/syllable.lexicon @@ -0,0 +1,2490 @@ +A0 aa a0 +A1 aa a1 +A2 aa a2 +A3 aa a3 +A4 aa a4 +AI0 aa ai0 +AI1 aa ai1 +AI2 aa ai2 +AI3 aa ai3 +AI4 aa ai4 +AN0 aa an0 +AN1 aa an1 +AN2 aa an2 +AN3 aa an3 +AN4 aa an4 +ANG0 aa ang0 +ANG1 aa ang1 +ANG2 aa ang2 +ANG3 aa ang3 +ANG4 aa ang4 +AO0 aa ao0 +AO1 aa ao1 +AO2 aa ao2 +AO3 aa ao3 +AO4 aa ao4 +BA0 b a0 +BA1 b a1 +BA2 b a2 +BA3 b a3 +BA4 b a4 +BAI0 b ai0 +BAI1 b ai1 +BAI2 b ai2 +BAI3 b ai3 +BAI4 b ai4 +BAN0 b an0 +BAN1 b an1 +BAN2 b 
an2 +BAN3 b an3 +BAN4 b an4 +BANG0 b ang0 +BANG1 b ang1 +BANG2 b ang2 +BANG3 b ang3 +BANG4 b ang4 +BAO0 b ao0 +BAO1 b ao1 +BAO2 b ao2 +BAO3 b ao3 +BAO4 b ao4 +BEI0 b ei0 +BEI1 b ei1 +BEI2 b ei2 +BEI3 b ei3 +BEI4 b ei4 +BEN0 b en0 +BEN1 b en1 +BEN2 b en2 +BEN3 b en3 +BEN4 b en4 +BENG0 b eng0 +BENG1 b eng1 +BENG2 b eng2 +BENG3 b eng3 +BENG4 b eng4 +BI0 b i0 +BI1 b i1 +BI2 b i2 +BI3 b i3 +BI4 b i4 +BIAN0 b ian0 +BIAN1 b ian1 +BIAN2 b ian2 +BIAN3 b ian3 +BIAN4 b ian4 +BIAO0 b iao0 +BIAO1 b iao1 +BIAO2 b iao2 +BIAO3 b iao3 +BIAO4 b iao4 +BIE0 b ie0 +BIE1 b ie1 +BIE2 b ie2 +BIE3 b ie3 +BIE4 b ie4 +BIN0 b in0 +BIN1 b in1 +BIN2 b in2 +BIN3 b in3 +BIN4 b in4 +BING0 b ing0 +BING1 b ing1 +BING2 b ing2 +BING3 b ing3 +BING4 b ing4 +BO0 b o0 +BO1 b o1 +BO2 b o2 +BO3 b o3 +BO4 b o4 +BU0 b u0 +BU1 b u1 +BU2 b u2 +BU3 b u3 +BU4 b u4 +CA0 c a0 +CA1 c a1 +CA2 c a2 +CA3 c a3 +CA4 c a4 +CAI0 c ai0 +CAI1 c ai1 +CAI2 c ai2 +CAI3 c ai3 +CAI4 c ai4 +CAN0 c an0 +CAN1 c an1 +CAN2 c an2 +CAN3 c an3 +CAN4 c an4 +CANG0 c ang0 +CANG1 c ang1 +CANG2 c ang2 +CANG3 c ang3 +CANG4 c ang4 +CAO0 c ao0 +CAO1 c ao1 +CAO2 c ao2 +CAO3 c ao3 +CAO4 c ao4 +CE0 c e0 +CE1 c e1 +CE2 c e2 +CE3 c e3 +CE4 c e4 +CEN0 c en0 +CEN1 c en1 +CEN2 c en2 +CEN3 c en3 +CEN4 c en4 +CENG0 c eng0 +CENG1 c eng1 +CENG2 c eng2 +CENG3 c eng3 +CENG4 c eng4 +CHA0 ch a0 +CHA1 ch a1 +CHA2 ch a2 +CHA3 ch a3 +CHA4 ch a4 +CHAI0 ch ai0 +CHAI1 ch ai1 +CHAI2 ch ai2 +CHAI3 ch ai3 +CHAI4 ch ai4 +CHAN0 ch an0 +CHAN1 ch an1 +CHAN2 ch an2 +CHAN3 ch an3 +CHAN4 ch an4 +CHANG0 ch ang0 +CHANG1 ch ang1 +CHANG2 ch ang2 +CHANG3 ch ang3 +CHANG4 ch ang4 +CHAO0 ch ao0 +CHAO1 ch ao1 +CHAO2 ch ao2 +CHAO3 ch ao3 +CHAO4 ch ao4 +CHE0 ch e0 +CHE1 ch e1 +CHE2 ch e2 +CHE3 ch e3 +CHE4 ch e4 +CHEN0 ch en0 +CHEN1 ch en1 +CHEN2 ch en2 +CHEN3 ch en3 +CHEN4 ch en4 +CHENG0 ch eng0 +CHENG1 ch eng1 +CHENG2 ch eng2 +CHENG3 ch eng3 +CHENG4 ch eng4 +CHI0 ch ix0 +CHI1 ch ix1 +CHI2 ch ix2 +CHI3 ch ix3 +CHI4 ch ix4 +CHONG0 ch ong0 +CHONG1 ch ong1 +CHONG2 ch ong2 +CHONG3 ch ong3 
+CHONG4 ch ong4 +CHOU0 ch ou0 +CHOU1 ch ou1 +CHOU2 ch ou2 +CHOU3 ch ou3 +CHOU4 ch ou4 +CHU0 ch u0 +CHU1 ch u1 +CHU2 ch u2 +CHU3 ch u3 +CHU4 ch u4 +CHUAI0 ch uai0 +CHUAI1 ch uai1 +CHUAI2 ch uai2 +CHUAI3 ch uai3 +CHUAI4 ch uai4 +CHUAN0 ch uan0 +CHUAN1 ch uan1 +CHUAN2 ch uan2 +CHUAN3 ch uan3 +CHUAN4 ch uan4 +CHUANG0 ch uang0 +CHUANG1 ch uang1 +CHUANG2 ch uang2 +CHUANG3 ch uang3 +CHUANG4 ch uang4 +CHUI0 ch ui0 +CHUI1 ch ui1 +CHUI2 ch ui2 +CHUI3 ch ui3 +CHUI4 ch ui4 +CHUN0 ch un0 +CHUN1 ch un1 +CHUN2 ch un2 +CHUN3 ch un3 +CHUN4 ch un4 +CHUO0 ch uo0 +CHUO1 ch uo1 +CHUO2 ch uo2 +CHUO3 ch uo3 +CHUO4 ch uo4 +CI0 c iy0 +CI1 c iy1 +CI2 c iy2 +CI3 c iy3 +CI4 c iy4 +CONG0 c ong0 +CONG1 c ong1 +CONG2 c ong2 +CONG3 c ong3 +CONG4 c ong4 +COU0 c ou0 +COU1 c ou1 +COU2 c ou2 +COU3 c ou3 +COU4 c ou4 +CU0 c u0 +CU1 c u1 +CU2 c u2 +CU3 c u3 +CU4 c u4 +CUAN0 c uan0 +CUAN1 c uan1 +CUAN2 c uan2 +CUAN3 c uan3 +CUAN4 c uan4 +CUI0 c ui0 +CUI1 c ui1 +CUI2 c ui2 +CUI3 c ui3 +CUI4 c ui4 +CUN0 c un0 +CUN1 c un1 +CUN2 c un2 +CUN3 c un3 +CUN4 c un4 +CUO0 c uo0 +CUO1 c uo1 +CUO2 c uo2 +CUO3 c uo3 +CUO4 c uo4 +DA0 d a0 +DA1 d a1 +DA2 d a2 +DA3 d a3 +DA4 d a4 +DAI0 d ai0 +DAI1 d ai1 +DAI2 d ai2 +DAI3 d ai3 +DAI4 d ai4 +DAN0 d an0 +DAN1 d an1 +DAN2 d an2 +DAN3 d an3 +DAN4 d an4 +DANG0 d ang0 +DANG1 d ang1 +DANG2 d ang2 +DANG3 d ang3 +DANG4 d ang4 +DAO0 d ao0 +DAO1 d ao1 +DAO2 d ao2 +DAO3 d ao3 +DAO4 d ao4 +DE0 d e0 +DE1 d e1 +DE2 d e2 +DE3 d e3 +DE4 d e4 +DEI0 d ei0 +DEI1 d ei1 +DEI2 d ei2 +DEI3 d ei3 +DEI4 d ei4 +DEN0 d en0 +DEN1 d en1 +DEN2 d en2 +DEN3 d en3 +DEN4 d en4 +DENG0 d eng0 +DENG1 d eng1 +DENG2 d eng2 +DENG3 d eng3 +DENG4 d eng4 +DI0 d i0 +DI1 d i1 +DI2 d i2 +DI3 d i3 +DI4 d i4 +DIA0 d ia0 +DIA1 d ia1 +DIA2 d ia2 +DIA3 d ia3 +DIA4 d ia4 +DIAN0 d ian0 +DIAN1 d ian1 +DIAN2 d ian2 +DIAN3 d ian3 +DIAN4 d ian4 +DIAO0 d iao0 +DIAO1 d iao1 +DIAO2 d iao2 +DIAO3 d iao3 +DIAO4 d iao4 +DIE0 d ie0 +DIE1 d ie1 +DIE2 d ie2 +DIE3 d ie3 +DIE4 d ie4 +DING0 d ing0 +DING1 d ing1 +DING2 d ing2 +DING3 d ing3 
+DING4 d ing4 +DIU0 d iu0 +DIU1 d iu1 +DIU2 d iu2 +DIU3 d iu3 +DIU4 d iu4 +DONG0 d ong0 +DONG1 d ong1 +DONG2 d ong2 +DONG3 d ong3 +DONG4 d ong4 +DOU0 d ou0 +DOU1 d ou1 +DOU2 d ou2 +DOU3 d ou3 +DOU4 d ou4 +DU0 d u0 +DU1 d u1 +DU2 d u2 +DU3 d u3 +DU4 d u4 +DUAN0 d uan0 +DUAN1 d uan1 +DUAN2 d uan2 +DUAN3 d uan3 +DUAN4 d uan4 +DUI0 d ui0 +DUI1 d ui1 +DUI2 d ui2 +DUI3 d ui3 +DUI4 d ui4 +DUN0 d un0 +DUN1 d un1 +DUN2 d un2 +DUN3 d un3 +DUN4 d un4 +DUO0 d uo0 +DUO1 d uo1 +DUO2 d uo2 +DUO3 d uo3 +DUO4 d uo4 +E0 ee e0 +E1 ee e1 +E2 ee e2 +E3 ee e3 +E4 ee e4 +EN0 ee en0 +EN1 ee en1 +EN2 ee en2 +EN3 ee en3 +EN4 ee en4 +ER0 ee er0 +ER1 ee er1 +ER2 ee er2 +ER3 ee er3 +ER4 ee er4 +FA0 f a0 +FA1 f a1 +FA2 f a2 +FA3 f a3 +FA4 f a4 +FAN0 f an0 +FAN1 f an1 +FAN2 f an2 +FAN3 f an3 +FAN4 f an4 +FANG0 f ang0 +FANG1 f ang1 +FANG2 f ang2 +FANG3 f ang3 +FANG4 f ang4 +FEI0 f ei0 +FEI1 f ei1 +FEI2 f ei2 +FEI3 f ei3 +FEI4 f ei4 +FEN0 f en0 +FEN1 f en1 +FEN2 f en2 +FEN3 f en3 +FEN4 f en4 +FENG0 f eng0 +FENG1 f eng1 +FENG2 f eng2 +FENG3 f eng3 +FENG4 f eng4 +FO0 f o0 +FO1 f o1 +FO2 f o2 +FO3 f o3 +FO4 f o4 +FOU0 f ou0 +FOU1 f ou1 +FOU2 f ou2 +FOU3 f ou3 +FOU4 f ou4 +FU0 f u0 +FU1 f u1 +FU2 f u2 +FU3 f u3 +FU4 f u4 +GA0 g a0 +GA1 g a1 +GA2 g a2 +GA3 g a3 +GA4 g a4 +GAI0 g ai0 +GAI1 g ai1 +GAI2 g ai2 +GAI3 g ai3 +GAI4 g ai4 +GAN0 g an0 +GAN1 g an1 +GAN2 g an2 +GAN3 g an3 +GAN4 g an4 +GANG0 g ang0 +GANG1 g ang1 +GANG2 g ang2 +GANG3 g ang3 +GANG4 g ang4 +GAO0 g ao0 +GAO1 g ao1 +GAO2 g ao2 +GAO3 g ao3 +GAO4 g ao4 +GE0 g e0 +GE1 g e1 +GE2 g e2 +GE3 g e3 +GE4 g e4 +GEI0 g ei0 +GEI1 g ei1 +GEI2 g ei2 +GEI3 g ei3 +GEI4 g ei4 +GEN0 g en0 +GEN1 g en1 +GEN2 g en2 +GEN3 g en3 +GEN4 g en4 +GENG0 g eng0 +GENG1 g eng1 +GENG2 g eng2 +GENG3 g eng3 +GENG4 g eng4 +GONG0 g ong0 +GONG1 g ong1 +GONG2 g ong2 +GONG3 g ong3 +GONG4 g ong4 +GOU0 g ou0 +GOU1 g ou1 +GOU2 g ou2 +GOU3 g ou3 +GOU4 g ou4 +GU0 g u0 +GU1 g u1 +GU2 g u2 +GU3 g u3 +GU4 g u4 +GUA0 g ua0 +GUA1 g ua1 +GUA2 g ua2 +GUA3 g ua3 +GUA4 g ua4 +GUAI0 g uai0 
+GUAI1 g uai1 +GUAI2 g uai2 +GUAI3 g uai3 +GUAI4 g uai4 +GUAN0 g uan0 +GUAN1 g uan1 +GUAN2 g uan2 +GUAN3 g uan3 +GUAN4 g uan4 +GUANG0 g uang0 +GUANG1 g uang1 +GUANG2 g uang2 +GUANG3 g uang3 +GUANG4 g uang4 +GUI0 g ui0 +GUI1 g ui1 +GUI2 g ui2 +GUI3 g ui3 +GUI4 g ui4 +GUN0 g un0 +GUN1 g un1 +GUN2 g un2 +GUN3 g un3 +GUN4 g un4 +GUO0 g uo0 +GUO1 g uo1 +GUO2 g uo2 +GUO3 g uo3 +GUO4 g uo4 +HA0 h a0 +HA1 h a1 +HA2 h a2 +HA3 h a3 +HA4 h a4 +HAI0 h ai0 +HAI1 h ai1 +HAI2 h ai2 +HAI3 h ai3 +HAI4 h ai4 +HAN0 h an0 +HAN1 h an1 +HAN2 h an2 +HAN3 h an3 +HAN4 h an4 +HANG0 h ang0 +HANG1 h ang1 +HANG2 h ang2 +HANG3 h ang3 +HANG4 h ang4 +HAO0 h ao0 +HAO1 h ao1 +HAO2 h ao2 +HAO3 h ao3 +HAO4 h ao4 +HE0 h e0 +HE1 h e1 +HE2 h e2 +HE3 h e3 +HE4 h e4 +HEI0 h ei0 +HEI1 h ei1 +HEI2 h ei2 +HEI3 h ei3 +HEI4 h ei4 +HEN0 h en0 +HEN1 h en1 +HEN2 h en2 +HEN3 h en3 +HEN4 h en4 +HENG0 h eng0 +HENG1 h eng1 +HENG2 h eng2 +HENG3 h eng3 +HENG4 h eng4 +HONG0 h ong0 +HONG1 h ong1 +HONG2 h ong2 +HONG3 h ong3 +HONG4 h ong4 +HOU0 h ou0 +HOU1 h ou1 +HOU2 h ou2 +HOU3 h ou3 +HOU4 h ou4 +HU0 h u0 +HU1 h u1 +HU2 h u2 +HU3 h u3 +HU4 h u4 +HUA0 h ua0 +HUA1 h ua1 +HUA2 h ua2 +HUA3 h ua3 +HUA4 h ua4 +HUAI0 h uai0 +HUAI1 h uai1 +HUAI2 h uai2 +HUAI3 h uai3 +HUAI4 h uai4 +HUAN0 h uan0 +HUAN1 h uan1 +HUAN2 h uan2 +HUAN3 h uan3 +HUAN4 h uan4 +HUANG0 h uang0 +HUANG1 h uang1 +HUANG2 h uang2 +HUANG3 h uang3 +HUANG4 h uang4 +HUI0 h ui0 +HUI1 h ui1 +HUI2 h ui2 +HUI3 h ui3 +HUI4 h ui4 +HUN0 h un0 +HUN1 h un1 +HUN2 h un2 +HUN3 h un3 +HUN4 h un4 +HUO0 h uo0 +HUO1 h uo1 +HUO2 h uo2 +HUO3 h uo3 +HUO4 h uo4 +JI0 j i0 +JI1 j i1 +JI2 j i2 +JI3 j i3 +JI4 j i4 +JIA0 j ia0 +JIA1 j ia1 +JIA2 j ia2 +JIA3 j ia3 +JIA4 j ia4 +JIAN0 j ian0 +JIAN1 j ian1 +JIAN2 j ian2 +JIAN3 j ian3 +JIAN4 j ian4 +JIANG0 j iang0 +JIANG1 j iang1 +JIANG2 j iang2 +JIANG3 j iang3 +JIANG4 j iang4 +JIAO0 j iao0 +JIAO1 j iao1 +JIAO2 j iao2 +JIAO3 j iao3 +JIAO4 j iao4 +JIE0 j ie0 +JIE1 j ie1 +JIE2 j ie2 +JIE3 j ie3 +JIE4 j ie4 +JIN0 j in0 +JIN1 j in1 +JIN2 j in2 +JIN3 j 
in3 +JIN4 j in4 +JING0 j ing0 +JING1 j ing1 +JING2 j ing2 +JING3 j ing3 +JING4 j ing4 +JIONG0 j iong0 +JIONG1 j iong1 +JIONG2 j iong2 +JIONG3 j iong3 +JIONG4 j iong4 +JIU0 j iu0 +JIU1 j iu1 +JIU2 j iu2 +JIU3 j iu3 +JIU4 j iu4 +JU0 j v0 +JU1 j v1 +JU2 j v2 +JU3 j v3 +JU4 j v4 +JUAN0 j van0 +JUAN1 j van1 +JUAN2 j van2 +JUAN3 j van3 +JUAN4 j van4 +JUE0 j ve0 +JUE1 j ve1 +JUE2 j ve2 +JUE3 j ve3 +JUE4 j ve4 +JUN0 j vn0 +JUN1 j vn1 +JUN2 j vn2 +JUN3 j vn3 +JUN4 j vn4 +KA0 k a0 +KA1 k a1 +KA2 k a2 +KA3 k a3 +KA4 k a4 +KAI0 k ai0 +KAI1 k ai1 +KAI2 k ai2 +KAI3 k ai3 +KAI4 k ai4 +KAN0 k an0 +KAN1 k an1 +KAN2 k an2 +KAN3 k an3 +KAN4 k an4 +KANG0 k ang0 +KANG1 k ang1 +KANG2 k ang2 +KANG3 k ang3 +KANG4 k ang4 +KAO0 k ao0 +KAO1 k ao1 +KAO2 k ao2 +KAO3 k ao3 +KAO4 k ao4 +KE0 k e0 +KE1 k e1 +KE2 k e2 +KE3 k e3 +KE4 k e4 +KEI0 k ei0 +KEI1 k ei1 +KEI2 k ei2 +KEI3 k ei3 +KEI4 k ei4 +KEN0 k en0 +KEN1 k en1 +KEN2 k en2 +KEN3 k en3 +KEN4 k en4 +KENG0 k eng0 +KENG1 k eng1 +KENG2 k eng2 +KENG3 k eng3 +KENG4 k eng4 +KONG0 k ong0 +KONG1 k ong1 +KONG2 k ong2 +KONG3 k ong3 +KONG4 k ong4 +KOU0 k ou0 +KOU1 k ou1 +KOU2 k ou2 +KOU3 k ou3 +KOU4 k ou4 +KU0 k u0 +KU1 k u1 +KU2 k u2 +KU3 k u3 +KU4 k u4 +KUA0 k ua0 +KUA1 k ua1 +KUA2 k ua2 +KUA3 k ua3 +KUA4 k ua4 +KUAI0 k uai0 +KUAI1 k uai1 +KUAI2 k uai2 +KUAI3 k uai3 +KUAI4 k uai4 +KUAN0 k uan0 +KUAN1 k uan1 +KUAN2 k uan2 +KUAN3 k uan3 +KUAN4 k uan4 +KUANG0 k uang0 +KUANG1 k uang1 +KUANG2 k uang2 +KUANG3 k uang3 +KUANG4 k uang4 +KUI0 k ui0 +KUI1 k ui1 +KUI2 k ui2 +KUI3 k ui3 +KUI4 k ui4 +KUN0 k un0 +KUN1 k un1 +KUN2 k un2 +KUN3 k un3 +KUN4 k un4 +KUO0 k uo0 +KUO1 k uo1 +KUO2 k uo2 +KUO3 k uo3 +KUO4 k uo4 +LA0 l a0 +LA1 l a1 +LA2 l a2 +LA3 l a3 +LA4 l a4 +LAI0 l ai0 +LAI1 l ai1 +LAI2 l ai2 +LAI3 l ai3 +LAI4 l ai4 +LAN0 l an0 +LAN1 l an1 +LAN2 l an2 +LAN3 l an3 +LAN4 l an4 +LANG0 l ang0 +LANG1 l ang1 +LANG2 l ang2 +LANG3 l ang3 +LANG4 l ang4 +LAO0 l ao0 +LAO1 l ao1 +LAO2 l ao2 +LAO3 l ao3 +LAO4 l ao4 +LE0 l e0 +LE1 l e1 +LE2 l e2 +LE3 l e3 +LE4 l e4 
+LEI0 l ei0 +LEI1 l ei1 +LEI2 l ei2 +LEI3 l ei3 +LEI4 l ei4 +LENG0 l eng0 +LENG1 l eng1 +LENG2 l eng2 +LENG3 l eng3 +LENG4 l eng4 +LI0 l i0 +LI1 l i1 +LI2 l i2 +LI3 l i3 +LI4 l i4 +LIA0 l ia0 +LIA1 l ia1 +LIA2 l ia2 +LIA3 l ia3 +LIA4 l ia4 +LIAN0 l ian0 +LIAN1 l ian1 +LIAN2 l ian2 +LIAN3 l ian3 +LIAN4 l ian4 +LIANG0 l iang0 +LIANG1 l iang1 +LIANG2 l iang2 +LIANG3 l iang3 +LIANG4 l iang4 +LIAO0 l iao0 +LIAO1 l iao1 +LIAO2 l iao2 +LIAO3 l iao3 +LIAO4 l iao4 +LIE0 l ie0 +LIE1 l ie1 +LIE2 l ie2 +LIE3 l ie3 +LIE4 l ie4 +LIN0 l in0 +LIN1 l in1 +LIN2 l in2 +LIN3 l in3 +LIN4 l in4 +LING0 l ing0 +LING1 l ing1 +LING2 l ing2 +LING3 l ing3 +LING4 l ing4 +LIU0 l iu0 +LIU1 l iu1 +LIU2 l iu2 +LIU3 l iu3 +LIU4 l iu4 +LONG0 l ong0 +LONG1 l ong1 +LONG2 l ong2 +LONG3 l ong3 +LONG4 l ong4 +LOU0 l ou0 +LOU1 l ou1 +LOU2 l ou2 +LOU3 l ou3 +LOU4 l ou4 +LU0 l u0 +LU1 l u1 +LU2 l u2 +LU3 l u3 +LU4 l u4 +LUAN0 l uan0 +LUAN1 l uan1 +LUAN2 l uan2 +LUAN3 l uan3 +LUAN4 l uan4 +LUE0 l ve0 +LUE1 l ve1 +LUE2 l ve2 +LUE3 l ve3 +LUE4 l ve4 +LVE0 l ve0 +LVE1 l ve1 +LVE2 l ve2 +LVE3 l ve3 +LVE4 l ve4 +LUN0 l un0 +LUN1 l un1 +LUN2 l un2 +LUN3 l un3 +LUN4 l un4 +LUO0 l uo0 +LUO1 l uo1 +LUO2 l uo2 +LUO3 l uo3 +LUO4 l uo4 +LV0 l v0 +LV1 l v1 +LV2 l v2 +LV3 l v3 +LV4 l v4 +MA0 m a0 +MA1 m a1 +MA2 m a2 +MA3 m a3 +MA4 m a4 +MAI0 m ai0 +MAI1 m ai1 +MAI2 m ai2 +MAI3 m ai3 +MAI4 m ai4 +MAN0 m an0 +MAN1 m an1 +MAN2 m an2 +MAN3 m an3 +MAN4 m an4 +MANG0 m ang0 +MANG1 m ang1 +MANG2 m ang2 +MANG3 m ang3 +MANG4 m ang4 +MAO0 m ao0 +MAO1 m ao1 +MAO2 m ao2 +MAO3 m ao3 +MAO4 m ao4 +ME0 m e0 +ME1 m e1 +ME2 m e2 +ME3 m e3 +ME4 m e4 +MEI0 m ei0 +MEI1 m ei1 +MEI2 m ei2 +MEI3 m ei3 +MEI4 m ei4 +MEN0 m en0 +MEN1 m en1 +MEN2 m en2 +MEN3 m en3 +MEN4 m en4 +MENG0 m eng0 +MENG1 m eng1 +MENG2 m eng2 +MENG3 m eng3 +MENG4 m eng4 +MI0 m i0 +MI1 m i1 +MI2 m i2 +MI3 m i3 +MI4 m i4 +MIAN0 m ian0 +MIAN1 m ian1 +MIAN2 m ian2 +MIAN3 m ian3 +MIAN4 m ian4 +MIAO0 m iao0 +MIAO1 m iao1 +MIAO2 m iao2 +MIAO3 m iao3 +MIAO4 m iao4 +MIE0 m ie0 +MIE1 m 
ie1 +MIE2 m ie2 +MIE3 m ie3 +MIE4 m ie4 +MIN0 m in0 +MIN1 m in1 +MIN2 m in2 +MIN3 m in3 +MIN4 m in4 +MING0 m ing0 +MING1 m ing1 +MING2 m ing2 +MING3 m ing3 +MING4 m ing4 +MIU0 m iu0 +MIU1 m iu1 +MIU2 m iu2 +MIU3 m iu3 +MIU4 m iu4 +MO0 m o0 +MO1 m o1 +MO2 m o2 +MO3 m o3 +MO4 m o4 +MOU0 m ou0 +MOU1 m ou1 +MOU2 m ou2 +MOU3 m ou3 +MOU4 m ou4 +MU0 m u0 +MU1 m u1 +MU2 m u2 +MU3 m u3 +MU4 m u4 +NA0 n a0 +NA1 n a1 +NA2 n a2 +NA3 n a3 +NA4 n a4 +NAI0 n ai0 +NAI1 n ai1 +NAI2 n ai2 +NAI3 n ai3 +NAI4 n ai4 +NAN0 n an0 +NAN1 n an1 +NAN2 n an2 +NAN3 n an3 +NAN4 n an4 +NANG0 n ang0 +NANG1 n ang1 +NANG2 n ang2 +NANG3 n ang3 +NANG4 n ang4 +NAO0 n ao0 +NAO1 n ao1 +NAO2 n ao2 +NAO3 n ao3 +NAO4 n ao4 +NE0 n e0 +NE1 n e1 +NE2 n e2 +NE3 n e3 +NE4 n e4 +NEI0 n ei0 +NEI1 n ei1 +NEI2 n ei2 +NEI3 n ei3 +NEI4 n ei4 +NEN0 n en0 +NEN1 n en1 +NEN2 n en2 +NEN3 n en3 +NEN4 n en4 +NENG0 n eng0 +NENG1 n eng1 +NENG2 n eng2 +NENG3 n eng3 +NENG4 n eng4 +NI0 n i0 +NI1 n i1 +NI2 n i2 +NI3 n i3 +NI4 n i4 +NIAN0 n ian0 +NIAN1 n ian1 +NIAN2 n ian2 +NIAN3 n ian3 +NIAN4 n ian4 +NIANG0 n iang0 +NIANG1 n iang1 +NIANG2 n iang2 +NIANG3 n iang3 +NIANG4 n iang4 +NIAO0 n iao0 +NIAO1 n iao1 +NIAO2 n iao2 +NIAO3 n iao3 +NIAO4 n iao4 +NIE0 n ie0 +NIE1 n ie1 +NIE2 n ie2 +NIE3 n ie3 +NIE4 n ie4 +NIN0 n in0 +NIN1 n in1 +NIN2 n in2 +NIN3 n in3 +NIN4 n in4 +NING0 n ing0 +NING1 n ing1 +NING2 n ing2 +NING3 n ing3 +NING4 n ing4 +NIU0 n iu0 +NIU1 n iu1 +NIU2 n iu2 +NIU3 n iu3 +NIU4 n iu4 +NONG0 n ong0 +NONG1 n ong1 +NONG2 n ong2 +NONG3 n ong3 +NONG4 n ong4 +NU0 n u0 +NU1 n u1 +NU2 n u2 +NU3 n u3 +NU4 n u4 +NUAN0 n uan0 +NUAN1 n uan1 +NUAN2 n uan2 +NUAN3 n uan3 +NUAN4 n uan4 +NUE0 n ve0 +NUE1 n ve1 +NUE2 n ve2 +NUE3 n ve3 +NUE4 n ve4 +NVE0 n ve0 +NVE1 n ve1 +NVE2 n ve2 +NVE3 n ve3 +NVE4 n ve4 +NUO0 n uo0 +NUO1 n uo1 +NUO2 n uo2 +NUO3 n uo3 +NUO4 n uo4 +NV0 n v0 +NV1 n v1 +NV2 n v2 +NV3 n v3 +NV4 n v4 +O0 oo o0 +O1 oo o1 +O2 oo o2 +O3 oo o3 +O4 oo o4 +OU0 oo ou0 +OU1 oo ou1 +OU2 oo ou2 +OU3 oo ou3 +OU4 oo ou4 +PA0 p a0 +PA1 p a1 
+PA2 p a2 +PA3 p a3 +PA4 p a4 +PAI0 p ai0 +PAI1 p ai1 +PAI2 p ai2 +PAI3 p ai3 +PAI4 p ai4 +PAN0 p an0 +PAN1 p an1 +PAN2 p an2 +PAN3 p an3 +PAN4 p an4 +PANG0 p ang0 +PANG1 p ang1 +PANG2 p ang2 +PANG3 p ang3 +PANG4 p ang4 +PAO0 p ao0 +PAO1 p ao1 +PAO2 p ao2 +PAO3 p ao3 +PAO4 p ao4 +PEI0 p ei0 +PEI1 p ei1 +PEI2 p ei2 +PEI3 p ei3 +PEI4 p ei4 +PEN0 p en0 +PEN1 p en1 +PEN2 p en2 +PEN3 p en3 +PEN4 p en4 +PENG0 p eng0 +PENG1 p eng1 +PENG2 p eng2 +PENG3 p eng3 +PENG4 p eng4 +PI0 p i0 +PI1 p i1 +PI2 p i2 +PI3 p i3 +PI4 p i4 +PIAN0 p ian0 +PIAN1 p ian1 +PIAN2 p ian2 +PIAN3 p ian3 +PIAN4 p ian4 +PIAO0 p iao0 +PIAO1 p iao1 +PIAO2 p iao2 +PIAO3 p iao3 +PIAO4 p iao4 +PIE0 p ie0 +PIE1 p ie1 +PIE2 p ie2 +PIE3 p ie3 +PIE4 p ie4 +PIN0 p in0 +PIN1 p in1 +PIN2 p in2 +PIN3 p in3 +PIN4 p in4 +PING0 p ing0 +PING1 p ing1 +PING2 p ing2 +PING3 p ing3 +PING4 p ing4 +PO0 p o0 +PO1 p o1 +PO2 p o2 +PO3 p o3 +PO4 p o4 +POU0 p ou0 +POU1 p ou1 +POU2 p ou2 +POU3 p ou3 +POU4 p ou4 +PU0 p u0 +PU1 p u1 +PU2 p u2 +PU3 p u3 +PU4 p u4 +QI0 q i0 +QI1 q i1 +QI2 q i2 +QI3 q i3 +QI4 q i4 +QIA0 q ia0 +QIA1 q ia1 +QIA2 q ia2 +QIA3 q ia3 +QIA4 q ia4 +QIAN0 q ian0 +QIAN1 q ian1 +QIAN2 q ian2 +QIAN3 q ian3 +QIAN4 q ian4 +QIANG0 q iang0 +QIANG1 q iang1 +QIANG2 q iang2 +QIANG3 q iang3 +QIANG4 q iang4 +QIAO0 q iao0 +QIAO1 q iao1 +QIAO2 q iao2 +QIAO3 q iao3 +QIAO4 q iao4 +QIE0 q ie0 +QIE1 q ie1 +QIE2 q ie2 +QIE3 q ie3 +QIE4 q ie4 +QIN0 q in0 +QIN1 q in1 +QIN2 q in2 +QIN3 q in3 +QIN4 q in4 +QING0 q ing0 +QING1 q ing1 +QING2 q ing2 +QING3 q ing3 +QING4 q ing4 +QIONG0 q iong0 +QIONG1 q iong1 +QIONG2 q iong2 +QIONG3 q iong3 +QIONG4 q iong4 +QIU0 q iu0 +QIU1 q iu1 +QIU2 q iu2 +QIU3 q iu3 +QIU4 q iu4 +QU0 q v0 +QU1 q v1 +QU2 q v2 +QU3 q v3 +QU4 q v4 +QUAN0 q van0 +QUAN1 q van1 +QUAN2 q van2 +QUAN3 q van3 +QUAN4 q van4 +QUE0 q ve0 +QUE1 q ve1 +QUE2 q ve2 +QUE3 q ve3 +QUE4 q ve4 +QUN0 q vn0 +QUN1 q vn1 +QUN2 q vn2 +QUN3 q vn3 +QUN4 q vn4 +RAN0 r an0 +RAN1 r an1 +RAN2 r an2 +RAN3 r an3 +RAN4 r an4 +RANG0 r ang0 +RANG1 r ang1 
+RANG2 r ang2 +RANG3 r ang3 +RANG4 r ang4 +RAO0 r ao0 +RAO1 r ao1 +RAO2 r ao2 +RAO3 r ao3 +RAO4 r ao4 +RE0 r e0 +RE1 r e1 +RE2 r e2 +RE3 r e3 +RE4 r e4 +REN0 r en0 +REN1 r en1 +REN2 r en2 +REN3 r en3 +REN4 r en4 +RENG0 r eng0 +RENG1 r eng1 +RENG2 r eng2 +RENG3 r eng3 +RENG4 r eng4 +RI0 r iz0 +RI1 r iz1 +RI2 r iz2 +RI3 r iz3 +RI4 r iz4 +RONG0 r ong0 +RONG1 r ong1 +RONG2 r ong2 +RONG3 r ong3 +RONG4 r ong4 +ROU0 r ou0 +ROU1 r ou1 +ROU2 r ou2 +ROU3 r ou3 +ROU4 r ou4 +RU0 r u0 +RU1 r u1 +RU2 r u2 +RU3 r u3 +RU4 r u4 +RUAN0 r uan0 +RUAN1 r uan1 +RUAN2 r uan2 +RUAN3 r uan3 +RUAN4 r uan4 +RUI0 r ui0 +RUI1 r ui1 +RUI2 r ui2 +RUI3 r ui3 +RUI4 r ui4 +RUN0 r un0 +RUN1 r un1 +RUN2 r un2 +RUN3 r un3 +RUN4 r un4 +RUO0 r uo0 +RUO1 r uo1 +RUO2 r uo2 +RUO3 r uo3 +RUO4 r uo4 +SA0 s a0 +SA1 s a1 +SA2 s a2 +SA3 s a3 +SA4 s a4 +SAI0 s ai0 +SAI1 s ai1 +SAI2 s ai2 +SAI3 s ai3 +SAI4 s ai4 +SAN0 s an0 +SAN1 s an1 +SAN2 s an2 +SAN3 s an3 +SAN4 s an4 +SANG0 s ang0 +SANG1 s ang1 +SANG2 s ang2 +SANG3 s ang3 +SANG4 s ang4 +SAO0 s ao0 +SAO1 s ao1 +SAO2 s ao2 +SAO3 s ao3 +SAO4 s ao4 +SE0 s e0 +SE1 s e1 +SE2 s e2 +SE3 s e3 +SE4 s e4 +SEN0 s en0 +SEN1 s en1 +SEN2 s en2 +SEN3 s en3 +SEN4 s en4 +SENG0 s eng0 +SENG1 s eng1 +SENG2 s eng2 +SENG3 s eng3 +SENG4 s eng4 +SHA0 sh a0 +SHA1 sh a1 +SHA2 sh a2 +SHA3 sh a3 +SHA4 sh a4 +SHAI0 sh ai0 +SHAI1 sh ai1 +SHAI2 sh ai2 +SHAI3 sh ai3 +SHAI4 sh ai4 +SHAN0 sh an0 +SHAN1 sh an1 +SHAN2 sh an2 +SHAN3 sh an3 +SHAN4 sh an4 +SHANG0 sh ang0 +SHANG1 sh ang1 +SHANG2 sh ang2 +SHANG3 sh ang3 +SHANG4 sh ang4 +SHAO0 sh ao0 +SHAO1 sh ao1 +SHAO2 sh ao2 +SHAO3 sh ao3 +SHAO4 sh ao4 +SHE0 sh e0 +SHE1 sh e1 +SHE2 sh e2 +SHE3 sh e3 +SHE4 sh e4 +SHEI0 sh ei0 +SHEI1 sh ei1 +SHEI2 sh ei2 +SHEI3 sh ei3 +SHEI4 sh ei4 +SHEN0 sh en0 +SHEN1 sh en1 +SHEN2 sh en2 +SHEN3 sh en3 +SHEN4 sh en4 +SHENG0 sh eng0 +SHENG1 sh eng1 +SHENG2 sh eng2 +SHENG3 sh eng3 +SHENG4 sh eng4 +SHI0 sh ix0 +SHI1 sh ix1 +SHI2 sh ix2 +SHI3 sh ix3 +SHI4 sh ix4 +SHOU0 sh ou0 +SHOU1 sh ou1 +SHOU2 sh ou2 +SHOU3 sh ou3 
+SHOU4 sh ou4 +SHU0 sh u0 +SHU1 sh u1 +SHU2 sh u2 +SHU3 sh u3 +SHU4 sh u4 +SHUA0 sh ua0 +SHUA1 sh ua1 +SHUA2 sh ua2 +SHUA3 sh ua3 +SHUA4 sh ua4 +SHUAI0 sh uai0 +SHUAI1 sh uai1 +SHUAI2 sh uai2 +SHUAI3 sh uai3 +SHUAI4 sh uai4 +SHUAN0 sh uan0 +SHUAN1 sh uan1 +SHUAN2 sh uan2 +SHUAN3 sh uan3 +SHUAN4 sh uan4 +SHUANG0 sh uang0 +SHUANG1 sh uang1 +SHUANG2 sh uang2 +SHUANG3 sh uang3 +SHUANG4 sh uang4 +SHUI0 sh ui0 +SHUI1 sh ui1 +SHUI2 sh ui2 +SHUI3 sh ui3 +SHUI4 sh ui4 +SHUN0 sh un0 +SHUN1 sh un1 +SHUN2 sh un2 +SHUN3 sh un3 +SHUN4 sh un4 +SHUO0 sh uo0 +SHUO1 sh uo1 +SHUO2 sh uo2 +SHUO3 sh uo3 +SHUO4 sh uo4 +SI0 s iy0 +SI1 s iy1 +SI2 s iy2 +SI3 s iy3 +SI4 s iy4 +SONG0 s ong0 +SONG1 s ong1 +SONG2 s ong2 +SONG3 s ong3 +SONG4 s ong4 +SOU0 s ou0 +SOU1 s ou1 +SOU2 s ou2 +SOU3 s ou3 +SOU4 s ou4 +SU0 s u0 +SU1 s u1 +SU2 s u2 +SU3 s u3 +SU4 s u4 +SUAN0 s uan0 +SUAN1 s uan1 +SUAN2 s uan2 +SUAN3 s uan3 +SUAN4 s uan4 +SUI0 s ui0 +SUI1 s ui1 +SUI2 s ui2 +SUI3 s ui3 +SUI4 s ui4 +SUN0 s un0 +SUN1 s un1 +SUN2 s un2 +SUN3 s un3 +SUN4 s un4 +SUO0 s uo0 +SUO1 s uo1 +SUO2 s uo2 +SUO3 s uo3 +SUO4 s uo4 +TA0 t a0 +TA1 t a1 +TA2 t a2 +TA3 t a3 +TA4 t a4 +TAI0 t ai0 +TAI1 t ai1 +TAI2 t ai2 +TAI3 t ai3 +TAI4 t ai4 +TAN0 t an0 +TAN1 t an1 +TAN2 t an2 +TAN3 t an3 +TAN4 t an4 +TANG0 t ang0 +TANG1 t ang1 +TANG2 t ang2 +TANG3 t ang3 +TANG4 t ang4 +TAO0 t ao0 +TAO1 t ao1 +TAO2 t ao2 +TAO3 t ao3 +TAO4 t ao4 +TE0 t e0 +TE1 t e1 +TE2 t e2 +TE3 t e3 +TE4 t e4 +TENG0 t eng0 +TENG1 t eng1 +TENG2 t eng2 +TENG3 t eng3 +TENG4 t eng4 +TI0 t i0 +TI1 t i1 +TI2 t i2 +TI3 t i3 +TI4 t i4 +TIAN0 t ian0 +TIAN1 t ian1 +TIAN2 t ian2 +TIAN3 t ian3 +TIAN4 t ian4 +TIAO0 t iao0 +TIAO1 t iao1 +TIAO2 t iao2 +TIAO3 t iao3 +TIAO4 t iao4 +TIE0 t ie0 +TIE1 t ie1 +TIE2 t ie2 +TIE3 t ie3 +TIE4 t ie4 +TING0 t ing0 +TING1 t ing1 +TING2 t ing2 +TING3 t ing3 +TING4 t ing4 +TONG0 t ong0 +TONG1 t ong1 +TONG2 t ong2 +TONG3 t ong3 +TONG4 t ong4 +TOU0 t ou0 +TOU1 t ou1 +TOU2 t ou2 +TOU3 t ou3 +TOU4 t ou4 +TU0 t u0 +TU1 t u1 +TU2 t u2 +TU3 t u3 
+TU4 t u4 +TUAN0 t uan0 +TUAN1 t uan1 +TUAN2 t uan2 +TUAN3 t uan3 +TUAN4 t uan4 +TUI0 t ui0 +TUI1 t ui1 +TUI2 t ui2 +TUI3 t ui3 +TUI4 t ui4 +TUN0 t un0 +TUN1 t un1 +TUN2 t un2 +TUN3 t un3 +TUN4 t un4 +TUO0 t uo0 +TUO1 t uo1 +TUO2 t uo2 +TUO3 t uo3 +TUO4 t uo4 +WA0 uu ua0 +WA1 uu ua1 +WA2 uu ua2 +WA3 uu ua3 +WA4 uu ua4 +WAI0 uu uai0 +WAI1 uu uai1 +WAI2 uu uai2 +WAI3 uu uai3 +WAI4 uu uai4 +WAN0 uu uan0 +WAN1 uu uan1 +WAN2 uu uan2 +WAN3 uu uan3 +WAN4 uu uan4 +WANG0 uu uang0 +WANG1 uu uang1 +WANG2 uu uang2 +WANG3 uu uang3 +WANG4 uu uang4 +WEI0 uu ui0 +WEI1 uu ui1 +WEI2 uu ui2 +WEI3 uu ui3 +WEI4 uu ui4 +WEN0 uu un0 +WEN1 uu un1 +WEN2 uu un2 +WEN3 uu un3 +WEN4 uu un4 +WENG0 uu ueng0 +WENG1 uu ueng1 +WENG2 uu ueng2 +WENG3 uu ueng3 +WENG4 uu ueng4 +WO0 uu uo0 +WO1 uu uo1 +WO2 uu uo2 +WO3 uu uo3 +WO4 uu uo4 +WU0 uu u0 +WU1 uu u1 +WU2 uu u2 +WU3 uu u3 +WU4 uu u4 +XI0 x i0 +XI1 x i1 +XI2 x i2 +XI3 x i3 +XI4 x i4 +XIA0 x ia0 +XIA1 x ia1 +XIA2 x ia2 +XIA3 x ia3 +XIA4 x ia4 +XIAN0 x ian0 +XIAN1 x ian1 +XIAN2 x ian2 +XIAN3 x ian3 +XIAN4 x ian4 +XIANG0 x iang0 +XIANG1 x iang1 +XIANG2 x iang2 +XIANG3 x iang3 +XIANG4 x iang4 +XIAO0 x iao0 +XIAO1 x iao1 +XIAO2 x iao2 +XIAO3 x iao3 +XIAO4 x iao4 +XIE0 x ie0 +XIE1 x ie1 +XIE2 x ie2 +XIE3 x ie3 +XIE4 x ie4 +XIN0 x in0 +XIN1 x in1 +XIN2 x in2 +XIN3 x in3 +XIN4 x in4 +XING0 x ing0 +XING1 x ing1 +XING2 x ing2 +XING3 x ing3 +XING4 x ing4 +XIONG0 x iong0 +XIONG1 x iong1 +XIONG2 x iong2 +XIONG3 x iong3 +XIONG4 x iong4 +XIU0 x iu0 +XIU1 x iu1 +XIU2 x iu2 +XIU3 x iu3 +XIU4 x iu4 +XU0 x v0 +XU1 x v1 +XU2 x v2 +XU3 x v3 +XU4 x v4 +XUAN0 x van0 +XUAN1 x van1 +XUAN2 x van2 +XUAN3 x van3 +XUAN4 x van4 +XUE0 x ve0 +XUE1 x ve1 +XUE2 x ve2 +XUE3 x ve3 +XUE4 x ve4 +XUN0 x vn0 +XUN1 x vn1 +XUN2 x vn2 +XUN3 x vn3 +XUN4 x vn4 +YA0 ii ia0 +YA1 ii ia1 +YA2 ii ia2 +YA3 ii ia3 +YA4 ii ia4 +YAN0 ii ian0 +YAN1 ii ian1 +YAN2 ii ian2 +YAN3 ii ian3 +YAN4 ii ian4 +YANG0 ii iang0 +YANG1 ii iang1 +YANG2 ii iang2 +YANG3 ii iang3 +YANG4 ii iang4 +YAO0 ii iao0 +YAO1 ii 
iao1 +YAO2 ii iao2 +YAO3 ii iao3 +YAO4 ii iao4 +YE0 ii ie0 +YE1 ii ie1 +YE2 ii ie2 +YE3 ii ie3 +YE4 ii ie4 +YI0 ii i0 +YI1 ii i1 +YI2 ii i2 +YI3 ii i3 +YI4 ii i4 +YIN0 ii in0 +YIN1 ii in1 +YIN2 ii in2 +YIN3 ii in3 +YIN4 ii in4 +YING0 ii ing0 +YING1 ii ing1 +YING2 ii ing2 +YING3 ii ing3 +YING4 ii ing4 +YO0 ii ou0 +YO1 ii ou1 +YO2 ii ou2 +YO3 ii ou3 +YO4 ii ou4 +YONG0 ii iong0 +YONG1 ii iong1 +YONG2 ii iong2 +YONG3 ii iong3 +YONG4 ii iong4 +YOU0 ii iu0 +YOU1 ii iu1 +YOU2 ii iu2 +YOU3 ii iu3 +YOU4 ii iu4 +YU0 vv v0 +YU1 vv v1 +YU2 vv v2 +YU3 vv v3 +YU4 vv v4 +YUAN0 vv van0 +YUAN1 vv van1 +YUAN2 vv van2 +YUAN3 vv van3 +YUAN4 vv van4 +YUE0 vv ve0 +YUE1 vv ve1 +YUE2 vv ve2 +YUE3 vv ve3 +YUE4 vv ve4 +YUN0 vv vn0 +YUN1 vv vn1 +YUN2 vv vn2 +YUN3 vv vn3 +YUN4 vv vn4 +YUO0 ii ou0 +YUO1 ii ou1 +YUO2 ii ou2 +YUO3 ii ou3 +YUO4 ii ou4 +ZA0 z a0 +ZA1 z a1 +ZA2 z a2 +ZA3 z a3 +ZA4 z a4 +ZAI0 z ai0 +ZAI1 z ai1 +ZAI2 z ai2 +ZAI3 z ai3 +ZAI4 z ai4 +ZAN0 z an0 +ZAN1 z an1 +ZAN2 z an2 +ZAN3 z an3 +ZAN4 z an4 +ZANG0 z ang0 +ZANG1 z ang1 +ZANG2 z ang2 +ZANG3 z ang3 +ZANG4 z ang4 +ZAO0 z ao0 +ZAO1 z ao1 +ZAO2 z ao2 +ZAO3 z ao3 +ZAO4 z ao4 +ZE0 z e0 +ZE1 z e1 +ZE2 z e2 +ZE3 z e3 +ZE4 z e4 +ZEI0 z ei0 +ZEI1 z ei1 +ZEI2 z ei2 +ZEI3 z ei3 +ZEI4 z ei4 +ZEN0 z en0 +ZEN1 z en1 +ZEN2 z en2 +ZEN3 z en3 +ZEN4 z en4 +ZENG0 z eng0 +ZENG1 z eng1 +ZENG2 z eng2 +ZENG3 z eng3 +ZENG4 z eng4 +ZHA0 zh a0 +ZHA1 zh a1 +ZHA2 zh a2 +ZHA3 zh a3 +ZHA4 zh a4 +ZHAI0 zh ai0 +ZHAI1 zh ai1 +ZHAI2 zh ai2 +ZHAI3 zh ai3 +ZHAI4 zh ai4 +ZHAN0 zh an0 +ZHAN1 zh an1 +ZHAN2 zh an2 +ZHAN3 zh an3 +ZHAN4 zh an4 +ZHANG0 zh ang0 +ZHANG1 zh ang1 +ZHANG2 zh ang2 +ZHANG3 zh ang3 +ZHANG4 zh ang4 +ZHAO0 zh ao0 +ZHAO1 zh ao1 +ZHAO2 zh ao2 +ZHAO3 zh ao3 +ZHAO4 zh ao4 +ZHE0 zh e0 +ZHE1 zh e1 +ZHE2 zh e2 +ZHE3 zh e3 +ZHE4 zh e4 +ZHEI0 zh ei0 +ZHEI1 zh ei1 +ZHEI2 zh ei2 +ZHEI3 zh ei3 +ZHEI4 zh ei4 +ZHEN0 zh en0 +ZHEN1 zh en1 +ZHEN2 zh en2 +ZHEN3 zh en3 +ZHEN4 zh en4 +ZHENG0 zh eng0 +ZHENG1 zh eng1 +ZHENG2 zh eng2 +ZHENG3 zh eng3 +ZHENG4 zh 
eng4 +ZHI0 zh ix0 +ZHI1 zh ix1 +ZHI2 zh ix2 +ZHI3 zh ix3 +ZHI4 zh ix4 +ZHONG0 zh ong0 +ZHONG1 zh ong1 +ZHONG2 zh ong2 +ZHONG3 zh ong3 +ZHONG4 zh ong4 +ZHOU0 zh ou0 +ZHOU1 zh ou1 +ZHOU2 zh ou2 +ZHOU3 zh ou3 +ZHOU4 zh ou4 +ZHU0 zh u0 +ZHU1 zh u1 +ZHU2 zh u2 +ZHU3 zh u3 +ZHU4 zh u4 +ZHUA0 zh ua0 +ZHUA1 zh ua1 +ZHUA2 zh ua2 +ZHUA3 zh ua3 +ZHUA4 zh ua4 +ZHUAI0 zh uai0 +ZHUAI1 zh uai1 +ZHUAI2 zh uai2 +ZHUAI3 zh uai3 +ZHUAI4 zh uai4 +ZHUAN0 zh uan0 +ZHUAN1 zh uan1 +ZHUAN2 zh uan2 +ZHUAN3 zh uan3 +ZHUAN4 zh uan4 +ZHUANG0 zh uang0 +ZHUANG1 zh uang1 +ZHUANG2 zh uang2 +ZHUANG3 zh uang3 +ZHUANG4 zh uang4 +ZHUI0 zh ui0 +ZHUI1 zh ui1 +ZHUI2 zh ui2 +ZHUI3 zh ui3 +ZHUI4 zh ui4 +ZHUN0 zh un0 +ZHUN1 zh un1 +ZHUN2 zh un2 +ZHUN3 zh un3 +ZHUN4 zh un4 +ZHUO0 zh uo0 +ZHUO1 zh uo1 +ZHUO2 zh uo2 +ZHUO3 zh uo3 +ZHUO4 zh uo4 +ZI0 z iy0 +ZI1 z iy1 +ZI2 z iy2 +ZI3 z iy3 +ZI4 z iy4 +ZONG0 z ong0 +ZONG1 z ong1 +ZONG2 z ong2 +ZONG3 z ong3 +ZONG4 z ong4 +ZOU0 z ou0 +ZOU1 z ou1 +ZOU2 z ou2 +ZOU3 z ou3 +ZOU4 z ou4 +ZU0 z u0 +ZU1 z u1 +ZU2 z u2 +ZU3 z u3 +ZU4 z u4 +ZUAN0 z uan0 +ZUAN1 z uan1 +ZUAN2 z uan2 +ZUAN3 z uan3 +ZUAN4 z uan4 +ZUI0 z ui0 +ZUI1 z ui1 +ZUI2 z ui2 +ZUI3 z ui3 +ZUI4 z ui4 +ZUN0 z un0 +ZUN1 z un1 +ZUN2 z un2 +ZUN3 z un3 +ZUN4 z un4 +ZUO0 z uo0 +ZUO1 z uo1 +ZUO2 z uo2 +ZUO3 z uo3 +ZUO4 z uo4 +EI0 ee ei0 +EI1 ee ei1 +EI2 ee ei2 +EI3 ee ei3 +EI4 ee ei4 +TEI0 t ei0 +TEI1 t ei1 +TEI2 t ei2 +TEI3 t ei3 +TEI4 t ei4 +HNG0 ee eng0 +HNG1 ee eng1 +HNG2 ee eng2 +HNG3 ee eng3 +HNG4 ee eng4 +LO0 l o0 +LO1 l o1 +LO2 l o2 +LO3 l o3 +LO4 l o4 +N0 ee en0 +N1 ee en1 +N2 ee en2 +N3 ee en3 +N4 ee en4 +NG0 ee eng0 +NG1 ee eng1 +NG2 ee eng2 +NG3 ee eng3 +NG4 ee eng4 +NOU0 n ao0 +NOU1 n ao1 +NOU2 n ao2 +NOU3 n ao3 +NOU4 n ao4 +SEI0 s ei0 +SEI1 s ei1 +SEI2 s ei2 +SEI3 s ei3 +SEI4 s ei4 +A5 aa a5 +AI5 aa ai5 +AN5 aa an5 +ANG5 aa ang5 +AO5 aa ao5 +BA5 b a5 +BAI5 b ai5 +BAN5 b an5 +BANG5 b ang5 +BAO5 b ao5 +BEI5 b ei5 +BEN5 b en5 +BENG5 b eng5 +BI5 b i5 +BIAN5 b ian5 +BIAO5 b iao5 +BIE5 b ie5 +BIN5 b in5 
+BING5 b ing5 +BO5 b o5 +BU5 b u5 +CA5 c a5 +CAI5 c ai5 +CAN5 c an5 +CANG5 c ang5 +CAO5 c ao5 +CE5 c e5 +CEN5 c en5 +CENG5 c eng5 +CHA5 ch a5 +CHAI5 ch ai5 +CHAN5 ch an5 +CHANG5 ch ang5 +CHAO5 ch ao5 +CHE5 ch e5 +CHEN5 ch en5 +CHENG5 ch eng5 +CHI5 ch ix5 +CHONG5 ch ong5 +CHOU5 ch ou5 +CHU5 ch u5 +CHUAI5 ch uai5 +CHUAN5 ch uan5 +CHUANG5 ch uang5 +CHUI5 ch ui5 +CHUN5 ch un5 +CHUO5 ch uo5 +CI5 c iy5 +CONG5 c ong5 +COU5 c ou5 +CU5 c u5 +CUAN5 c uan5 +CUI5 c ui5 +CUN5 c un5 +CUO5 c uo5 +DA5 d a5 +DAI5 d ai5 +DAN5 d an5 +DANG5 d ang5 +DAO5 d ao5 +DE5 d e5 +DEI5 d ei5 +DEN5 d en5 +DENG5 d eng5 +DI5 d i5 +DIA5 d ia5 +DIAN5 d ian5 +DIAO5 d iao5 +DIE5 d ie5 +DING5 d ing5 +DIU5 d iu5 +DONG5 d ong5 +DOU5 d ou5 +DU5 d u5 +DUAN5 d uan5 +DUI5 d ui5 +DUN5 d un5 +DUO5 d uo5 +E5 ee e5 +EN5 ee en5 +ER5 ee er5 +FA5 f a5 +FAN5 f an5 +FANG5 f ang5 +FEI5 f ei5 +FEN5 f en5 +FENG5 f eng5 +FO5 f o5 +FOU5 f ou5 +FU5 f u5 +GA5 g a5 +GAI5 g ai5 +GAN5 g an5 +GANG5 g ang5 +GAO5 g ao5 +GE5 g e5 +GEI5 g ei5 +GEN5 g en5 +GENG5 g eng5 +GONG5 g ong5 +GOU5 g ou5 +GU5 g u5 +GUA5 g ua5 +GUAI5 g uai5 +GUAN5 g uan5 +GUANG5 g uang5 +GUI5 g ui5 +GUN5 g un5 +GUO5 g uo5 +HA5 h a5 +HAI5 h ai5 +HAN5 h an5 +HANG5 h ang5 +HAO5 h ao5 +HE5 h e5 +HEI5 h ei5 +HEN5 h en5 +HENG5 h eng5 +HONG5 h ong5 +HOU5 h ou5 +HU5 h u5 +HUA5 h ua5 +HUAI5 h uai5 +HUAN5 h uan5 +HUANG5 h uang5 +HUI5 h ui5 +HUN5 h un5 +HUO5 h uo5 +JI5 j i5 +JIA5 j ia5 +JIAN5 j ian5 +JIANG5 j iang5 +JIAO5 j iao5 +JIE5 j ie5 +JIN5 j in5 +JING5 j ing5 +JIONG5 j iong5 +JIU5 j iu5 +JU5 j v5 +JUAN5 j van5 +JUE5 j ve5 +JUN5 j vn5 +KA5 k a5 +KAI5 k ai5 +KAN5 k an5 +KANG5 k ang5 +KAO5 k ao5 +KE5 k e5 +KEI5 k ei5 +KEN5 k en5 +KENG5 k eng5 +KONG5 k ong5 +KOU5 k ou5 +KU5 k u5 +KUA5 k ua5 +KUAI5 k uai5 +KUAN5 k uan5 +KUANG5 k uang5 +KUI5 k ui5 +KUN5 k un5 +KUO5 k uo5 +LA5 l a5 +LAI5 l ai5 +LAN5 l an5 +LANG5 l ang5 +LAO5 l ao5 +LE5 l e5 +LEI5 l ei5 +LENG5 l eng5 +LI5 l i5 +LIA5 l ia5 +LIAN5 l ian5 +LIANG5 l iang5 +LIAO5 l iao5 +LIE5 l ie5 +LIN5 l in5 +LING5 l ing5 
+LIU5 l iu5 +LONG5 l ong5 +LOU5 l ou5 +LU5 l u5 +LUAN5 l uan5 +LUE5 l ve5 +LVE5 l ve5 +LUN5 l un5 +LUO5 l uo5 +LV5 l v5 +MA5 m a5 +MAI5 m ai5 +MAN5 m an5 +MANG5 m ang5 +MAO5 m ao5 +ME5 m e5 +MEI5 m ei5 +MEN5 m en5 +MENG5 m eng5 +MI5 m i5 +MIAN5 m ian5 +MIAO5 m iao5 +MIE5 m ie5 +MIN5 m in5 +MING5 m ing5 +MIU5 m iu5 +MO5 m o5 +MOU5 m ou5 +MU5 m u5 +NA5 n a5 +NAI5 n ai5 +NAN5 n an5 +NANG5 n ang5 +NAO5 n ao5 +NE5 n e5 +NEI5 n ei5 +NEN5 n en5 +NENG5 n eng5 +NI5 n i5 +NIAN5 n ian5 +NIANG5 n iang5 +NIAO5 n iao5 +NIE5 n ie5 +NIN5 n in5 +NING5 n ing5 +NIU5 n iu5 +NONG5 n ong5 +NU5 n u5 +NUAN5 n uan5 +NUE5 n ve5 +NVE5 n ve5 +NUO5 n uo5 +NV5 n v5 +O5 oo o5 +OU5 oo ou5 +PA5 p a5 +PAI5 p ai5 +PAN5 p an5 +PANG5 p ang5 +PAO5 p ao5 +PEI5 p ei5 +PEN5 p en5 +PENG5 p eng5 +PI5 p i5 +PIAN5 p ian5 +PIAO5 p iao5 +PIE5 p ie5 +PIN5 p in5 +PING5 p ing5 +PO5 p o5 +POU5 p ou5 +PU5 p u5 +QI5 q i5 +QIA5 q ia5 +QIAN5 q ian5 +QIANG5 q iang5 +QIAO5 q iao5 +QIE5 q ie5 +QIN5 q in5 +QING5 q ing5 +QIONG5 q iong5 +QIU5 q iu5 +QU5 q v5 +QUAN5 q van5 +QUE5 q ve5 +QUN5 q vn5 +RAN5 r an5 +RANG5 r ang5 +RAO5 r ao5 +RE5 r e5 +REN5 r en5 +RENG5 r eng5 +RI5 r iz5 +RONG5 r ong5 +ROU5 r ou5 +RU5 r u5 +RUAN5 r uan5 +RUI5 r ui5 +RUN5 r un5 +RUO5 r uo5 +SA5 s a5 +SAI5 s ai5 +SAN5 s an5 +SANG5 s ang5 +SAO5 s ao5 +SE5 s e5 +SEN5 s en5 +SENG5 s eng5 +SHA5 sh a5 +SHAI5 sh ai5 +SHAN5 sh an5 +SHANG5 sh ang5 +SHAO5 sh ao5 +SHE5 sh e5 +SHEI5 sh ei5 +SHEN5 sh en5 +SHENG5 sh eng5 +SHI5 sh ix5 +SHOU5 sh ou5 +SHU5 sh u5 +SHUA5 sh ua5 +SHUAI5 sh uai5 +SHUAN5 sh uan5 +SHUANG5 sh uang5 +SHUI5 sh ui5 +SHUN5 sh un5 +SHUO5 sh uo5 +SI5 s iy5 +SONG5 s ong5 +SOU5 s ou5 +SU5 s u5 +SUAN5 s uan5 +SUI5 s ui5 +SUN5 s un5 +SUO5 s uo5 +TA5 t a5 +TAI5 t ai5 +TAN5 t an5 +TANG5 t ang5 +TAO5 t ao5 +TE5 t e5 +TENG5 t eng5 +TI5 t i5 +TIAN5 t ian5 +TIAO5 t iao5 +TIE5 t ie5 +TING5 t ing5 +TONG5 t ong5 +TOU5 t ou5 +TU5 t u5 +TUAN5 t uan5 +TUI5 t ui5 +TUN5 t un5 +TUO5 t uo5 +WA5 uu ua5 +WAI5 uu uai5 +WAN5 uu uan5 +WANG5 uu uang5 +WEI5 uu ui5 +WEN5 uu 
#!/usr/bin/env bash
# Prepare THCHS-30 for MFA: download the dataset and build manifests, dump the
# training manifest to Kaldi-style text files, collect/generate the lexicons
# and lay the corpus out as <utt>.wav + <utt>.lab pairs.
#
# Usage: local/data.sh [--stage N] [--stop-stage N] <lexicon-name>
#   <lexicon-name> is one of word / syllable / phone (default: phone).
# Requires MAIN_ROOT to be set (see path.sh).

stage=-1
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh

# data/dict receives the copied and generated lexicons below.
mkdir -p data/dict
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
LEXICON_NAME=${1:-phone}
CORPUS_DIR=data/thchs30_corpus

# download data, generate manifests
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    python3 ${TARGET_DIR}/thchs30/thchs30.py \
        --manifest_prefix="data/manifest" \
        --target_dir="${TARGET_DIR}/thchs30"

    if [ $? -ne 0 ]; then
        echo "Prepare THCHS-30 failed. Terminated."
        exit 1
    fi
fi

# dump manifest to data/ (wav.scp, text.word, text.syllable, text.phone)
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data || exit 1

# copy files to data/dict to gen word.lexicon
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1 || exit 1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2 || exit 1

# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon || exit 1

# gen word.lexicon
python3 local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict || exit 1

# reorganize dataset for MFA
# The corpus is written to data/thchs30_corpus, so that is the directory to
# test; EXP_DIR is not defined in this script.
# NOTE(review): an existing corpus is reused as-is, so its *.lab files keep the
# script type of the run that created it. Remove the directory before switching
# LEXICON_NAME.
if [ ! -d "${CORPUS_DIR}" ]; then
    echo "reorganizing thchs30 corpus..."
    python3 local/reorganize_thchs30.py --root-dir=data --output-dir="${CORPUS_DIR}" --script-type="${LEXICON_NAME}" || exit 1
    echo "reorganization done."
fi

echo "THCHS-30 data preparation done."
exit 0
"""Generate a Chinese-character-to-phone lexicon from the THCHS-30 lexicons.

Input files, read from ``--root-dir``:
    lm_word_lexicon_1: copy of THCHS-30/data_thchs30/lm_word/lexicon.txt
    lm_word_lexicon_2: copy of THCHS-30/resource/dict/lexicon.txt
Output file, written to ``--output-dir``:
    word.lexicon: one ``<char> <probability> <phone> <phone>`` line for every
    pronunciation observed for a character.
"""
import argparse
from collections import defaultdict
from pathlib import Path
from typing import Union

# key: (cn, ('ee', 'er4')), value: count
cn_phones_counter = defaultdict(int)
# key: cn, value: list of (phones, num)
cn_counter = defaultdict(list)
# key: cn, value: list of (phones, probabilities)
cn_counter_p = defaultdict(list)


def is_Chinese(ch):
    """Return True if *ch* lies in the CJK range U+4E00..U+9FFF."""
    return '\u4e00' <= ch <= '\u9fff'


def proc_line(line):
    """Count the per-character pronunciations found in one lexicon line.

    A usable line is ``<word> <phone> <phone> ...`` where the word starts with
    a Chinese character and is followed by exactly two phones per character.
    Blank lines and lines that do not fit this shape are ignored. Counts are
    accumulated in the module-level ``cn_phones_counter``.
    """
    line = line.strip()
    # Guard against blank lines before looking at the first character.
    if not line or not is_Chinese(line[0]):
        return
    fields = line.split()
    word, phones = fields[0], fields[1:]
    if len(phones) != 2 * len(word):
        return
    # Pair the phones up two by two, one pair per character of the word.
    for cn, pair in zip(word, zip(phones[0::2], phones[1::2])):
        cn_phones_counter[(cn, pair)] += 1


def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    """Build ``word.lexicon`` with pronunciation probabilities.

    Args:
        root_dir: directory holding ``lm_word_lexicon_1`` and
            ``lm_word_lexicon_2``.
        output_dir: directory that receives ``word.lexicon``; created if
            missing.

    Each pronunciation's probability is its count divided by the total count
    for that character, rounded to two decimals. Pronunciations whose
    probability rounds to 0 are left out.
    """
    root_dir = Path(root_dir).expanduser()
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    write_file = output_dir / "word.lexicon"

    # The lexicons contain Chinese text, so read and write them as UTF-8
    # instead of relying on the locale's default encoding.
    for name in ("lm_word_lexicon_1", "lm_word_lexicon_2"):
        with open(root_dir / name, "r", encoding="utf-8") as rf:
            for line in rf:
                proc_line(line)

    # The derived tables are rebuilt from cn_phones_counter on every call, so
    # a repeated call does not append duplicate entries.
    cn_counter.clear()
    cn_counter_p.clear()
    for (cn, phones), count in cn_phones_counter.items():
        cn_counter[cn].append((phones, count))

    for cn, phone_count_list in cn_counter.items():
        count_sum = sum(count for _, count in phone_count_list)
        for phones, count in phone_count_list:
            p = round(count / count_sum, 2)
            if p > 0:
                cn_counter_p[cn].append((phones, p))

    with open(write_file, "w", encoding="utf-8") as wf:
        for cn, phone_p_list in cn_counter_p.items():
            for phones, p in phone_p_list:
                wf.write(cn + " " + str(p) + " " + " ".join(phones) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
    )
    parser.add_argument(
        "--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
    parser.add_argument("--output-dir", type=str, help="path to save outputs")
    args = parser.parse_args()
    gen_lexicon(args.root_dir, args.output_dir)
"""Reorganize THCHS-30 for MFA.

Reads ``wav.scp`` and ``text.<script-type>`` from root-dir, links every wav
listed in ``wav.scp`` into output-dir and writes one ``<utt>.lab`` transcript
per utterance next to it. The script type is one of word, syllable or phone.
"""
import argparse
import os
from pathlib import Path
from typing import Union


def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    """Symlink every audio file listed in ``root_dir/wav.scp`` into output_dir.

    Each non-blank line of ``wav.scp`` is ``<utt> <wav path>``. The link is
    named after the wav file's base name.

    Raises:
        ValueError: if a non-blank line has no path field.
    """
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    with open(root_dir / 'wav.scp', 'r', encoding='utf-8') as rf:
        for line in rf:
            # maxsplit=1 keeps paths containing spaces in one piece.
            fields = line.strip().split(maxsplit=1)
            if not fields:
                continue
            if len(fields) != 2:
                raise ValueError(f"malformed wav.scp line: {line!r}")
            wav_path = fields[1]
            # A relative target would be resolved against the link's own
            # directory, so anchor it to the current working directory.
            if not os.path.isabs(wav_path):
                wav_path = os.path.abspath(wav_path)
            new_wav_path = output_dir / os.path.basename(wav_path)
            if os.path.lexists(new_wav_path):
                if not os.path.islink(new_wav_path):
                    # A real file is already in place; leave it alone.
                    continue
                # Refresh links left by an earlier run so that re-running
                # does not fail with FileExistsError.
                os.unlink(new_wav_path)
            os.symlink(wav_path, new_wav_path)


def write_lab(root_dir: Union[str, Path],
              output_dir: Union[str, Path],
              script_type='phone'):
    """Write one ``<utt>.lab`` transcript per line of ``root_dir/text.<script_type>``.

    Args:
        root_dir: directory holding ``text.word`` / ``text.syllable`` /
            ``text.phone``.
        output_dir: existing directory that receives the ``.lab`` files.
        script_type: one of 'word', 'syllable', 'phone'.

    Each non-blank input line is ``<utt> <token> <token> ...``. For 'word' the
    tokens are split into single Chinese characters separated by one space.
    Other types are written with tokens joined by one space.
    """
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    with open(root_dir / ('text.' + script_type), 'r', encoding='utf-8') as rf:
        for line in rf:
            fields = line.split()
            if not fields:
                continue
            utt_id, tokens = fields[0], fields[1:]
            if script_type == 'word':
                # add space between chinese char
                context = ' '.join(''.join(tokens))
            else:
                context = ' '.join(tokens)
            transcript_path = output_dir / (utt_id + '.lab')
            with open(transcript_path, 'wt', encoding='utf-8') as wf:
                wf.write(context + "\n")


def reorganize_thchs30(root_dir: Union[str, Path],
                       output_dir: Union[str, Path]=None,
                       script_type='phone'):
    """Create the MFA corpus layout (wav links plus .lab files) in output_dir.

    Args:
        root_dir: directory holding ``wav.scp`` and ``text.<script_type>``.
        output_dir: corpus directory to create and fill; required.
        script_type: one of 'word', 'syllable', 'phone'.

    Raises:
        ValueError: if output_dir is not given.
    """
    if output_dir is None:
        raise ValueError("output_dir is required")
    root_dir = Path(root_dir).expanduser()
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    link_wav(root_dir, output_dir)
    write_lab(root_dir, output_dir, script_type)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Reorganize THCHS-30 dataset for MFA")
    parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
    parser.add_argument(
        "--output-dir",
        type=str,
        help="path to save outputs(audio and transcriptions)")

    parser.add_argument(
        "--script-type",
        type=str,
        default="phone",
        help="type of lab ('word'/'syllable'/'phone')")
    args = parser.parse_args()
    reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
#!/bin/bash
# Forced-alignment recipe for THCHS-30: prepare the data, then train an MFA
# model and align the corpus with it.
set -e
source path.sh
stage=0
stop_stage=100
EXP_DIR=exp
# LEXICON_NAME in {'phone', 'syllable', 'word'}
LEXICON_NAME='phone'
# set MFA num_jobs as half of machine's cpu core number, but never below 1
# (integer division gives 0 on a single-core machine)
NUM_JOBS=$(( $(nproc) / 2 ))
if [ "${NUM_JOBS}" -lt 1 ]; then
    NUM_JOBS=1
fi
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

# download dataset, unzip and generate manifest
# gen lexicon relink gen dump
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh $LEXICON_NAME || exit 1
fi

# run MFA
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
    echo "Start MFA training..."
    # keep MFA's temporary files under EXP_DIR as well
    mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align --num_jobs $NUM_JOBS
    # printf interprets \n; plain echo would print it literally
    printf 'training done! \nresults: %s \nmodel: %s\n' "$EXP_DIR/thchs30_alignment" "$EXP_DIR/thchs30_model"
fi
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
import argparse
from pathlib import Path
from typing import Union

from deepspeech.frontend.utility import read_manifest

# Manifest keys that are dumped; every other key is ignored.
key_whitelist = {'feat', 'text', 'syllable', 'phone'}
# Output file name for each dumped manifest key.
filename = {
    'text': 'text.word',
    'syllable': 'text.syllable',
    'phone': 'text.phone',
    'feat': 'wav.scp',
}


def dump_manifest(manifest_path, output_dir: Union[str, Path]):
    """Split a manifest into one ``<utt> <value>`` text file per dumped key.

    Args:
        manifest_path: path of the manifest, loaded with ``read_manifest``.
        output_dir: directory that receives ``wav.scp`` / ``text.word`` /
            ``text.syllable`` / ``text.phone``; created if missing.

    For every manifest entry and every key in ``key_whitelist`` present on it,
    the line ``<entry['utt']> <entry[key]>`` is appended to the file named by
    ``filename[key]``.

    Raises:
        ValueError: if the manifest contains no entries.
    """
    output_dir = Path(output_dir).expanduser()
    manifest_path = Path(manifest_path).expanduser()
    manifest_jsons = read_manifest(manifest_path)
    if not manifest_jsons:
        raise ValueError(f"manifest is empty: {manifest_path}")
    output_dir.mkdir(parents=True, exist_ok=True)

    file_map = {}
    try:
        for line_json in manifest_jsons:
            for k, value in line_json.items():
                if k not in key_whitelist:
                    continue
                # Open lazily so a key that first appears on a later entry
                # still gets its file instead of raising KeyError.
                if k not in file_map:
                    file_map[k] = open(
                        output_dir / filename[k], 'w', encoding='utf-8')
                file_map[k].write(line_json['utt'] + ' ' + value + '\n')
    finally:
        # Close whatever was opened, even if a write failed part-way.
        for file in file_map.values():
            file.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="dump manifest to wav.scp text.word ...")
    parser.add_argument("--manifest-path", type=str, help="path to manifest")
    parser.add_argument(
        "--output-dir",
        type=str,
        help="path to save outputs(audio and transcriptions)")
    args = parser.parse_args()
    dump_manifest(args.manifest_path, args.output_dir)