add mix frontend test

pull/3316/head
Hui Zhang 1 year ago
parent 40124ed34f
commit 108e73e1a0

@ -18,9 +18,9 @@ from typing import List
import numpy as np
import paddle
from paddlespeech.t2s.frontend.en_frontend import EnFrontend
from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
from paddlespeech.t2s.frontend.zh_frontend import ZhFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend as ZhFrontend
class MixFrontend():
@ -28,7 +28,6 @@ class MixFrontend():
g2p_model="pypinyin",
phone_vocab_path=None,
tone_vocab_path=None):
self.zh_frontend = ZhFrontend(
phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path)
self.en_frontend = EnFrontend(phone_vocab_path=phone_vocab_path)
@ -55,15 +54,12 @@ class MixFrontend():
else:
return False
def get_segment(self, text: str) -> List[str]:
def split_by_lang(self, text: str) -> List[str]:
# sentence --> [ch_part, en_part, ch_part, ...]
segments = []
types = []
flag = 0
temp_seg = ""
temp_lang = ""
# Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
# Determine the type of each character. type: chinese, alphabet, other.
for ch in text:
if self.is_chinese(ch):
types.append("zh")
@ -74,31 +70,31 @@ class MixFrontend():
assert len(types) == len(text)
for i in range(len(types)):
flag = 0
temp_seg = ""
temp_lang = ""
for i in range(len(text)):
# find the first char of the seg
if flag == 0:
temp_seg += text[i]
temp_lang = types[i]
flag = 1
else:
if temp_lang == "other":
if types[i] == temp_lang:
temp_seg += text[i]
else:
temp_seg += text[i]
# text start is not lang.
temp_seg += text[i]
if types[i] != temp_lang:
temp_lang = types[i]
else:
if types[i] == temp_lang:
temp_seg += text[i]
elif types[i] == "other":
if types[i] == temp_lang or types[i] == "other":
# merge same lang or other
temp_seg += text[i]
else:
# change lang
segments.append((temp_seg, temp_lang))
temp_seg = text[i]
temp_lang = types[i]
flag = 1
temp_lang = types[i] # new lang
segments.append((temp_seg, temp_lang))
@ -120,7 +116,7 @@ class MixFrontend():
if instr.lower().startswith("<say-as"):
tmpSegments.append((instr, "zh"))
else:
tmpSegments.extend(self.get_segment(instr))
tmpSegments.extend(self.split_by_lang(instr))
''' 2. 把zh的merge到一起避免合成结果中间停顿
'''
segments = []

@ -0,0 +1,431 @@
import re
import tempfile
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
# Combined Chinese (zh) and English (en) phoneme vocabulary, listed as "<phone> <id>" pairs.
phone_id_str = """
<pad> 0
<unk> 1
AA0 2
AA1 3
AA2 4
AE0 5
AE1 6
AE2 7
AH0 8
AH1 9
AH2 10
AO0 11
AO1 12
AO2 13
AW0 14
AW1 15
AW2 16
AY0 17
AY1 18
AY2 19
B 20
CH 21
D 22
DH 23
EH0 24
EH1 25
EH2 26
ER0 27
ER1 28
ER2 29
EY0 30
EY1 31
EY2 32
F 33
G 34
HH 35
IH0 36
IH1 37
IH2 38
IY0 39
IY1 40
IY2 41
JH 42
K 43
L 44
M 45
N 46
NG 47
OW0 48
OW1 49
OW2 50
OY0 51
OY1 52
OY2 53
P 54
R 55
S 56
SH 57
T 58
TH 59
UH0 60
UH1 61
UH2 62
UW0 63
UW1 64
UW2 65
V 66
W 67
Y 68
Z 69
ZH 70
a1 71
a2 72
a3 73
a4 74
a5 75
ai1 76
ai2 77
ai3 78
ai4 79
ai5 80
air2 81
air3 82
air4 83
an1 84
an2 85
an3 86
an4 87
an5 88
ang1 89
ang2 90
ang3 91
ang4 92
ang5 93
angr2 94
angr4 95
anr1 96
anr3 97
anr4 98
ao1 99
ao2 100
ao3 101
ao4 102
ao5 103
aor1 104
aor3 105
aor4 106
aor5 107
ar2 108
ar3 109
ar4 110
ar5 111
b 112
c 113
ch 114
d 115
e1 116
e2 117
e3 118
e4 119
e5 120
ei1 121
ei2 122
ei3 123
ei4 124
ei5 125
eir4 126
en1 127
en2 128
en3 129
en4 130
en5 131
eng1 132
eng2 133
eng3 134
eng4 135
eng5 136
engr4 137
enr1 138
enr2 139
enr3 140
enr4 141
enr5 142
er1 143
er2 144
er3 145
er4 146
er5 147
f 148
g 149
h 150
i1 151
i2 152
i3 153
i4 154
i5 155
ia1 156
ia2 157
ia3 158
ia4 159
ia5 160
ian1 161
ian2 162
ian3 163
ian4 164
ian5 165
iang1 166
iang2 167
iang3 168
iang4 169
iang5 170
iangr4 171
ianr1 172
ianr2 173
ianr3 174
ianr4 175
ianr5 176
iao1 177
iao2 178
iao3 179
iao4 180
iao5 181
iaor1 182
iaor2 183
iaor3 184
iaor4 185
iar1 186
iar3 187
iar4 188
ie1 189
ie2 190
ie3 191
ie4 192
ie5 193
ii1 194
ii2 195
ii3 196
ii4 197
ii5 198
iii1 199
iii2 200
iii3 201
iii4 202
iii5 203
iiir1 204
iiir4 205
iir2 206
in1 207
in2 208
in3 209
in4 210
in5 211
ing1 212
ing2 213
ing3 214
ing4 215
ing5 216
ingr1 217
ingr2 218
ingr3 219
ingr4 220
inr1 221
inr4 222
io1 223
io3 224
io5 225
iong1 226
iong2 227
iong3 228
iong4 229
iong5 230
iou1 231
iou2 232
iou3 233
iou4 234
iou5 235
iour1 236
iour2 237
iour3 238
iour4 239
ir1 240
ir2 241
ir3 242
ir4 243
ir5 244
j 245
k 246
l 247
m 248
n 249
o1 250
o2 251
o3 252
o4 253
o5 254
ong1 255
ong2 256
ong3 257
ong4 258
ong5 259
ongr4 260
or2 261
ou1 262
ou2 263
ou3 264
ou4 265
ou5 266
our2 267
our3 268
our4 269
our5 270
p 271
q 272
r 273
s 274
sh 275
sil 276
sp 277
spl 278
spn 279
t 280
u1 281
u2 282
u3 283
u4 284
u5 285
ua1 286
ua2 287
ua3 288
ua4 289
ua5 290
uai1 291
uai2 292
uai3 293
uai4 294
uai5 295
uair4 296
uan1 297
uan2 298
uan3 299
uan4 300
uan5 301
uang1 302
uang2 303
uang3 304
uang4 305
uang5 306
uangr4 307
uanr1 308
uanr2 309
uanr3 310
uanr4 311
uanr5 312
uar1 313
uar2 314
uar4 315
uei1 316
uei2 317
uei3 318
uei4 319
uei5 320
ueir1 321
ueir2 322
ueir3 323
ueir4 324
uen1 325
uen2 326
uen3 327
uen4 328
uen5 329
ueng1 330
ueng2 331
ueng3 332
ueng4 333
uenr1 334
uenr2 335
uenr3 336
uenr4 337
uo1 338
uo2 339
uo3 340
uo4 341
uo5 342
uor1 343
uor2 344
uor3 345
uor5 346
ur1 347
ur2 348
ur3 349
ur4 350
ur5 351
v1 352
v2 353
v3 354
v4 355
v5 356
van1 357
van2 358
van3 359
van4 360
van5 361
vanr1 362
vanr2 363
vanr3 364
vanr4 365
ve1 366
ve2 367
ve3 368
ve4 369
ve5 370
ver3 371
ver4 372
vn1 373
vn2 374
vn3 375
vn4 376
vn5 377
vnr2 378
vr3 379
x 380
z 381
zh 382
, 383
. 384
? 385
! 386
<eos> 387
"""
if __name__ == '__main__':
    # Manual smoke test for MixFrontend.split_by_lang: for each sample text,
    # print the text and then the segments returned by split_by_lang.
    #
    # MixFrontend takes a vocabulary *path* (phone_vocab_path), so the
    # in-memory vocabulary is first written to a temporary file.
    with tempfile.NamedTemporaryFile(mode='wt') as f:
        phone_ids = phone_id_str.split()
        # The whitespace-split tokens alternate phone, id, phone, id, ...;
        # write them back as one "<phone> <id>" pair per line.
        f.writelines(
            f"{phone} {phone_id}\n"
            for phone, phone_id in zip(phone_ids[::2], phone_ids[1::2]))
        # Flush so the content is on disk before the path is handed over
        # (presumably the frontend reads the file by name - confirm).
        f.flush()

        frontend = MixFrontend(phone_vocab_path=f.name)

        # Each entry is preceded by the output recorded by the original
        # author; these are notes, not assertions, and are not checked here.
        # NOTE(review): the recorded outputs show a ',' after 天安们 that is
        # not present in the input strings as written - confirm.
        texts = [
            # [('hello, ', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
            "hello, 我爱北京天安们what about you.",
            # [('hello?!!', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
            "hello?!!我爱北京天安们what about you.",
            # [('<speak> hello', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
            "<speak> hello我爱北京天安们what about you.",
            # SSML XML tags are not handled well: per the recorded output
            # below, the tags themselves get split across zh/en segments.
            # [('<speak>', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土</', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干</', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干</', 'zh'), ('say-as>', 'en'), ('死的。</', 'zh'), ('speak>', 'en')]
            "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>",
        ]

        for text in texts:
            print(text)
            segs = frontend.split_by_lang(text)
            print(segs)
Loading…
Cancel
Save