commit
55122cfc86
@ -0,0 +1,47 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
train_output_path=$1
|
||||||
|
|
||||||
|
stage=0
|
||||||
|
stop_stage=0
|
||||||
|
|
||||||
|
# pwgan
|
||||||
|
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||||
|
python3 ${BIN_DIR}/../inference_streaming.py \
|
||||||
|
--inference_dir=${train_output_path}/inference_streaming \
|
||||||
|
--am=fastspeech2_csmsc \
|
||||||
|
--am_stat=dump/train/speech_stats.npy \
|
||||||
|
--voc=pwgan_csmsc \
|
||||||
|
--text=${BIN_DIR}/../sentences.txt \
|
||||||
|
--output_dir=${train_output_path}/pd_infer_out_streaming \
|
||||||
|
--phones_dict=dump/phone_id_map.txt \
|
||||||
|
--am_streaming=True
|
||||||
|
fi
|
||||||
|
|
||||||
|
# for more GAN Vocoders
|
||||||
|
# multi band melgan
|
||||||
|
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||||
|
python3 ${BIN_DIR}/../inference_streaming.py \
|
||||||
|
--inference_dir=${train_output_path}/inference_streaming \
|
||||||
|
--am=fastspeech2_csmsc \
|
||||||
|
--am_stat=dump/train/speech_stats.npy \
|
||||||
|
--voc=mb_melgan_csmsc \
|
||||||
|
--text=${BIN_DIR}/../sentences.txt \
|
||||||
|
--output_dir=${train_output_path}/pd_infer_out_streaming \
|
||||||
|
--phones_dict=dump/phone_id_map.txt \
|
||||||
|
--am_streaming=True
|
||||||
|
fi
|
||||||
|
|
||||||
|
# hifigan
|
||||||
|
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||||
|
python3 ${BIN_DIR}/../inference_streaming.py \
|
||||||
|
--inference_dir=${train_output_path}/inference_streaming \
|
||||||
|
--am=fastspeech2_csmsc \
|
||||||
|
--am_stat=dump/train/speech_stats.npy \
|
||||||
|
--voc=hifigan_csmsc \
|
||||||
|
--text=${BIN_DIR}/../sentences.txt \
|
||||||
|
--output_dir=${train_output_path}/pd_infer_out_streaming \
|
||||||
|
--phones_dict=dump/phone_id_map.txt \
|
||||||
|
--am_streaming=True
|
||||||
|
fi
|
||||||
|
|
@ -0,0 +1,19 @@
|
|||||||
|
train_output_path=$1
|
||||||
|
|
||||||
|
stage=0
|
||||||
|
stop_stage=0
|
||||||
|
|
||||||
|
# e2e, synthesize from text
|
||||||
|
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||||
|
python3 ${BIN_DIR}/../ort_predict_streaming.py \
|
||||||
|
--inference_dir=${train_output_path}/inference_onnx_streaming \
|
||||||
|
--am=fastspeech2_csmsc \
|
||||||
|
--am_stat=dump/train/speech_stats.npy \
|
||||||
|
--voc=hifigan_csmsc \
|
||||||
|
--output_dir=${train_output_path}/onnx_infer_out_streaming \
|
||||||
|
--text=${BIN_DIR}/../csmsc_test.txt \
|
||||||
|
--phones_dict=dump/phone_id_map.txt \
|
||||||
|
--device=cpu \
|
||||||
|
--cpu_threads=2 \
|
||||||
|
--am_streaming=True
|
||||||
|
fi
|
@ -0,0 +1,95 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
pretrained_models = {
|
||||||
|
# The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
|
||||||
|
# e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
|
||||||
|
# Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
|
||||||
|
# "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
|
||||||
|
"conformer_wenetspeech-zh-16k": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
|
||||||
|
'md5':
|
||||||
|
'76cb19ed857e6623856b7cd7ebbfeda4',
|
||||||
|
'cfg_path':
|
||||||
|
'model.yaml',
|
||||||
|
'ckpt_path':
|
||||||
|
'exp/conformer/checkpoints/wenetspeech',
|
||||||
|
},
|
||||||
|
"transformer_librispeech-en-16k": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
|
||||||
|
'md5':
|
||||||
|
'2c667da24922aad391eacafe37bc1660',
|
||||||
|
'cfg_path':
|
||||||
|
'model.yaml',
|
||||||
|
'ckpt_path':
|
||||||
|
'exp/transformer/checkpoints/avg_10',
|
||||||
|
},
|
||||||
|
"deepspeech2offline_aishell-zh-16k": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
|
||||||
|
'md5':
|
||||||
|
'932c3593d62fe5c741b59b31318aa314',
|
||||||
|
'cfg_path':
|
||||||
|
'model.yaml',
|
||||||
|
'ckpt_path':
|
||||||
|
'exp/deepspeech2/checkpoints/avg_1',
|
||||||
|
'lm_url':
|
||||||
|
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
|
||||||
|
'lm_md5':
|
||||||
|
'29e02312deb2e59b3c8686c7966d4fe3'
|
||||||
|
},
|
||||||
|
"deepspeech2online_aishell-zh-16k": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
|
||||||
|
'md5':
|
||||||
|
'23e16c69730a1cb5d735c98c83c21e16',
|
||||||
|
'cfg_path':
|
||||||
|
'model.yaml',
|
||||||
|
'ckpt_path':
|
||||||
|
'exp/deepspeech2_online/checkpoints/avg_1',
|
||||||
|
'lm_url':
|
||||||
|
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
|
||||||
|
'lm_md5':
|
||||||
|
'29e02312deb2e59b3c8686c7966d4fe3'
|
||||||
|
},
|
||||||
|
"deepspeech2offline_librispeech-en-16k": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
|
||||||
|
'md5':
|
||||||
|
'f5666c81ad015c8de03aac2bc92e5762',
|
||||||
|
'cfg_path':
|
||||||
|
'model.yaml',
|
||||||
|
'ckpt_path':
|
||||||
|
'exp/deepspeech2/checkpoints/avg_1',
|
||||||
|
'lm_url':
|
||||||
|
'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
|
||||||
|
'lm_md5':
|
||||||
|
'099a601759d467cd0a8523ff939819c5'
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
model_alias = {
|
||||||
|
"deepspeech2offline":
|
||||||
|
"paddlespeech.s2t.models.ds2:DeepSpeech2Model",
|
||||||
|
"deepspeech2online":
|
||||||
|
"paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
|
||||||
|
"conformer":
|
||||||
|
"paddlespeech.s2t.models.u2:U2Model",
|
||||||
|
"transformer":
|
||||||
|
"paddlespeech.s2t.models.u2:U2Model",
|
||||||
|
"wenetspeech":
|
||||||
|
"paddlespeech.s2t.models.u2:U2Model",
|
||||||
|
}
|
@ -0,0 +1,47 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
pretrained_models = {
|
||||||
|
# The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
|
||||||
|
# e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
|
||||||
|
# Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
|
||||||
|
# "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
|
||||||
|
"panns_cnn6-32k": {
|
||||||
|
'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
|
||||||
|
'md5': '4cf09194a95df024fd12f84712cf0f9c',
|
||||||
|
'cfg_path': 'panns.yaml',
|
||||||
|
'ckpt_path': 'cnn6.pdparams',
|
||||||
|
'label_file': 'audioset_labels.txt',
|
||||||
|
},
|
||||||
|
"panns_cnn10-32k": {
|
||||||
|
'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
|
||||||
|
'md5': 'cb8427b22176cc2116367d14847f5413',
|
||||||
|
'cfg_path': 'panns.yaml',
|
||||||
|
'ckpt_path': 'cnn10.pdparams',
|
||||||
|
'label_file': 'audioset_labels.txt',
|
||||||
|
},
|
||||||
|
"panns_cnn14-32k": {
|
||||||
|
'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
|
||||||
|
'md5': 'e3b9b5614a1595001161d0ab95edee97',
|
||||||
|
'cfg_path': 'panns.yaml',
|
||||||
|
'ckpt_path': 'cnn14.pdparams',
|
||||||
|
'label_file': 'audioset_labels.txt',
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
model_alias = {
|
||||||
|
"panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
|
||||||
|
"panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
|
||||||
|
"panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
|
||||||
|
}
|
@ -0,0 +1,35 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
pretrained_models = {
|
||||||
|
"fat_st_ted-en-zh": {
|
||||||
|
"url":
|
||||||
|
"https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
|
||||||
|
"md5":
|
||||||
|
"d62063f35a16d91210a71081bd2dd557",
|
||||||
|
"cfg_path":
|
||||||
|
"model.yaml",
|
||||||
|
"ckpt_path":
|
||||||
|
"exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}
|
||||||
|
|
||||||
|
kaldi_bins = {
|
||||||
|
"url":
|
||||||
|
"https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
|
||||||
|
"md5":
|
||||||
|
"c0682303b3f3393dbf6ed4c4e35a53eb",
|
||||||
|
}
|
@ -0,0 +1,54 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
pretrained_models = {
|
||||||
|
# The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
|
||||||
|
# e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
|
||||||
|
# Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
|
||||||
|
# "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
|
||||||
|
"ernie_linear_p7_wudao-punc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
|
||||||
|
'md5':
|
||||||
|
'12283e2ddde1797c5d1e57036b512746',
|
||||||
|
'cfg_path':
|
||||||
|
'ckpt/model_config.json',
|
||||||
|
'ckpt_path':
|
||||||
|
'ckpt/model_state.pdparams',
|
||||||
|
'vocab_file':
|
||||||
|
'punc_vocab.txt',
|
||||||
|
},
|
||||||
|
"ernie_linear_p3_wudao-punc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
|
||||||
|
'md5':
|
||||||
|
'448eb2fdf85b6a997e7e652e80c51dd2',
|
||||||
|
'cfg_path':
|
||||||
|
'ckpt/model_config.json',
|
||||||
|
'ckpt_path':
|
||||||
|
'ckpt/model_state.pdparams',
|
||||||
|
'vocab_file':
|
||||||
|
'punc_vocab.txt',
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
model_alias = {
|
||||||
|
"ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
|
||||||
|
"ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
|
||||||
|
}
|
||||||
|
|
||||||
|
tokenizer_alias = {
|
||||||
|
"ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
|
||||||
|
"ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
|
||||||
|
}
|
@ -0,0 +1,300 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
pretrained_models = {
|
||||||
|
# speedyspeech
|
||||||
|
"speedyspeech_csmsc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
|
||||||
|
'md5':
|
||||||
|
'6f6fa967b408454b6662c8c00c0027cb',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_30600.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
'phones_dict':
|
||||||
|
'phone_id_map.txt',
|
||||||
|
'tones_dict':
|
||||||
|
'tone_id_map.txt',
|
||||||
|
},
|
||||||
|
|
||||||
|
# fastspeech2
|
||||||
|
"fastspeech2_csmsc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
|
||||||
|
'md5':
|
||||||
|
'637d28a5e53aa60275612ba4393d5f22',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_76000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'speech_stats.npy',
|
||||||
|
'phones_dict':
|
||||||
|
'phone_id_map.txt',
|
||||||
|
},
|
||||||
|
"fastspeech2_ljspeech-en": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
|
||||||
|
'md5':
|
||||||
|
'ffed800c93deaf16ca9b3af89bfcd747',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_100000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'speech_stats.npy',
|
||||||
|
'phones_dict':
|
||||||
|
'phone_id_map.txt',
|
||||||
|
},
|
||||||
|
"fastspeech2_aishell3-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
|
||||||
|
'md5':
|
||||||
|
'f4dd4a5f49a4552b77981f544ab3392e',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_96400.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'speech_stats.npy',
|
||||||
|
'phones_dict':
|
||||||
|
'phone_id_map.txt',
|
||||||
|
'speaker_dict':
|
||||||
|
'speaker_id_map.txt',
|
||||||
|
},
|
||||||
|
"fastspeech2_vctk-en": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
|
||||||
|
'md5':
|
||||||
|
'743e5024ca1e17a88c5c271db9779ba4',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_66200.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'speech_stats.npy',
|
||||||
|
'phones_dict':
|
||||||
|
'phone_id_map.txt',
|
||||||
|
'speaker_dict':
|
||||||
|
'speaker_id_map.txt',
|
||||||
|
},
|
||||||
|
# tacotron2
|
||||||
|
"tacotron2_csmsc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
|
||||||
|
'md5':
|
||||||
|
'0df4b6f0bcbe0d73c5ed6df8867ab91a',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_30600.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'speech_stats.npy',
|
||||||
|
'phones_dict':
|
||||||
|
'phone_id_map.txt',
|
||||||
|
},
|
||||||
|
"tacotron2_ljspeech-en": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
|
||||||
|
'md5':
|
||||||
|
'6a5eddd81ae0e81d16959b97481135f3',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_60300.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'speech_stats.npy',
|
||||||
|
'phones_dict':
|
||||||
|
'phone_id_map.txt',
|
||||||
|
},
|
||||||
|
|
||||||
|
# pwgan
|
||||||
|
"pwgan_csmsc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
|
||||||
|
'md5':
|
||||||
|
'2e481633325b5bdf0a3823c714d2c117',
|
||||||
|
'config':
|
||||||
|
'pwg_default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'pwg_snapshot_iter_400000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'pwg_stats.npy',
|
||||||
|
},
|
||||||
|
"pwgan_ljspeech-en": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
|
||||||
|
'md5':
|
||||||
|
'53610ba9708fd3008ccaf8e99dacbaf0',
|
||||||
|
'config':
|
||||||
|
'pwg_default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'pwg_snapshot_iter_400000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'pwg_stats.npy',
|
||||||
|
},
|
||||||
|
"pwgan_aishell3-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
|
||||||
|
'md5':
|
||||||
|
'd7598fa41ad362d62f85ffc0f07e3d84',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_1000000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
},
|
||||||
|
"pwgan_vctk-en": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
|
||||||
|
'md5':
|
||||||
|
'b3da1defcde3e578be71eb284cb89f2c',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_1500000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
},
|
||||||
|
# mb_melgan
|
||||||
|
"mb_melgan_csmsc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
|
||||||
|
'md5':
|
||||||
|
'ee5f0604e20091f0d495b6ec4618b90d',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_1000000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
},
|
||||||
|
# style_melgan
|
||||||
|
"style_melgan_csmsc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
|
||||||
|
'md5':
|
||||||
|
'5de2d5348f396de0c966926b8c462755',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_1500000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
},
|
||||||
|
# hifigan
|
||||||
|
"hifigan_csmsc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
|
||||||
|
'md5':
|
||||||
|
'dd40a3d88dfcf64513fba2f0f961ada6',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_2500000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
},
|
||||||
|
"hifigan_ljspeech-en": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
|
||||||
|
'md5':
|
||||||
|
'70e9131695decbca06a65fe51ed38a72',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_2500000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
},
|
||||||
|
"hifigan_aishell3-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
|
||||||
|
'md5':
|
||||||
|
'3bb49bc75032ed12f79c00c8cc79a09a',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_2500000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
},
|
||||||
|
"hifigan_vctk-en": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
|
||||||
|
'md5':
|
||||||
|
'7da8f88359bca2457e705d924cf27bd4',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_2500000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
},
|
||||||
|
|
||||||
|
# wavernn
|
||||||
|
"wavernn_csmsc-zh": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
|
||||||
|
'md5':
|
||||||
|
'ee37b752f09bcba8f2af3b777ca38e13',
|
||||||
|
'config':
|
||||||
|
'default.yaml',
|
||||||
|
'ckpt':
|
||||||
|
'snapshot_iter_400000.pdz',
|
||||||
|
'speech_stats':
|
||||||
|
'feats_stats.npy',
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
model_alias = {
|
||||||
|
# acoustic model
|
||||||
|
"speedyspeech":
|
||||||
|
"paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
|
||||||
|
"speedyspeech_inference":
|
||||||
|
"paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
|
||||||
|
"fastspeech2":
|
||||||
|
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
|
||||||
|
"fastspeech2_inference":
|
||||||
|
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
|
||||||
|
"tacotron2":
|
||||||
|
"paddlespeech.t2s.models.tacotron2:Tacotron2",
|
||||||
|
"tacotron2_inference":
|
||||||
|
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
|
||||||
|
# voc
|
||||||
|
"pwgan":
|
||||||
|
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
|
||||||
|
"pwgan_inference":
|
||||||
|
"paddlespeech.t2s.models.parallel_wavegan:PWGInference",
|
||||||
|
"mb_melgan":
|
||||||
|
"paddlespeech.t2s.models.melgan:MelGANGenerator",
|
||||||
|
"mb_melgan_inference":
|
||||||
|
"paddlespeech.t2s.models.melgan:MelGANInference",
|
||||||
|
"style_melgan":
|
||||||
|
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
|
||||||
|
"style_melgan_inference":
|
||||||
|
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
|
||||||
|
"hifigan":
|
||||||
|
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
|
||||||
|
"hifigan_inference":
|
||||||
|
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
|
||||||
|
"wavernn":
|
||||||
|
"paddlespeech.t2s.models.wavernn:WaveRNN",
|
||||||
|
"wavernn_inference":
|
||||||
|
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
|
||||||
|
}
|
@ -0,0 +1,36 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
pretrained_models = {
|
||||||
|
# The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
|
||||||
|
# e.g. "ecapatdnn_voxceleb12-16k".
|
||||||
|
# Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
|
||||||
|
# "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
|
||||||
|
"ecapatdnn_voxceleb12-16k": {
|
||||||
|
'url':
|
||||||
|
'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
|
||||||
|
'md5':
|
||||||
|
'cc33023c54ab346cd318408f43fcaf95',
|
||||||
|
'cfg_path':
|
||||||
|
'conf/model.yaml', # the yaml config path
|
||||||
|
'ckpt_path':
|
||||||
|
'model/model', # the format is ${dir}/{model_name},
|
||||||
|
# so the first 'model' is dir, the second 'model' is the name
|
||||||
|
# this means we have a model stored as model/model.pdparams
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
model_alias = {
|
||||||
|
"ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
|
||||||
|
}
|
@ -0,0 +1,224 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import soundfile as sf
|
||||||
|
from timer import timer
|
||||||
|
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import denorm
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_am_sublayer_output
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_chunks
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_frontend
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_predictor
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_sentences
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_streaming_am_output
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_streaming_am_predictor
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_voc_output
|
||||||
|
from paddlespeech.t2s.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Paddle Infernce with acoustic model & vocoder.")
|
||||||
|
# acoustic model
|
||||||
|
parser.add_argument(
|
||||||
|
'--am',
|
||||||
|
type=str,
|
||||||
|
default='fastspeech2_csmsc',
|
||||||
|
choices=['fastspeech2_csmsc'],
|
||||||
|
help='Choose acoustic model type of tts task.')
|
||||||
|
parser.add_argument(
|
||||||
|
"--am_stat",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="mean and standard deviation used to normalize spectrogram when training acoustic model."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--tones_dict", type=str, default=None, help="tone vocabulary file.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--speaker_dict", type=str, default=None, help="speaker id map file.")
|
||||||
|
parser.add_argument(
|
||||||
|
'--spk_id',
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help='spk id for multi speaker acoustic model')
|
||||||
|
# voc
|
||||||
|
parser.add_argument(
|
||||||
|
'--voc',
|
||||||
|
type=str,
|
||||||
|
default='pwgan_csmsc',
|
||||||
|
choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'],
|
||||||
|
help='Choose vocoder type of tts task.')
|
||||||
|
# other
|
||||||
|
parser.add_argument(
|
||||||
|
'--lang',
|
||||||
|
type=str,
|
||||||
|
default='zh',
|
||||||
|
help='Choose model language. zh or en')
|
||||||
|
parser.add_argument(
|
||||||
|
"--text",
|
||||||
|
type=str,
|
||||||
|
help="text to synthesize, a 'utt_id sentence' pair per line")
|
||||||
|
parser.add_argument(
|
||||||
|
"--inference_dir", type=str, help="dir to save inference models")
|
||||||
|
parser.add_argument("--output_dir", type=str, help="output dir")
|
||||||
|
# inference
|
||||||
|
parser.add_argument(
|
||||||
|
"--device",
|
||||||
|
default="gpu",
|
||||||
|
choices=["gpu", "cpu"],
|
||||||
|
help="Device selected for inference.", )
|
||||||
|
# streaming related
|
||||||
|
parser.add_argument(
|
||||||
|
"--am_streaming",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="whether use streaming acoustic model")
|
||||||
|
parser.add_argument(
|
||||||
|
"--chunk_size", type=int, default=42, help="chunk size of am streaming")
|
||||||
|
parser.add_argument(
|
||||||
|
"--pad_size", type=int, default=12, help="pad size of am streaming")
|
||||||
|
|
||||||
|
args, _ = parser.parse_known_args()
|
||||||
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
# only inference for models trained with csmsc now
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
# frontend
|
||||||
|
frontend = get_frontend(args)
|
||||||
|
|
||||||
|
# am_predictor
|
||||||
|
am_encoder_infer_predictor, am_decoder_predictor, am_postnet_predictor = get_streaming_am_predictor(
|
||||||
|
args)
|
||||||
|
am_mu, am_std = np.load(args.am_stat)
|
||||||
|
# model: {model_name}_{dataset}
|
||||||
|
am_dataset = args.am[args.am.rindex('_') + 1:]
|
||||||
|
|
||||||
|
# voc_predictor
|
||||||
|
voc_predictor = get_predictor(args, filed='voc')
|
||||||
|
|
||||||
|
output_dir = Path(args.output_dir)
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
sentences = get_sentences(args)
|
||||||
|
|
||||||
|
merge_sentences = True
|
||||||
|
|
||||||
|
fs = 24000 if am_dataset != 'ljspeech' else 22050
|
||||||
|
# warmup
|
||||||
|
for utt_id, sentence in sentences[:3]:
|
||||||
|
with timer() as t:
|
||||||
|
normalized_mel = get_streaming_am_output(
|
||||||
|
args,
|
||||||
|
am_encoder_infer_predictor=am_encoder_infer_predictor,
|
||||||
|
am_decoder_predictor=am_decoder_predictor,
|
||||||
|
am_postnet_predictor=am_postnet_predictor,
|
||||||
|
frontend=frontend,
|
||||||
|
merge_sentences=merge_sentences,
|
||||||
|
input=sentence)
|
||||||
|
mel = denorm(normalized_mel, am_mu, am_std)
|
||||||
|
wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
|
||||||
|
speed = wav.size / t.elapse
|
||||||
|
rtf = fs / speed
|
||||||
|
print(
|
||||||
|
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
|
||||||
|
)
|
||||||
|
|
||||||
|
print("warm up done!")
|
||||||
|
|
||||||
|
N = 0
|
||||||
|
T = 0
|
||||||
|
chunk_size = args.chunk_size
|
||||||
|
pad_size = args.pad_size
|
||||||
|
get_tone_ids = False
|
||||||
|
for utt_id, sentence in sentences:
|
||||||
|
with timer() as t:
|
||||||
|
# frontend
|
||||||
|
if args.lang == 'zh':
|
||||||
|
input_ids = frontend.get_input_ids(
|
||||||
|
sentence,
|
||||||
|
merge_sentences=merge_sentences,
|
||||||
|
get_tone_ids=get_tone_ids)
|
||||||
|
phone_ids = input_ids["phone_ids"]
|
||||||
|
else:
|
||||||
|
print("lang should be 'zh' here!")
|
||||||
|
phones = phone_ids[0].numpy()
|
||||||
|
# acoustic model
|
||||||
|
orig_hs = get_am_sublayer_output(
|
||||||
|
am_encoder_infer_predictor, input=phones)
|
||||||
|
|
||||||
|
if args.am_streaming:
|
||||||
|
hss = get_chunks(orig_hs, chunk_size, pad_size)
|
||||||
|
chunk_num = len(hss)
|
||||||
|
mel_list = []
|
||||||
|
for i, hs in enumerate(hss):
|
||||||
|
am_decoder_output = get_am_sublayer_output(
|
||||||
|
am_decoder_predictor, input=hs)
|
||||||
|
am_postnet_output = get_am_sublayer_output(
|
||||||
|
am_postnet_predictor,
|
||||||
|
input=np.transpose(am_decoder_output, (0, 2, 1)))
|
||||||
|
am_output_data = am_decoder_output + np.transpose(
|
||||||
|
am_postnet_output, (0, 2, 1))
|
||||||
|
normalized_mel = am_output_data[0]
|
||||||
|
|
||||||
|
sub_mel = denorm(normalized_mel, am_mu, am_std)
|
||||||
|
# clip output part of pad
|
||||||
|
if i == 0:
|
||||||
|
sub_mel = sub_mel[:-pad_size]
|
||||||
|
elif i == chunk_num - 1:
|
||||||
|
# 最后一块的右侧一定没有 pad 够
|
||||||
|
sub_mel = sub_mel[pad_size:]
|
||||||
|
else:
|
||||||
|
# 倒数几块的右侧也可能没有 pad 够
|
||||||
|
sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
|
||||||
|
sub_mel.shape[0]]
|
||||||
|
mel_list.append(sub_mel)
|
||||||
|
mel = np.concatenate(mel_list, axis=0)
|
||||||
|
|
||||||
|
else:
|
||||||
|
am_decoder_output = get_am_sublayer_output(
|
||||||
|
am_decoder_predictor, input=orig_hs)
|
||||||
|
|
||||||
|
am_postnet_output = get_am_sublayer_output(
|
||||||
|
am_postnet_predictor,
|
||||||
|
input=np.transpose(am_decoder_output, (0, 2, 1)))
|
||||||
|
am_output_data = am_decoder_output + np.transpose(
|
||||||
|
am_postnet_output, (0, 2, 1))
|
||||||
|
normalized_mel = am_output_data[0]
|
||||||
|
mel = denorm(normalized_mel, am_mu, am_std)
|
||||||
|
# vocoder
|
||||||
|
wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
|
||||||
|
|
||||||
|
N += wav.size
|
||||||
|
T += t.elapse
|
||||||
|
speed = wav.size / t.elapse
|
||||||
|
rtf = fs / speed
|
||||||
|
|
||||||
|
sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)
|
||||||
|
print(
|
||||||
|
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"{utt_id} done!")
|
||||||
|
print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@ -0,0 +1,233 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import soundfile as sf
|
||||||
|
from timer import timer
|
||||||
|
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import denorm
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_chunks
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_frontend
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_sentences
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_sess
|
||||||
|
from paddlespeech.t2s.exps.syn_utils import get_streaming_am_sess
|
||||||
|
from paddlespeech.t2s.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
|
def ort_predict(args):
|
||||||
|
|
||||||
|
# frontend
|
||||||
|
frontend = get_frontend(args)
|
||||||
|
|
||||||
|
output_dir = Path(args.output_dir)
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
sentences = get_sentences(args)
|
||||||
|
|
||||||
|
am_name = args.am[:args.am.rindex('_')]
|
||||||
|
am_dataset = args.am[args.am.rindex('_') + 1:]
|
||||||
|
fs = 24000 if am_dataset != 'ljspeech' else 22050
|
||||||
|
|
||||||
|
# am
|
||||||
|
am_encoder_infer_sess, am_decoder_sess, am_postnet_sess = get_streaming_am_sess(
|
||||||
|
args)
|
||||||
|
am_mu, am_std = np.load(args.am_stat)
|
||||||
|
|
||||||
|
# vocoder
|
||||||
|
voc_sess = get_sess(args, filed='voc')
|
||||||
|
|
||||||
|
# frontend warmup
|
||||||
|
# Loading model cost 0.5+ seconds
|
||||||
|
if args.lang == 'zh':
|
||||||
|
frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True)
|
||||||
|
else:
|
||||||
|
print("lang should in be 'zh' here!")
|
||||||
|
|
||||||
|
# am warmup
|
||||||
|
for T in [27, 38, 54]:
|
||||||
|
phone_ids = np.random.randint(1, 266, size=(T, ))
|
||||||
|
am_encoder_infer_sess.run(None, input_feed={'text': phone_ids})
|
||||||
|
|
||||||
|
am_decoder_input = np.random.rand(1, T * 15, 384).astype('float32')
|
||||||
|
am_decoder_sess.run(None, input_feed={'xs': am_decoder_input})
|
||||||
|
|
||||||
|
am_postnet_input = np.random.rand(1, 80, T * 15).astype('float32')
|
||||||
|
am_postnet_sess.run(None, input_feed={'xs': am_postnet_input})
|
||||||
|
|
||||||
|
# voc warmup
|
||||||
|
for T in [227, 308, 544]:
|
||||||
|
data = np.random.rand(T, 80).astype("float32")
|
||||||
|
voc_sess.run(None, input_feed={"logmel": data})
|
||||||
|
print("warm up done!")
|
||||||
|
|
||||||
|
N = 0
|
||||||
|
T = 0
|
||||||
|
merge_sentences = True
|
||||||
|
get_tone_ids = False
|
||||||
|
chunk_size = args.chunk_size
|
||||||
|
pad_size = args.pad_size
|
||||||
|
|
||||||
|
for utt_id, sentence in sentences:
|
||||||
|
with timer() as t:
|
||||||
|
if args.lang == 'zh':
|
||||||
|
input_ids = frontend.get_input_ids(
|
||||||
|
sentence,
|
||||||
|
merge_sentences=merge_sentences,
|
||||||
|
get_tone_ids=get_tone_ids)
|
||||||
|
phone_ids = input_ids["phone_ids"]
|
||||||
|
else:
|
||||||
|
print("lang should in be 'zh' here!")
|
||||||
|
# merge_sentences=True here, so we only use the first item of phone_ids
|
||||||
|
phone_ids = phone_ids[0].numpy()
|
||||||
|
orig_hs = am_encoder_infer_sess.run(
|
||||||
|
None, input_feed={'text': phone_ids})
|
||||||
|
if args.am_streaming:
|
||||||
|
hss = get_chunks(orig_hs[0], chunk_size, pad_size)
|
||||||
|
chunk_num = len(hss)
|
||||||
|
mel_list = []
|
||||||
|
for i, hs in enumerate(hss):
|
||||||
|
am_decoder_output = am_decoder_sess.run(
|
||||||
|
None, input_feed={'xs': hs})
|
||||||
|
am_postnet_output = am_postnet_sess.run(
|
||||||
|
None,
|
||||||
|
input_feed={
|
||||||
|
'xs': np.transpose(am_decoder_output[0], (0, 2, 1))
|
||||||
|
})
|
||||||
|
am_output_data = am_decoder_output + np.transpose(
|
||||||
|
am_postnet_output[0], (0, 2, 1))
|
||||||
|
normalized_mel = am_output_data[0][0]
|
||||||
|
|
||||||
|
sub_mel = denorm(normalized_mel, am_mu, am_std)
|
||||||
|
# clip output part of pad
|
||||||
|
if i == 0:
|
||||||
|
sub_mel = sub_mel[:-pad_size]
|
||||||
|
elif i == chunk_num - 1:
|
||||||
|
# 最后一块的右侧一定没有 pad 够
|
||||||
|
sub_mel = sub_mel[pad_size:]
|
||||||
|
else:
|
||||||
|
# 倒数几块的右侧也可能没有 pad 够
|
||||||
|
sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
|
||||||
|
sub_mel.shape[0]]
|
||||||
|
mel_list.append(sub_mel)
|
||||||
|
mel = np.concatenate(mel_list, axis=0)
|
||||||
|
else:
|
||||||
|
am_decoder_output = am_decoder_sess.run(
|
||||||
|
None, input_feed={'xs': orig_hs[0]})
|
||||||
|
am_postnet_output = am_postnet_sess.run(
|
||||||
|
None,
|
||||||
|
input_feed={
|
||||||
|
'xs': np.transpose(am_decoder_output[0], (0, 2, 1))
|
||||||
|
})
|
||||||
|
am_output_data = am_decoder_output + np.transpose(
|
||||||
|
am_postnet_output[0], (0, 2, 1))
|
||||||
|
normalized_mel = am_output_data[0]
|
||||||
|
mel = denorm(normalized_mel, am_mu, am_std)
|
||||||
|
mel = mel[0]
|
||||||
|
# vocoder
|
||||||
|
|
||||||
|
wav = voc_sess.run(output_names=None, input_feed={'logmel': mel})
|
||||||
|
|
||||||
|
N += len(wav[0])
|
||||||
|
T += t.elapse
|
||||||
|
speed = len(wav[0]) / t.elapse
|
||||||
|
rtf = fs / speed
|
||||||
|
sf.write(
|
||||||
|
str(output_dir / (utt_id + ".wav")),
|
||||||
|
np.array(wav)[0],
|
||||||
|
samplerate=fs)
|
||||||
|
print(
|
||||||
|
f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
|
||||||
|
)
|
||||||
|
print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="Infernce with onnxruntime.")
|
||||||
|
# acoustic model
|
||||||
|
parser.add_argument(
|
||||||
|
'--am',
|
||||||
|
type=str,
|
||||||
|
default='fastspeech2_csmsc',
|
||||||
|
choices=['fastspeech2_csmsc'],
|
||||||
|
help='Choose acoustic model type of tts task.')
|
||||||
|
parser.add_argument(
|
||||||
|
"--am_stat",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="mean and standard deviation used to normalize spectrogram when training acoustic model."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--tones_dict", type=str, default=None, help="tone vocabulary file.")
|
||||||
|
|
||||||
|
# voc
|
||||||
|
parser.add_argument(
|
||||||
|
'--voc',
|
||||||
|
type=str,
|
||||||
|
default='hifigan_csmsc',
|
||||||
|
choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
|
||||||
|
help='Choose vocoder type of tts task.')
|
||||||
|
# other
|
||||||
|
parser.add_argument(
|
||||||
|
"--inference_dir", type=str, help="dir to save inference models")
|
||||||
|
parser.add_argument(
|
||||||
|
"--text",
|
||||||
|
type=str,
|
||||||
|
help="text to synthesize, a 'utt_id sentence' pair per line")
|
||||||
|
parser.add_argument("--output_dir", type=str, help="output dir")
|
||||||
|
parser.add_argument(
|
||||||
|
'--lang',
|
||||||
|
type=str,
|
||||||
|
default='zh',
|
||||||
|
help='Choose model language. zh or en')
|
||||||
|
|
||||||
|
# inference
|
||||||
|
parser.add_argument(
|
||||||
|
"--use_trt",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="Whether to use inference engin TensorRT.", )
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--device",
|
||||||
|
default="gpu",
|
||||||
|
choices=["gpu", "cpu"],
|
||||||
|
help="Device selected for inference.", )
|
||||||
|
parser.add_argument('--cpu_threads', type=int, default=1)
|
||||||
|
|
||||||
|
# streaming related
|
||||||
|
parser.add_argument(
|
||||||
|
"--am_streaming",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="whether use streaming acoustic model")
|
||||||
|
parser.add_argument(
|
||||||
|
"--chunk_size", type=int, default=42, help="chunk size of am streaming")
|
||||||
|
parser.add_argument(
|
||||||
|
"--pad_size", type=int, default=12, help="pad size of am streaming")
|
||||||
|
|
||||||
|
args, _ = parser.parse_known_args()
|
||||||
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
ort_predict(args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@ -0,0 +1,575 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle and SpeechBrain Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""A popular speaker recognition/diarization model (LDA and PLDA).
|
||||||
|
|
||||||
|
Relevant Papers
|
||||||
|
- This implementation of PLDA is based on the following papers.
|
||||||
|
|
||||||
|
- PLDA model Training
|
||||||
|
* Ye Jiang et. al, "PLDA Modeling in I-Vector and Supervector Space for Speaker Verification," in Interspeech, 2012.
|
||||||
|
* Patrick Kenny et. al, "PLDA for speaker verification with utterances of arbitrary duration," in ICASSP, 2013.
|
||||||
|
|
||||||
|
- PLDA scoring (fast scoring)
|
||||||
|
* Daniel Garcia-Romero et. al, “Analysis of i-vector length normalization in speaker recognition systems,” in Interspeech, 2011.
|
||||||
|
* Weiwei-LIN et. al, "Fast Scoring for PLDA with Uncertainty Propagation," in Odyssey, 2016.
|
||||||
|
* Kong Aik Lee et. al, "Multi-session PLDA Scoring of I-vector for Partially Open-Set Speaker Detection," in Interspeech 2013.
|
||||||
|
|
||||||
|
Credits
|
||||||
|
This code is adapted from: https://git-lium.univ-lemans.fr/Larcher/sidekit
|
||||||
|
"""
|
||||||
|
import copy
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
from scipy import linalg
|
||||||
|
|
||||||
|
from paddlespeech.vector.cluster.diarization import EmbeddingMeta
|
||||||
|
|
||||||
|
|
||||||
|
def ismember(list1, list2):
|
||||||
|
c = [item in list2 for item in list1]
|
||||||
|
return c
|
||||||
|
|
||||||
|
|
||||||
|
class Ndx:
|
||||||
|
"""
|
||||||
|
A class that encodes trial index information. It has a list of
|
||||||
|
model names and a list of test segment names and a matrix
|
||||||
|
indicating which combinations of model and test segment are
|
||||||
|
trials of interest.
|
||||||
|
|
||||||
|
Arguments
|
||||||
|
---------
|
||||||
|
modelset : list
|
||||||
|
List of unique models in a ndarray.
|
||||||
|
segset : list
|
||||||
|
List of unique test segments in a ndarray.
|
||||||
|
trialmask : 2D ndarray of bool.
|
||||||
|
Rows correspond to the models and columns to the test segments. True, if the trial is of interest.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
ndx_file_name="",
|
||||||
|
models=numpy.array([]),
|
||||||
|
testsegs=numpy.array([])):
|
||||||
|
"""
|
||||||
|
Initialize a Ndx object by loading information from a file.
|
||||||
|
|
||||||
|
Arguments
|
||||||
|
---------
|
||||||
|
ndx_file_name : str
|
||||||
|
Name of the file to load.
|
||||||
|
"""
|
||||||
|
self.modelset = numpy.empty(0, dtype="|O")
|
||||||
|
self.segset = numpy.empty(0, dtype="|O")
|
||||||
|
self.trialmask = numpy.array([], dtype="bool")
|
||||||
|
|
||||||
|
if ndx_file_name == "":
|
||||||
|
# This is needed to make sizes same
|
||||||
|
d = models.shape[0] - testsegs.shape[0]
|
||||||
|
if d != 0:
|
||||||
|
if d > 0:
|
||||||
|
last = str(testsegs[-1])
|
||||||
|
pad = numpy.array([last] * d)
|
||||||
|
testsegs = numpy.hstack((testsegs, pad))
|
||||||
|
# pad = testsegs[-d:]
|
||||||
|
# testsegs = numpy.concatenate((testsegs, pad), axis=1)
|
||||||
|
else:
|
||||||
|
d = abs(d)
|
||||||
|
last = str(models[-1])
|
||||||
|
pad = numpy.array([last] * d)
|
||||||
|
models = numpy.hstack((models, pad))
|
||||||
|
# pad = models[-d:]
|
||||||
|
# models = numpy.concatenate((models, pad), axis=1)
|
||||||
|
|
||||||
|
modelset = numpy.unique(models)
|
||||||
|
segset = numpy.unique(testsegs)
|
||||||
|
|
||||||
|
trialmask = numpy.zeros(
|
||||||
|
(modelset.shape[0], segset.shape[0]), dtype="bool")
|
||||||
|
for m in range(modelset.shape[0]):
|
||||||
|
segs = testsegs[numpy.array(ismember(models, modelset[m]))]
|
||||||
|
trialmask[m, ] = ismember(segset, segs) # noqa E231
|
||||||
|
|
||||||
|
self.modelset = modelset
|
||||||
|
self.segset = segset
|
||||||
|
self.trialmask = trialmask
|
||||||
|
assert self.validate(), "Wrong Ndx format"
|
||||||
|
|
||||||
|
else:
|
||||||
|
ndx = Ndx.read(ndx_file_name)
|
||||||
|
self.modelset = ndx.modelset
|
||||||
|
self.segset = ndx.segset
|
||||||
|
self.trialmask = ndx.trialmask
|
||||||
|
|
||||||
|
def save_ndx_object(self, output_file_name):
|
||||||
|
with open(output_file_name, "wb") as output:
|
||||||
|
pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
def filter(self, modlist, seglist, keep):
|
||||||
|
"""
|
||||||
|
Removes some of the information in an Ndx. Useful for creating a
|
||||||
|
gender specific Ndx from a pooled gender Ndx. Depending on the
|
||||||
|
value of \'keep\', the two input lists indicate the strings to
|
||||||
|
retain or the strings to discard.
|
||||||
|
|
||||||
|
Arguments
|
||||||
|
---------
|
||||||
|
modlist : array
|
||||||
|
A cell array of strings which will be compared with the modelset of 'inndx'.
|
||||||
|
seglist : array
|
||||||
|
A cell array of strings which will be compared with the segset of 'inndx'.
|
||||||
|
keep : bool
|
||||||
|
Indicating whether modlist and seglist are the models to keep or discard.
|
||||||
|
"""
|
||||||
|
if keep:
|
||||||
|
keepmods = modlist
|
||||||
|
keepsegs = seglist
|
||||||
|
else:
|
||||||
|
keepmods = diff(self.modelset, modlist)
|
||||||
|
keepsegs = diff(self.segset, seglist)
|
||||||
|
|
||||||
|
keepmodidx = numpy.array(ismember(self.modelset, keepmods))
|
||||||
|
keepsegidx = numpy.array(ismember(self.segset, keepsegs))
|
||||||
|
|
||||||
|
outndx = Ndx()
|
||||||
|
outndx.modelset = self.modelset[keepmodidx]
|
||||||
|
outndx.segset = self.segset[keepsegidx]
|
||||||
|
tmp = self.trialmask[numpy.array(keepmodidx), :]
|
||||||
|
outndx.trialmask = tmp[:, numpy.array(keepsegidx)]
|
||||||
|
|
||||||
|
assert outndx.validate, "Wrong Ndx format"
|
||||||
|
|
||||||
|
if self.modelset.shape[0] > outndx.modelset.shape[0]:
|
||||||
|
print(
|
||||||
|
"Number of models reduced from %d to %d" %
|
||||||
|
self.modelset.shape[0],
|
||||||
|
outndx.modelset.shape[0], )
|
||||||
|
if self.segset.shape[0] > outndx.segset.shape[0]:
|
||||||
|
print(
|
||||||
|
"Number of test segments reduced from %d to %d",
|
||||||
|
self.segset.shape[0],
|
||||||
|
outndx.segset.shape[0], )
|
||||||
|
return outndx
|
||||||
|
|
||||||
|
def validate(self):
|
||||||
|
"""
|
||||||
|
Checks that an object of type Ndx obeys certain rules that
|
||||||
|
must always be true. Returns a boolean value indicating whether the object is valid
|
||||||
|
"""
|
||||||
|
ok = isinstance(self.modelset, numpy.ndarray)
|
||||||
|
ok &= isinstance(self.segset, numpy.ndarray)
|
||||||
|
ok &= isinstance(self.trialmask, numpy.ndarray)
|
||||||
|
|
||||||
|
ok &= self.modelset.ndim == 1
|
||||||
|
ok &= self.segset.ndim == 1
|
||||||
|
ok &= self.trialmask.ndim == 2
|
||||||
|
|
||||||
|
ok &= self.trialmask.shape == (self.modelset.shape[0],
|
||||||
|
self.segset.shape[0], )
|
||||||
|
return ok
|
||||||
|
|
||||||
|
|
||||||
|
class Scores:
|
||||||
|
"""
|
||||||
|
A class for storing scores for trials. The modelset and segset
|
||||||
|
fields are lists of model and test segment names respectively.
|
||||||
|
The element i,j of scoremat and scoremask corresponds to the
|
||||||
|
trial involving model i and test segment j.
|
||||||
|
|
||||||
|
Arguments
|
||||||
|
---------
|
||||||
|
modelset : list
|
||||||
|
List of unique models in a ndarray.
|
||||||
|
segset : list
|
||||||
|
List of unique test segments in a ndarray.
|
||||||
|
scoremask : 2D ndarray of bool
|
||||||
|
Indicates the trials of interest, i.e.,
|
||||||
|
the entry i,j in scoremat should be ignored if scoremask[i,j] is False.
|
||||||
|
scoremat : 2D ndarray
|
||||||
|
Scores matrix.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, scores_file_name=""):
|
||||||
|
"""
|
||||||
|
Initialize a Scores object by loading information from a file HDF5 format.
|
||||||
|
|
||||||
|
Arguments
|
||||||
|
---------
|
||||||
|
scores_file_name : str
|
||||||
|
Name of the file to load.
|
||||||
|
"""
|
||||||
|
self.modelset = numpy.empty(0, dtype="|O")
|
||||||
|
self.segset = numpy.empty(0, dtype="|O")
|
||||||
|
self.scoremask = numpy.array([], dtype="bool")
|
||||||
|
self.scoremat = numpy.array([])
|
||||||
|
|
||||||
|
if scores_file_name == "":
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
tmp = Scores.read(scores_file_name)
|
||||||
|
self.modelset = tmp.modelset
|
||||||
|
self.segset = tmp.segset
|
||||||
|
self.scoremask = tmp.scoremask
|
||||||
|
self.scoremat = tmp.scoremat
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
ch = "modelset:\n"
|
||||||
|
ch += self.modelset + "\n"
|
||||||
|
ch += "segset:\n"
|
||||||
|
ch += self.segset + "\n"
|
||||||
|
ch += "scoremask:\n"
|
||||||
|
ch += self.scoremask.__repr__() + "\n"
|
||||||
|
ch += "scoremat:\n"
|
||||||
|
ch += self.scoremat.__repr__() + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
def fa_model_loop(
|
||||||
|
batch_start,
|
||||||
|
mini_batch_indices,
|
||||||
|
factor_analyser,
|
||||||
|
stat0,
|
||||||
|
stats,
|
||||||
|
e_h,
|
||||||
|
e_hh, ):
|
||||||
|
"""
|
||||||
|
A function for PLDA estimation.
|
||||||
|
|
||||||
|
Arguments
|
||||||
|
---------
|
||||||
|
batch_start : int
|
||||||
|
Index to start at in the list.
|
||||||
|
mini_batch_indices : list
|
||||||
|
Indices of the elements in the list (should start at zero).
|
||||||
|
factor_analyser : instance of PLDA class
|
||||||
|
PLDA class object.
|
||||||
|
stat0 : tensor
|
||||||
|
Matrix of zero-order statistics.
|
||||||
|
stats: tensor
|
||||||
|
Matrix of first-order statistics.
|
||||||
|
e_h : tensor
|
||||||
|
An accumulator matrix.
|
||||||
|
e_hh: tensor
|
||||||
|
An accumulator matrix.
|
||||||
|
"""
|
||||||
|
rank = factor_analyser.F.shape[1]
|
||||||
|
if factor_analyser.Sigma.ndim == 2:
|
||||||
|
A = factor_analyser.F.T.dot(factor_analyser.F)
|
||||||
|
inv_lambda_unique = dict()
|
||||||
|
for sess in numpy.unique(stat0[:, 0]):
|
||||||
|
inv_lambda_unique[sess] = linalg.inv(sess * A + numpy.eye(A.shape[
|
||||||
|
0]))
|
||||||
|
|
||||||
|
tmp = numpy.zeros(
|
||||||
|
(factor_analyser.F.shape[1], factor_analyser.F.shape[1]),
|
||||||
|
dtype=numpy.float64, )
|
||||||
|
|
||||||
|
for idx in mini_batch_indices:
|
||||||
|
if factor_analyser.Sigma.ndim == 1:
|
||||||
|
inv_lambda = linalg.inv(
|
||||||
|
numpy.eye(rank) + (factor_analyser.F.T * stat0[
|
||||||
|
idx + batch_start, :]).dot(factor_analyser.F))
|
||||||
|
else:
|
||||||
|
inv_lambda = inv_lambda_unique[stat0[idx + batch_start, 0]]
|
||||||
|
|
||||||
|
aux = factor_analyser.F.T.dot(stats[idx + batch_start, :])
|
||||||
|
numpy.dot(aux, inv_lambda, out=e_h[idx])
|
||||||
|
e_hh[idx] = inv_lambda + numpy.outer(e_h[idx], e_h[idx], tmp)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_missing_model(enroll, test, ndx):
|
||||||
|
# Remove missing models and test segments
|
||||||
|
clean_ndx = ndx.filter(enroll.modelset, test.segset, True)
|
||||||
|
|
||||||
|
# Align EmbeddingMeta to match the clean_ndx
|
||||||
|
enroll.align_models(clean_ndx.modelset)
|
||||||
|
test.align_segments(clean_ndx.segset)
|
||||||
|
|
||||||
|
return clean_ndx
|
||||||
|
|
||||||
|
|
||||||
|
class PLDA:
|
||||||
|
"""
|
||||||
|
A class to train PLDA model from embeddings.
|
||||||
|
|
||||||
|
The input is in paddlespeech.vector.cluster.diarization.EmbeddingMeta format.
|
||||||
|
Trains a simplified PLDA model no within-class covariance matrix but full residual covariance matrix.
|
||||||
|
|
||||||
|
Arguments
|
||||||
|
---------
|
||||||
|
mean : tensor
|
||||||
|
Mean of the vectors.
|
||||||
|
F : tensor
|
||||||
|
Eigenvoice matrix.
|
||||||
|
Sigma : tensor
|
||||||
|
Residual matrix.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
mean=None,
|
||||||
|
F=None,
|
||||||
|
Sigma=None,
|
||||||
|
rank_f=100,
|
||||||
|
nb_iter=10,
|
||||||
|
scaling_factor=1.0, ):
|
||||||
|
self.mean = None
|
||||||
|
self.F = None
|
||||||
|
self.Sigma = None
|
||||||
|
self.rank_f = rank_f
|
||||||
|
self.nb_iter = nb_iter
|
||||||
|
self.scaling_factor = scaling_factor
|
||||||
|
|
||||||
|
if mean is not None:
|
||||||
|
self.mean = mean
|
||||||
|
if F is not None:
|
||||||
|
self.F = F
|
||||||
|
if Sigma is not None:
|
||||||
|
self.Sigma = Sigma
|
||||||
|
|
||||||
|
def plda(
|
||||||
|
self,
|
||||||
|
emb_meta=None,
|
||||||
|
output_file_name=None, ):
|
||||||
|
"""
|
||||||
|
Trains PLDA model with no within class covariance matrix but full residual covariance matrix.
|
||||||
|
|
||||||
|
Arguments
|
||||||
|
---------
|
||||||
|
emb_meta : paddlespeech.vector.cluster.diarization.EmbeddingMeta
|
||||||
|
Contains vectors and meta-information to perform PLDA
|
||||||
|
rank_f : int
|
||||||
|
Rank of the between-class covariance matrix.
|
||||||
|
nb_iter : int
|
||||||
|
Number of iterations to run.
|
||||||
|
scaling_factor : float
|
||||||
|
Scaling factor to downscale statistics (value between 0 and 1).
|
||||||
|
output_file_name : str
|
||||||
|
Name of the output file where to store PLDA model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Dimension of the vector (x-vectors stored in stats)
|
||||||
|
vect_size = emb_meta.stats.shape[1]
|
||||||
|
|
||||||
|
# Initialize mean and residual covariance from the training data
|
||||||
|
self.mean = emb_meta.get_mean_stats()
|
||||||
|
self.Sigma = emb_meta.get_total_covariance_stats()
|
||||||
|
|
||||||
|
# Sum stat0 and stat1 for each speaker model
|
||||||
|
model_shifted_stat, session_per_model = emb_meta.sum_stat_per_model()
|
||||||
|
|
||||||
|
# Number of speakers (classes) in training set
|
||||||
|
class_nb = model_shifted_stat.modelset.shape[0]
|
||||||
|
|
||||||
|
# Multiply statistics by scaling_factor
|
||||||
|
model_shifted_stat.stat0 *= self.scaling_factor
|
||||||
|
model_shifted_stat.stats *= self.scaling_factor
|
||||||
|
session_per_model *= self.scaling_factor
|
||||||
|
|
||||||
|
# Covariance for stats
|
||||||
|
sigma_obs = emb_meta.get_total_covariance_stats()
|
||||||
|
evals, evecs = linalg.eigh(sigma_obs)
|
||||||
|
|
||||||
|
# Initial F (eigen voice matrix) from rank
|
||||||
|
idx = numpy.argsort(evals)[::-1]
|
||||||
|
evecs = evecs.real[:, idx[:self.rank_f]]
|
||||||
|
self.F = evecs[:, :self.rank_f]
|
||||||
|
|
||||||
|
# Estimate PLDA model by iterating the EM algorithm
|
||||||
|
for it in range(self.nb_iter):
|
||||||
|
|
||||||
|
# E-step
|
||||||
|
|
||||||
|
# Copy stats as they will be whitened with a different Sigma for each iteration
|
||||||
|
local_stat = copy.deepcopy(model_shifted_stat)
|
||||||
|
|
||||||
|
# Whiten statistics (with the new mean and Sigma)
|
||||||
|
local_stat.whiten_stats(self.mean, self.Sigma)
|
||||||
|
|
||||||
|
# Whiten the EigenVoice matrix
|
||||||
|
eigen_values, eigen_vectors = linalg.eigh(self.Sigma)
|
||||||
|
ind = eigen_values.real.argsort()[::-1]
|
||||||
|
eigen_values = eigen_values.real[ind]
|
||||||
|
eigen_vectors = eigen_vectors.real[:, ind]
|
||||||
|
sqr_inv_eval_sigma = 1 / numpy.sqrt(eigen_values.real)
|
||||||
|
sqr_inv_sigma = numpy.dot(eigen_vectors,
|
||||||
|
numpy.diag(sqr_inv_eval_sigma))
|
||||||
|
self.F = sqr_inv_sigma.T.dot(self.F)
|
||||||
|
|
||||||
|
# Replicate self.stat0
|
||||||
|
index_map = numpy.zeros(vect_size, dtype=int)
|
||||||
|
_stat0 = local_stat.stat0[:, index_map]
|
||||||
|
|
||||||
|
e_h = numpy.zeros((class_nb, self.rank_f))
|
||||||
|
e_hh = numpy.zeros((class_nb, self.rank_f, self.rank_f))
|
||||||
|
|
||||||
|
# loop on model id's
|
||||||
|
fa_model_loop(
|
||||||
|
batch_start=0,
|
||||||
|
mini_batch_indices=numpy.arange(class_nb),
|
||||||
|
factor_analyser=self,
|
||||||
|
stat0=_stat0,
|
||||||
|
stats=local_stat.stats,
|
||||||
|
e_h=e_h,
|
||||||
|
e_hh=e_hh, )
|
||||||
|
|
||||||
|
# Accumulate for minimum divergence step
|
||||||
|
_R = numpy.sum(e_hh, axis=0) / session_per_model.shape[0]
|
||||||
|
|
||||||
|
_C = e_h.T.dot(local_stat.stats).dot(linalg.inv(sqr_inv_sigma))
|
||||||
|
_A = numpy.einsum("ijk,i->jk", e_hh, local_stat.stat0.squeeze())
|
||||||
|
|
||||||
|
# M-step
|
||||||
|
self.F = linalg.solve(_A, _C).T
|
||||||
|
|
||||||
|
# Update the residual covariance
|
||||||
|
self.Sigma = sigma_obs - self.F.dot(_C) / session_per_model.sum()
|
||||||
|
|
||||||
|
# Minimum Divergence step
|
||||||
|
self.F = self.F.dot(linalg.cholesky(_R))
|
||||||
|
|
||||||
|
def scoring(
        self,
        enroll,
        test,
        ndx,
        test_uncertainty=None,
        Vtrans=None,
        p_known=0.0,
        scaling_factor=1.0,
        check_missing=True, ):
    """
    Compute the PLDA scores between two sets of vectors. The list of
    trials to perform is given in an Ndx object. PLDA matrices have to be
    pre-computed. i-vectors/x-vectors are supposed to be whitened before.

    Arguments
    ---------
    enroll : paddlespeech.vector.cluster.diarization.EmbeddingMeta
        A EmbeddingMeta in which stats are xvectors.
    test : paddlespeech.vector.cluster.diarization.EmbeddingMeta
        A EmbeddingMeta in which stats are xvectors.
    ndx : paddlespeech.vector.cluster.plda.Ndx
        An Ndx object defining the list of trials to perform.
    test_uncertainty
        Unused by this implementation; kept for API compatibility.
    Vtrans
        Unused by this implementation; kept for API compatibility.
    p_known : float
        Probability of having a known speaker for open-set
        identification case (=1 for the verification task and =0 for the
        closed-set case).
    scaling_factor : float
        Factor applied to the inverse covariance and to the final
        score matrix.
    check_missing : bool
        If True, check that all models and segments exist.
    """
    # Work on deep copies so the caller's stat objects are not mutated
    # by the centering below.
    enroll_ctr = copy.deepcopy(enroll)
    test_ctr = copy.deepcopy(test)

    # Remove missing models and test segments
    if check_missing:
        clean_ndx = _check_missing_model(enroll_ctr, test_ctr, ndx)
    else:
        clean_ndx = ndx

    # Center the i-vectors around the PLDA mean
    enroll_ctr.center_stats(self.mean)
    test_ctr.center_stats(self.mean)

    # Compute constant component of the PLDA distribution
    invSigma = linalg.inv(self.Sigma)
    I_spk = numpy.eye(self.F.shape[1], dtype="float")

    # K1: posterior precision inverse for one observation,
    # K2: for two observations of the same identity.
    K = self.F.T.dot(invSigma * scaling_factor).dot(self.F)
    K1 = linalg.inv(K + I_spk)
    K2 = linalg.inv(2 * K + I_spk)

    # Compute the Gaussian distribution constant (log-determinant terms)
    alpha1 = numpy.linalg.slogdet(K1)[1]
    alpha2 = numpy.linalg.slogdet(K2)[1]
    plda_cst = alpha2 / 2.0 - alpha1

    # Compute intermediate matrices
    Sigma_ac = numpy.dot(self.F, self.F.T)  # F F^T
    Sigma_tot = Sigma_ac + self.Sigma
    Sigma_tot_inv = linalg.inv(Sigma_tot)

    Tmp = linalg.inv(Sigma_tot - Sigma_ac.dot(Sigma_tot_inv).dot(Sigma_ac))
    Phi = Sigma_tot_inv - Tmp  # quadratic term applied to each side
    Psi = Sigma_tot_inv.dot(Sigma_ac).dot(Tmp)  # enroll-vs-test cross term

    # Compute the different parts of PLDA score
    model_part = 0.5 * numpy.einsum("ij, ji->i",
                                    enroll_ctr.stats.dot(Phi),
                                    enroll_ctr.stats.T)
    seg_part = 0.5 * numpy.einsum("ij, ji->i",
                                  test_ctr.stats.dot(Phi), test_ctr.stats.T)

    # Compute verification scores
    # NOTE(review): Scores is not defined in this module (hence noqa
    # F821) — it must be provided at runtime; confirm the import exists.
    score = Scores()  # noqa F821
    score.modelset = clean_ndx.modelset
    score.segset = clean_ndx.segset
    score.scoremask = clean_ndx.trialmask

    # scoremat[i, j] = score of enrollment model i vs test segment j
    score.scoremat = model_part[:, numpy.newaxis] + seg_part + plda_cst
    score.scoremat += enroll_ctr.stats.dot(Psi).dot(test_ctr.stats.T)
    score.scoremat *= scaling_factor

    # Case of open-set identification, we compute the log-likelihood
    # by taking into account the probability of having a known impostor
    # or an out-of set class
    if p_known != 0:
        N = score.scoremat.shape[0]
        open_set_scores = numpy.empty(score.scoremat.shape)
        tmp = numpy.exp(score.scoremat)
        for ii in range(N):
            # open-set term: average the other models' likelihoods,
            # mixed with the out-of-set prior (1 - p_known)
            open_set_scores[ii, :] = score.scoremat[ii, :] - numpy.log(
                p_known * tmp[~(numpy.arange(N) == ii)].sum(axis=0) / (
                    N - 1) + (1 - p_known))
        score.scoremat = open_set_scores

    return score
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Smoke test: train a PLDA model on random x-vectors, then score
    # random enrollment vectors against random test vectors.
    import random

    # 10-dim vectors, 100 training utterances, labels drawn from ~10 speakers
    dim, N, n_spkrs = 10, 100, 10
    train_xv = numpy.random.rand(N, dim)
    md = ['md' + str(random.randrange(1, n_spkrs, 1)) for i in range(N)]  # spk
    modelset = numpy.array(md, dtype="|O")
    sg = ['sg' + str(i) for i in range(N)]  # utt
    segset = numpy.array(sg, dtype="|O")
    # Unit zero-order statistic per utterance (not referenced below)
    stat0 = numpy.array([[1.0]] * N)
    xvectors_stat = EmbeddingMeta(
        modelset=modelset, segset=segset, stats=train_xv)
    # Training PLDA model: M ~ (mean, F, Sigma)
    plda = PLDA(rank_f=5)
    plda.plda(xvectors_stat)
    print(plda.mean.shape)  #(10,)
    print(plda.F.shape)  #(10, 5)
    print(plda.Sigma.shape)  #(10, 10)
    # Enrollment (20 utts),
    en_N = 20
    en_xv = numpy.random.rand(en_N, dim)
    en_sgs = ['en' + str(i) for i in range(en_N)]
    en_sets = numpy.array(en_sgs, dtype="|O")
    en_stat = EmbeddingMeta(modelset=en_sets, segset=en_sets, stats=en_xv)
    # Test (30 utts)
    te_N = 30
    te_xv = numpy.random.rand(te_N, dim)
    te_sgs = ['te' + str(i) for i in range(te_N)]
    te_sets = numpy.array(te_sgs, dtype="|O")
    te_stat = EmbeddingMeta(modelset=te_sets, segset=te_sets, stats=te_xv)
    ndx = Ndx(models=en_sets, testsegs=te_sets)  # trials
    # PLDA Scoring: one score per (enroll model, test segment) trial
    scores_plda = plda.scoring(en_stat, te_stat, ndx)
    print(scores_plda.scoremat.shape)  #(20, 30)
|
@ -1 +0,0 @@
|
|||||||
# NGram Train
|
|
@ -0,0 +1,101 @@
|
|||||||
|
# ngram train for mandarin
|
||||||
|
|
||||||
|
Quick run:
|
||||||
|
```
|
||||||
|
bash run.sh --stage -1
|
||||||
|
```
|
||||||
|
|
||||||
|
## input
|
||||||
|
|
||||||
|
input files:
|
||||||
|
```
|
||||||
|
data/
|
||||||
|
├── lexicon.txt
|
||||||
|
├── text
|
||||||
|
└── vocab.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
==> data/text <==
|
||||||
|
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
|
||||||
|
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
|
||||||
|
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
|
||||||
|
BAC009S0002W0125 各地 政府 便 纷纷 跟进
|
||||||
|
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
|
||||||
|
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
|
||||||
|
BAC009S0002W0128 四十六 个 限 购 城市 当中
|
||||||
|
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
|
||||||
|
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
|
||||||
|
BAC009S0002W0131 显示 出 了 极 强 的 威力
|
||||||
|
|
||||||
|
==> data/lexicon.txt <==
|
||||||
|
SIL sil
|
||||||
|
<SPOKEN_NOISE> sil
|
||||||
|
啊 aa a1
|
||||||
|
啊 aa a2
|
||||||
|
啊 aa a4
|
||||||
|
啊 aa a5
|
||||||
|
啊啊啊 aa a2 aa a2 aa a2
|
||||||
|
啊啊啊 aa a5 aa a5 aa a5
|
||||||
|
坐地 z uo4 d i4
|
||||||
|
坐实 z uo4 sh ix2
|
||||||
|
坐视 z uo4 sh ix4
|
||||||
|
坐稳 z uo4 uu un3
|
||||||
|
坐拥 z uo4 ii iong1
|
||||||
|
坐诊 z uo4 zh en3
|
||||||
|
坐庄 z uo4 zh uang1
|
||||||
|
坐姿 z uo4 z iy1
|
||||||
|
|
||||||
|
==> data/vocab.txt <==
|
||||||
|
<blank>
|
||||||
|
<unk>
|
||||||
|
A
|
||||||
|
B
|
||||||
|
C
|
||||||
|
D
|
||||||
|
E
|
||||||
|
龙
|
||||||
|
龚
|
||||||
|
龛
|
||||||
|
<eos>
|
||||||
|
```
|
||||||
|
|
||||||
|
## output
|
||||||
|
|
||||||
|
```
|
||||||
|
data/
|
||||||
|
├── local
|
||||||
|
│ ├── dict
|
||||||
|
│ │ ├── lexicon.txt
|
||||||
|
│ │ └── units.txt
|
||||||
|
│ └── lm
|
||||||
|
│ ├── heldout
|
||||||
|
│ ├── lm.arpa
|
||||||
|
│ ├── text
|
||||||
|
│ ├── text.no_oov
|
||||||
|
│ ├── train
|
||||||
|
│ ├── unigram.counts
|
||||||
|
│ ├── word.counts
|
||||||
|
│ └── wordlist
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
/workspace/srilm/bin/i686-m64/ngram-count
|
||||||
|
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
|
||||||
|
Ignoring words 矽, which contains oov unit
|
||||||
|
Ignoring words 傩, which contains oov unit
|
||||||
|
Ignoring words 堀, which contains oov unit
|
||||||
|
Ignoring words 莼, which contains oov unit
|
||||||
|
Ignoring words 菰, which contains oov unit
|
||||||
|
Ignoring words 摭, which contains oov unit
|
||||||
|
Ignoring words 帙, which contains oov unit
|
||||||
|
Ignoring words 迨, which contains oov unit
|
||||||
|
Ignoring words 孥, which contains oov unit
|
||||||
|
Ignoring words 瑗, which contains oov unit
|
||||||
|
...
|
||||||
|
...
|
||||||
|
...
|
||||||
|
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
|
||||||
|
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
|
||||||
|
build LM done.
|
||||||
|
```
|
@ -0,0 +1,64 @@
|
|||||||
|
#!/bin/bash

# Train a Kneser-Ney 3-gram LM for aishell with SRILM.
# To be run from one directory above this script.
# Inputs : data/local/lm/text, data/local/dict/lexicon.txt
# Outputs: data/local/lm/{text.no_oov,word.counts,unigram.counts,wordlist,heldout,train,lm.arpa}
. ./path.sh

text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

for f in "$text" "$lexicon"; do
  # BUG FIX: original tested "$x" (always unset, so the test always
  # succeeded and missing inputs were reported spuriously) — it must
  # test the loop variable "$f".
  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
done

# Check SRILM tools
if ! which ngram-count > /dev/null; then
  echo "srilm tools are not found, please download it and install it from: "
  echo "http://www.speech.sri.com/projects/srilm/download.html"
  echo "Then add the tools to your PATH"
  exit 1
fi

# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/lm/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir

cleantext=$dir/text.no_oov

# Map tokens absent from the lexicon to <SPOKEN_NOISE>.
# lexicon line: word char0 ... charn
# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  > $cleantext || exit 1;

# compute word counts, sort in descending order
# line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
  sort -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

# word list with sentence boundary markers <s> </s>
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist

# hold out to compute ppl
heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results

mkdir -p $dir
# NOTE(review): head and 'tail -n +N' both include line $heldout_sent
# (tail -n +N starts at line N) — kept as-is for comparability with
# the standard kaldi recipe.
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $dir/train

# Interpolated Kneser-Ney 3-gram restricted to the word list; OOVs map
# to <UNK>.
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
# Report perplexity on the held-out set.
ngram -lm $dir/lm.arpa -ppl $dir/heldout
|
@ -0,0 +1,12 @@
|
|||||||
|
# This contains the locations of the binaries required for running the examples.

MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`

# BUG FIX: original exported LC_AL (a typo with no effect); the
# locale variable is LC_ALL.
export LC_ALL=C

# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
|
@ -0,0 +1,68 @@
|
|||||||
|
#!/bin/bash
# Build an n-gram LM for mandarin (aishell):
#   stage -1: download corpus data
#   stage  0: prepare dict (units + lexicon filtered by vocab)
#   stage  1: train the LM with SRILM
set -eo pipefail

. path.sh

stage=-1
stop_stage=100
corpus=aishell

unit=data/vocab.txt # vocab file, line: char/spm_pice
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt

# Parse --stage / --stop_stage / --corpus etc. overrides.
. utils/parse_options.sh

data=$PWD/data
mkdir -p $data

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
    # Download and unpack the corpus archive (resumable via wget -c).
    if [ ! -f $data/speech.ngram.zh.tar.gz ];then
        pushd $data
        wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
        tar xvzf speech.ngram.zh.tar.gz
        popd
    fi
fi

if [ ! -f $unit ]; then
    echo "$0: No such file $unit"
    exit 1;
fi

# Build SRILM from tools/ if ngram-count is not on PATH.
if ! which ngram-count; then
    pushd $MAIN_ROOT/tools
    make srilm.done
    popd
fi

mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # 7.1 Prepare dict
    # line: char/spm_pices
    cp $unit data/local/dict/units.txt

    # Generate a lexicon from the transcript when none is provided.
    if [ ! -f $lexicon ];then
        local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
        echo "Generate $lexicon from $text"
    fi

    # filter by vocab
    # line: word ph0 ... phn -> line: word char0 ... charn
    utils/fst/prepare_dict.py \
        --unit_file $unit \
        --in_lexicon ${lexicon} \
        --out_lexicon data/local/dict/lexicon.txt
fi

lm=data/local/lm
mkdir -p $lm

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # 7.2 Train lm
    cp $text $lm/text
    local/aishell_train_lms.sh
fi

echo "build LM done."
exit 0
|
@ -0,0 +1 @@
|
|||||||
|
../../../../utils/
|
@ -0,0 +1 @@
|
|||||||
|
data
|
@ -0,0 +1,15 @@
|
|||||||
|
# Text PreProcess for building ngram LM
|
||||||
|
|
||||||
|
Output `text` file like this:
|
||||||
|
|
||||||
|
```
|
||||||
|
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
|
||||||
|
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
|
||||||
|
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
|
||||||
|
BAC009S0002W0125 各地 政府 便 纷纷 跟进
|
||||||
|
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
|
||||||
|
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
|
||||||
|
BAC009S0002W0128 四十六 个 限 购 城市 当中
|
||||||
|
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
|
||||||
|
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
|
||||||
|
```
|
@ -0,0 +1,4 @@
|
|||||||
|
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`

# BUG FIX: original exported LC_AL (a typo with no effect); the
# locale variable is LC_ALL.
export LC_ALL=C
|
@ -0,0 +1,24 @@
|
|||||||
|
#!/bin/bash
# Text normalization & word segmentation preprocessing for building an
# ngram LM. Reads data/text, writes data/text.tn.
set -eo pipefail

. path.sh

stage=0
stop_stage=100
has_key=true       # forwarded to utils/zh_tn.py
token_type=word    # forwarded to utils/zh_tn.py

. utils/parse_options.sh || exit -1;

text=data/text

if [ ! -f $text ]; then
    # BUG FIX: original printed "$1" (usually empty after
    # parse_options) instead of the file actually being checked.
    echo "$0: Not find $text";
    exit -1;
fi

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    echo "text tn & wordseg preprocess"
    # Remove any stale output before regenerating.
    rm -rf ${text}.tn
    python3 utils/zh_tn.py --has_key $has_key --token_type $token_type ${text} ${text}.tn
fi
|
@ -0,0 +1 @@
|
|||||||
|
../../../utils/
|
@ -0,0 +1 @@
|
|||||||
|
data
|
@ -0,0 +1,19 @@
|
|||||||
|
# This contains the locations of the binaries required for running the examples.

MAIN_ROOT=`realpath $PWD/../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`

# BUG FIX: original exported LC_AL (a typo with no effect); the
# locale variable is LC_ALL.
export LC_ALL=C

# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64

# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
|
@ -0,0 +1,29 @@
|
|||||||
|
#!/bin/bash
# Build the TLG decoding graph (Token / Lexicon / Grammar FSTs) from the
# previously prepared dict and LM.
set -eo pipefail

. path.sh

stage=-1
stop_stage=100

# Parse --stage / --stop_stage overrides.
. utils/parse_options.sh

# Build Kaldi/OpenFST tools from tools/ if fstprint is not on PATH.
if ! which fstprint ; then
    pushd $MAIN_ROOT/tools
    make kaldi.done
    popd
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # build T & L
    # utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang

    # build G & LG & TLG
    # utils/fst/make_tlg.sh <lm_dir> <src_lang> <tgt_lang>
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi

echo "build TLG done."
exit 0
|
@ -0,0 +1 @@
|
|||||||
|
../../../utils/
|
@ -1,97 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Download, build and install SRILM into tools/srilm, then append the
# SRILM environment setup to tools/env.sh.
# Usage: install_srilm.sh <name> <organization> <email>
# (the three arguments are required by SRI's download form).

current_path=`pwd`
current_dir=`basename "$current_path"`

# The script writes into ./srilm and ./env.sh, so it must run in tools/.
if [ "tools" != "$current_dir" ]; then
  echo "You should run this script in tools/ directory!!"
  exit 1
fi

# libLBFGS is a build dependency for SRILM's MaxEnt LM support.
if [ ! -d liblbfgs-1.10 ]; then
  echo Installing libLBFGS library to support MaxEnt LMs
  bash extras/install_liblbfgs.sh || exit 1
fi

# SRILM's build scripts require GNU awk specifically.
! command -v gawk > /dev/null && \
   echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1;

if [ $# -ne 3 ]; then
  echo "SRILM download requires some information about you"
  echo
  echo "Usage: $0 <name> <organization> <email>"
  exit 1
fi

# SRI's download page takes the registration data as a POST form.
srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php"
post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3"

if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then
  echo 'There was a problem downloading the file.'
  echo 'Check you internet connection and try again.'
  exit 1
fi

mkdir -p srilm
cd srilm


# Support both the old .tgz archive name and the current .tar.gz one.
if [ -f ../srilm.tgz ]; then
    tar -xvzf ../srilm.tgz # Old SRILM format
elif [ -f ../srilm.tar.gz ]; then
    tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz
fi

# Parse the version number from the RELEASE file (major.minor.micro).
major=`gawk -F. '{ print $1 }' RELEASE`
minor=`gawk -F. '{ print $2 }' RELEASE`
micro=`gawk -F. '{ print $3 }' RELEASE`

if [ $major -le 1 ] && [ $minor -le 7 ] && [ $micro -le 1 ]; then
  echo "Detected version 1.7.1 or earlier. Applying patch."
  patch -p0 < ../extras/srilm.patch
fi

# set the SRILM variable in the top-level Makefile to this directory.
cp Makefile tmpf

cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \
  > Makefile || exit 1
rm tmpf

# Machine-specific Makefile suffix (e.g. i686-m64), as detected by SRILM.
mtype=`sbin/machine-type`

# Wire libLBFGS include/lib paths into the machine Makefile so MaxEnt
# support builds.
echo HAVE_LIBLBFGS=1 >> common/Makefile.machine.$mtype
grep ADDITIONAL_INCLUDES common/Makefile.machine.$mtype | \
    sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
    >> common/Makefile.machine.$mtype

grep ADDITIONAL_LDFLAGS common/Makefile.machine.$mtype | \
    sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
    >> common/Makefile.machine.$mtype

make || exit

cd ..
# Append SRILM environment setup to env.sh (subshell so the unset/exit
# below do not affect this script).
(
  [ ! -z "${SRILM}" ] && \
    echo >&2 "SRILM variable is aleady defined. Undefining..." && \
    unset SRILM

  [ -f ./env.sh ] && . ./env.sh

  [ ! -z "${SRILM}" ] && \
    echo >&2 "SRILM config is already in env.sh" && exit

  wd=`pwd`
  wd=`readlink -f $wd || pwd`

  echo "export SRILM=$wd/srilm"
  dirs="\${PATH}"
  # Add every bin subdirectory of the srilm tree to PATH.
  for directory in $(cd srilm && find bin -type d ) ; do
    dirs="$dirs:\${SRILM}/$directory"
  done
  echo "export PATH=$dirs"
) >> env.sh

echo >&2 "Installation of SRILM finished successfully"
echo >&2 "Please source the tools/env.sh in your path.sh to enable it"
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue