remove default cfg and fix some bugs,test=asr

pull/1225/head
huangyuxin 3 years ago
parent a1d8ab0f99
commit 3e2cc898cb

@@ -1,47 +0,0 @@
#!/bin/bash
if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_prefix=$2
model_type=$3
audio_file=$4
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
if [ $? -ne 0 ]; then
exit 1
fi
if [ ! -f ${audio_file} ]; then
echo "Please input the right audio_file path"
exit 1
fi
# download language model
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} \
--audio_file ${audio_file}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0

@@ -15,7 +15,6 @@
from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -42,7 +41,7 @@ if __name__ == "__main__":
print("model_type:{}".format(args.model_type))
# https://yaml.org/type/float.html
config = get_cfg_defaults(args.model_type)
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:
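
The pattern repeated across these entry points: the hard-coded get_cfg_defaults() factory is dropped and the config starts as an empty CfgNode that accepts new keys, so the experiment YAML becomes the single source of truth. A minimal sketch of the new flow, assuming an illustrative config path:

    # Sketch of the new config flow; the YAML path is an assumption for illustration.
    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)               # empty node; unknown keys accepted on merge
    config.merge_from_file("conf/deepspeech2.yaml")  # all defaults now live in the YAML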

@@ -120,20 +120,6 @@ class DeepSpeech2Model(nn.Layer):
:rtype: tuple of LayerOutput
"""
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
default = CfgNode(
dict(
num_conv_layers=2, #Number of stacking convolution layers.
num_rnn_layers=3, #Number of stacking RNN layers.
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
use_gru=True, #Use gru if set True. Use simple rnn if set False.
share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs. Notice that for GRU, weight sharing is not supported.
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self,
feat_size,
dict_size,

@@ -44,22 +44,6 @@ logger = Log(__name__).getlog()
class DeepSpeech2Trainer(Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# training config
default = CfgNode(
dict(
lr=5e-4, # learning rate
lr_decay=1.0, # learning rate decay
weight_decay=1e-6, # the coeff of weight decay
global_grad_clip=5.0, # the global norm clip
n_epoch=50, # train epochs
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
@@ -246,27 +230,6 @@ class DeepSpeech2Trainer(Trainer):
class DeepSpeech2Tester(DeepSpeech2Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# testing config
default = CfgNode(
dict(
alpha=2.5, # Coef of LM for beam search.
beta=0.3, # Coef of WC for beam search.
cutoff_prob=1.0, # Cutoff probability for pruning.
cutoff_top_n=40, # Cutoff number for pruning.
lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model.
decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy
error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer'
num_proc_bsearch=8, # # of CPUs for beam search.
beam_size=500, # Beam search width.
batch_size=128, # decoding batch size
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
self._text_featurizer = TextFeaturizer(

@@ -0,0 +1,25 @@
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: 0.1
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
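
The new conf/preprocess.yaml above describes the feature pipeline declaratively: each process entry names a transform (fbank_kaldi, cmvn_json, and the three SpecAugment ops) plus its parameters. A hedged sketch of how such a pipeline description could be consumed — the dispatch loop here is illustrative, not PaddleSpeech's actual transform registry:

    # Illustrative consumer of conf/preprocess.yaml; the dispatch is an assumption,
    # not the real paddlespeech.s2t transformation registry.
    import yaml

    with open("conf/preprocess.yaml") as f:
        steps = yaml.safe_load(f)["process"]

    for step in steps:
        kind = step.pop("type")   # e.g. 'fbank_kaldi', 'time_warp', 'freq_mask'
        print(f"apply {kind} with params {step}")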

@@ -19,7 +19,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: data/lang_char/bpe_unigram_8000
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml
preprocess_config: conf/preprocess.yaml
batch_size: 16
maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
@@ -87,7 +87,7 @@ global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 2.5
weight_decay: 1e-06
weight_decay: 1.0e-06
scheduler: noam
scheduler_conf:
warmup_steps: 25000
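
The weight_decay change from 1e-06 to 1.0e-06 is not cosmetic: the "# https://yaml.org/type/float.html" comments repeated through these files point at the YAML 1.1 float grammar, under which PyYAML resolves 1e-06 as a string (no dot in the mantissa) but 1.0e-06 as a float. A quick check:

    # Why 1e-06 -> 1.0e-06 matters under YAML 1.1 (PyYAML's resolver).
    import yaml

    print(type(yaml.safe_load("weight_decay: 1e-06")["weight_decay"]))    # <class 'str'>
    print(type(yaml.safe_load("weight_decay: 1.0e-06")["weight_decay"]))  # <class 'float'>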

@@ -19,7 +19,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: data/lang_char/bpe_unigram_8000
mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml
preprocess_config: conf/preprocess.yaml
batch_size: 16
maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced

@@ -0,0 +1,16 @@
process:
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false

@@ -13,7 +13,7 @@ vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
unit_type: 'spm'
spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
@@ -27,7 +27,7 @@ batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
augmentation_config:
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1

@@ -13,7 +13,7 @@ vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
unit_type: 'spm'
spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
@@ -27,7 +27,7 @@ batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
augmentation_config:
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1

@@ -20,12 +20,7 @@ for type in fullsentence; do
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
<<<<<<< HEAD
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size}
=======
--opts decoding.decoding_method ${type} \
>>>>>>> 6272496d9c26736750b577fd832ea9dd4ddc4e6e
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
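
This hunk also resolves a leftover merge conflict, keeping the decode.* key names and the decode_batch_size override. The --opts key/value pairs reach the config through yacs list merging; a sketch of the mechanism, simplified relative to the real glue in paddlespeech.s2t.training.cli:

    # Sketch of how --opts pairs override config entries via yacs merge_from_list.
    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.decode = CfgNode(dict(decoding_method="attention", decode_batch_size=16))
    config.merge_from_list(["decode.decoding_method", "fullsentence",
                            "decode.decode_batch_size", "8"])
    assert config.decode.decode_batch_size == 8  # "8" is coerced to int by yacs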

@@ -58,7 +58,6 @@ mean_std_filepath: ""
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_200'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
@@ -72,7 +71,7 @@ batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
augmentation_config: conf/preprocess.yaml
preprocess_config: conf/preprocess.yaml
num_workers: 0
subsampling_factor: 1
num_encs: 1

@@ -85,7 +85,7 @@ def recog_v2(args):
mode="asr",
load_output=False,
sort_in_input_length=False,
preprocess_conf=confs.collator.augmentation_config
preprocess_conf=confs.preprocess_config
if args.preprocess_conf is None else args.preprocess_conf,
preprocess_args={"train": False}, )

@@ -20,7 +20,7 @@ from paddle.inference import Config
from paddle.inference import create_predictor
from paddle.io import DataLoader
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from yacs.config import CfgNode
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
@@ -176,7 +176,7 @@ if __name__ == "__main__":
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:

@@ -18,7 +18,7 @@ import numpy as np
import paddle
from paddle.io import DataLoader
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from yacs.config import CfgNode
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
@@ -111,7 +111,7 @@ if __name__ == "__main__":
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Export for DeepSpeech2 model."""
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -41,7 +41,7 @@ if __name__ == "__main__":
print_arguments(args)
# https://yaml.org/type/float.html
config = get_cfg_defaults(args.model_type)
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.opts:

@@ -14,7 +14,6 @@
"""Evaluation for DeepSpeech2 model."""
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -43,7 +42,7 @@ if __name__ == "__main__":
print("model_type:{}".format(args.model_type))
# https://yaml.org/type/float.html
config = get_cfg_defaults(args.model_type)
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:

@@ -13,8 +13,6 @@
# limitations under the License.
"""Evaluation for DeepSpeech2 model."""
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -48,7 +46,7 @@ if __name__ == "__main__":
print("model_type:{}".format(args.model_type))
# https://yaml.org/type/float.html
config = get_cfg_defaults(args.model_type)
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:

@@ -188,7 +188,7 @@ if __name__ == "__main__":
print("model_type:{}".format(args.model_type))
# https://yaml.org/type/float.html
config = get_cfg_defaults(args.model_type)
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:

@@ -14,7 +14,7 @@
"""Trainer for DeepSpeech2 model."""
from paddle import distributed as dist
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -42,7 +42,7 @@ if __name__ == "__main__":
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults(args.model_type)
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.opts:

@@ -1,28 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
def get_cfg_defaults(model_type='offline'):
_C = CfgNode()
config = _C.clone()
config.set_new_allowed(True)
return config

@@ -49,22 +49,6 @@ logger = Log(__name__).getlog()
class DeepSpeech2Trainer(Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# training config
default = CfgNode(
dict(
lr=5e-4, # learning rate
lr_decay=1.0, # learning rate decay
weight_decay=1e-6, # the coeff of weight decay
global_grad_clip=5.0, # the global norm clip
n_epoch=50, # train epochs
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
@@ -259,27 +243,6 @@ class DeepSpeech2Trainer(Trainer):
class DeepSpeech2Tester(DeepSpeech2Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# testing config
default = CfgNode(
dict(
alpha=2.5, # Coef of LM for beam search.
beta=0.3, # Coef of WC for beam search.
cutoff_prob=1.0, # Cutoff probability for pruning.
cutoff_top_n=40, # Cutoff number for pruning.
lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model.
decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy
error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer'
num_proc_bsearch=8, # # of CPUs for beam search.
beam_size=500, # Beam search width.
batch_size=128, # decoding batch size
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
self._text_featurizer = TextFeaturizer(

@@ -14,7 +14,6 @@
"""Alignment for U2 model."""
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -33,14 +32,14 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Export for U2 model."""
from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -31,14 +31,14 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.opts:

@@ -16,7 +16,6 @@ import cProfile
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -44,7 +43,7 @@ if __name__ == "__main__":
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:

@@ -20,7 +20,6 @@ import paddle
import soundfile
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.models.u2 import U2Model
from paddlespeech.s2t.training.cli import default_argument_parser
@@ -129,7 +128,7 @@ if __name__ == "__main__":
"--audio_file", type=str, help="path of the input audio file")
args = parser.parse_args()
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:

@@ -17,7 +17,7 @@ import os
from paddle import distributed as dist
from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -44,7 +44,7 @@ if __name__ == "__main__":
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.opts:

@@ -46,33 +46,6 @@ logger = Log(__name__).getlog()
class U2Trainer(Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# training config
default = CfgNode(
dict(
n_epoch=50, # train epochs
log_interval=100, # steps
accum_grad=1, # accum grad by # steps
global_grad_clip=5.0, # the global norm clip
))
default.optim = 'adam'
default.optim_conf = CfgNode(
dict(
lr=5e-4, # learning rate
weight_decay=1e-6, # the coeff of weight decay
))
default.scheduler = 'warmuplr'
default.scheduler_conf = CfgNode(
dict(
warmup_steps=25000,
lr_decay=1.0, # learning rate decay
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
@@ -401,35 +374,6 @@ class U2Trainer(Trainer):
class U2Tester(U2Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# decoding config
default = CfgNode(
dict(
alpha=2.5, # Coef of LM for beam search.
beta=0.3, # Coef of WC for beam search.
cutoff_prob=1.0, # Cutoff probability for pruning.
cutoff_top_n=40, # Cutoff number for pruning.
lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model.
decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search',
# 'ctc_prefix_beam_search', 'attention_rescoring'
error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer'
num_proc_bsearch=8, # # of CPUs for beam search.
beam_size=10, # Beam search width.
decode_batch_size=16, # decoding batch size
ctc_weight=0.0, # ctc weight for attention rescoring decode mode.
decoding_chunk_size=-1, # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1.
simulate_streaming=False, # simulate streaming inference. Defaults to False.
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
self.text_feature = TextFeaturizer(
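
With U2Tester.params() gone, none of these decode options have Python-side fallbacks anymore; the experiment YAML must define them all. Purely as an illustration, the removed defaults correspond to a config block equivalent to this CfgNode (values are the old hard-coded defaults):

    # Illustration only: the decode section the YAML must now populate.
    from yacs.config import CfgNode

    decode = CfgNode(dict(
        decoding_method="attention",  # or ctc_greedy_search, ctc_prefix_beam_search, attention_rescoring
        beam_size=10,
        decode_batch_size=16,
        ctc_weight=0.0,               # for attention_rescoring
        decoding_chunk_size=-1,       # <0: full chunk; >0: fixed chunk size
        num_decoding_left_chunks=-1,
        simulate_streaming=False,
    ))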

@@ -55,7 +55,6 @@ class U2Trainer(Trainer):
collate_fn_train = SpeechCollator.from_config(config)
config.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config)
if self.parallel:
@@ -103,7 +102,6 @@ class U2Trainer(Trainer):
test_dataset = ManifestDataset.from_config(config)
# return text ord id
config.keep_transcription_text = True
config.augmentation_config = ""
self.test_loader = DataLoader(
test_dataset,
batch_size=config.decode.batch_size,

@@ -42,40 +42,7 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
logger = Log(__name__).getlog()
def get_cfg_defaults():
"""Get a yacs CfgNode object with default values for my_project."""
# Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern
_C = CfgNode()
_C.model = U2Model.params()
_C.training = U2Trainer.params()
_C.decoding = U2Tester.params()
config = _C.clone()
config.set_new_allowed(True)
return config
class U2Trainer(Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# training config
default = CfgNode(
dict(
n_epoch=50, # train epochs
log_interval=100, # steps
accum_grad=1, # accum grad by # steps
checkpoint=dict(
kbest_n=50,
latest_n=5, ), ))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
@@ -362,35 +329,6 @@ class U2Trainer(Trainer):
class U2Tester(U2Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# decoding config
default = CfgNode(
dict(
alpha=2.5, # Coef of LM for beam search.
beta=0.3, # Coef of WC for beam search.
cutoff_prob=1.0, # Cutoff probability for pruning.
cutoff_top_n=40, # Cutoff number for pruning.
lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model.
decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search',
# 'ctc_prefix_beam_search', 'attention_rescoring'
error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer'
num_proc_bsearch=8, # # of CPUs for beam search.
beam_size=10, # Beam search width.
batch_size=16, # decoding batch size
ctc_weight=0.0, # ctc weight for attention rescoring decode mode.
decoding_chunk_size=-1, # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1.
simulate_streaming=False, # simulate streaming inference. Defaults to False.
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
self.text_feature = TextFeaturizer(

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Export for U2 model."""
from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -31,14 +31,14 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.opts:

@@ -16,7 +16,6 @@ import cProfile
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -44,7 +43,7 @@ if __name__ == "__main__":
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:

@@ -16,8 +16,8 @@ import cProfile
import os
from paddle import distributed as dist
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults
from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
@@ -42,7 +42,7 @@ if __name__ == "__main__":
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.opts:

@@ -1,41 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester
from paddlespeech.s2t.exps.u2_st.model import U2STTrainer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.models.u2_st import U2STModel
_C = CfgNode()
# _C.data = ManifestDataset.params()
# _C.collator = SpeechCollator.params()
# _C.model = U2STModel.params()
# _C.training = U2STTrainer.params()
# _C.decoding = U2STTester.params()
def get_cfg_defaults():
"""Get a yacs CfgNode object with default values for my_project."""
# Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern
config = _C.clone()
config.set_new_allowed(True)
return config

@@ -45,33 +45,6 @@ logger = Log(__name__).getlog()
class U2STTrainer(Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# training config
default = CfgNode(
dict(
n_epoch=50, # train epochs
log_interval=100, # steps
accum_grad=1, # accum grad by # steps
global_grad_clip=5.0, # the global norm clip
))
default.optim = 'adam'
default.optim_conf = CfgNode(
dict(
lr=5e-4, # learning rate
weight_decay=1e-6, # the coeff of weight decay
))
default.scheduler = 'warmuplr'
default.scheduler_conf = CfgNode(
dict(
warmup_steps=25000,
lr_decay=1.0, # learning rate decay
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
@@ -127,7 +100,7 @@ class U2STTrainer(Trainer):
for k, v in losses_np.items():
report(k, v)
report("batch_size", self.config.collator.batch_size)
report("batch_size", self.config.batch_size)
report("accum", train_conf.accum_grad)
report("step_cost", iteration_time)
@@ -236,7 +209,7 @@ class U2STTrainer(Trainer):
msg += ","
msg = msg[:-1] # remove the last ","
if (batch_index + 1
) % self.config.training.log_interval == 0:
) % self.config.log_interval == 0:
logger.info(msg)
except Exception as e:
logger.error(e)
@@ -287,7 +260,7 @@ class U2STTrainer(Trainer):
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.augmentation_config, # aug will be off when train_mode=False
preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False
n_iter_processes=config.num_workers,
subsampling_factor=1,
load_aux_output=load_transcript,
@@ -308,7 +281,7 @@ class U2STTrainer(Trainer):
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.augmentation_config, # aug will be off when train_mode=False
preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False
n_iter_processes=config.num_workers,
subsampling_factor=1,
load_aux_output=load_transcript,
@@ -319,7 +292,7 @@ class U2STTrainer(Trainer):
# test dataset, return raw text
decode_batch_size = config.get('decode', dict()).get('decode_batch_size', 1)
self.test_loader = BatchDataLoader(
json_file=config.data.test_manifest,
json_file=config.test_manifest,
train_mode=False,
sortagrad=False,
batch_size=decode_batch_size,
@@ -332,7 +305,7 @@ class U2STTrainer(Trainer):
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.augmentation_config, # aug will be off when train_mode=False
preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False
n_iter_processes=config.num_workers,
subsampling_factor=1,
num_encs=1,
@@ -379,7 +352,7 @@ class U2STTrainer(Trainer):
config,
parameters,
lr_scheduler=None, ):
train_config = config.training
train_config = config
optim_type = train_config.optim
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
@@ -405,41 +378,12 @@ class U2STTester(U2STTrainer):
class U2STTester(U2STTrainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# decoding config
default = CfgNode(
dict(
alpha=2.5, # Coef of LM for beam search.
beta=0.3, # Coef of WC for beam search.
cutoff_prob=1.0, # Cutoff probability for pruning.
cutoff_top_n=40, # Cutoff number for pruning.
lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model.
decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search',
# 'ctc_prefix_beam_search', 'attention_rescoring'
error_rate_type='bleu', # Error rate type for evaluation. Options `bleu`, 'char_bleu'
num_proc_bsearch=8, # # of CPUs for beam search.
beam_size=10, # Beam search width.
batch_size=16, # decoding batch size
ctc_weight=0.0, # ctc weight for attention rescoring decode mode.
decoding_chunk_size=-1, # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1.
simulate_streaming=False, # simulate streaming inference. Defaults to False.
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
self.text_feature = TextFeaturizer(
unit_type=self.config.collator.unit_type,
vocab_filepath=self.config.collator.vocab_filepath,
spm_model_prefix=self.config.collator.spm_model_prefix)
unit_type=self.config.unit_type,
vocab=self.config.vocab_filepath,
spm_model_prefix=self.config.spm_model_prefix)
self.vocab_list = self.text_feature.vocab_list
def id2token(self, texts, texts_len, text_feature):
@@ -526,7 +470,7 @@ class U2STTester(U2STTrainer):
decode_cfg = self.config.decode
bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu
stride_ms = self.config.collator.stride_ms
stride_ms = self.config.stride_ms
hyps, refs = [], []
len_refs, num_ins = 0, 0
num_frames = 0.0
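
The other half of the refactor shows up in these hunks: the nested config.collator.* / config.training.* / config.data.* groups are flattened into top-level keys (self.config.collator.stride_ms becomes self.config.stride_ms, config.data.test_manifest becomes config.test_manifest). A minimal before/after sketch:

    # Before/after sketch of the config flattening; keys mirror the diff above.
    from yacs.config import CfgNode

    old = CfgNode(new_allowed=True)
    old.collator = CfgNode(dict(stride_ms=10.0, unit_type="spm"))

    new = CfgNode(new_allowed=True)
    new.stride_ms = 10.0      # was old.collator.stride_ms
    new.unit_type = "spm"     # was old.collator.unit_type
    assert new.stride_ms == old.collator.stride_ms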

@@ -219,33 +219,6 @@ class SpeechCollatorBase():
class SpeechCollator(SpeechCollatorBase):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
default = CfgNode(
dict(
augmentation_config="",
random_seed=0,
mean_std_filepath="",
unit_type="char",
vocab_filepath="",
spm_model_prefix="",
spectrum_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank'
delta_delta=False, # 'mfcc', 'fbank'
stride_ms=10.0, # ms
window_ms=20.0, # ms
n_fft=None, # fft points
max_freq=None, # None for samplerate/2
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
dither=1.0, # feature dither
keep_transcription_text=False))
if config is not None:
config.merge_from_other_cfg(default)
return default
@classmethod
def from_config(cls, config):
"""Build a SpeechCollator object from a config.

@@ -28,22 +28,6 @@ logger = Log(__name__).getlog()
class ManifestDataset(Dataset):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
default = CfgNode(
dict(
manifest="",
max_input_len=27.0,
min_input_len=0.0,
max_output_len=float('inf'),
min_output_len=0.0,
max_output_input_ratio=float('inf'),
min_output_input_ratio=0.0, ))
if config is not None:
config.merge_from_other_cfg(default)
return default
@classmethod
def from_config(cls, config):
"""Build a ManifestDataset object from a config.

@@ -119,21 +119,6 @@ class DeepSpeech2Model(nn.Layer):
before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput
"""
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
default = CfgNode(
dict(
num_conv_layers=2, #Number of stacking convolution layers.
num_rnn_layers=3, #Number of stacking RNN layers.
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
use_gru=True, #Use gru if set True. Use simple rnn if set False.
share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs. Notice that for GRU, weight sharing is not supported.
ctc_grad_norm_type=None, ))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self,
feat_size,
dict_size,

@@ -243,23 +243,6 @@ class DeepSpeech2ModelOnline(nn.Layer):
before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput
"""
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
default = CfgNode(
dict(
num_conv_layers=2, #Number of stacking convolution layers.
num_rnn_layers=4, #Number of stacking RNN layers.
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=True, #Use gru if set True. Use simple rnn if set False.
blank_id=0, # index of blank in vocab.txt
ctc_grad_norm_type=None, ))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(
self,
feat_size,

@@ -59,57 +59,6 @@ logger = Log(__name__).getlog()
class U2BaseModel(ASRInterface, nn.Layer):
"""CTC-Attention hybrid Encoder-Decoder model"""
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# network architecture
default = CfgNode()
# allow add new item when merge_with_file
default.cmvn_file = ""
default.cmvn_file_type = "json"
default.input_dim = 0
default.output_dim = 0
# encoder related
default.encoder = 'transformer'
default.encoder_conf = CfgNode(
dict(
output_size=256, # dimension of attention
attention_heads=4,
linear_units=2048, # the number of units of position-wise feed forward
num_blocks=12, # the number of encoder blocks
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer='conv2d', # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before=True,
# use_cnn_module=True,
# cnn_module_kernel=15,
# activation_type='swish',
# pos_enc_layer_type='rel_pos',
# selfattention_layer_type='rel_selfattn',
))
# decoder related
default.decoder = 'transformer'
default.decoder_conf = CfgNode(
dict(
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
self_attention_dropout_rate=0.0,
src_attention_dropout_rate=0.0, ))
# hybrid CTC/attention
default.model_conf = CfgNode(
dict(
ctc_weight=0.3,
lsm_weight=0.1, # label smoothing option
length_normalized_loss=False, ))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self,
vocab_size: int,
encoder: TransformerEncoder,

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""U2 ASR Model
Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition
(https://arxiv.org/pdf/2012.05481.pdf)
"""
import time
@@ -51,58 +51,6 @@ logger = Log(__name__).getlog()
class U2STBaseModel(nn.Layer):
"""CTC-Attention hybrid Encoder-Decoder model"""
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# network architecture
default = CfgNode()
# allow add new item when merge_with_file
default.cmvn_file = ""
default.cmvn_file_type = "json"
default.input_dim = 0
default.output_dim = 0
# encoder related
default.encoder = 'transformer'
default.encoder_conf = CfgNode(
dict(
output_size=256, # dimension of attention
attention_heads=4,
linear_units=2048, # the number of units of position-wise feed forward
num_blocks=12, # the number of encoder blocks
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer='conv2d', # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before=True,
# use_cnn_module=True,
# cnn_module_kernel=15,
# activation_type='swish',
# pos_enc_layer_type='rel_pos',
# selfattention_layer_type='rel_selfattn',
))
# decoder related
default.decoder = 'transformer'
default.decoder_conf = CfgNode(
dict(
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
self_attention_dropout_rate=0.0,
src_attention_dropout_rate=0.0, ))
# hybrid CTC/attention
default.model_conf = CfgNode(
dict(
asr_weight=0.0,
ctc_weight=0.0,
lsm_weight=0.1, # label smoothing option
length_normalized_loss=False, ))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self,
vocab_size: int,
encoder: TransformerEncoder,
@@ -289,8 +237,8 @@ class U2STBaseModel(nn.Layer):
simulate_streaming (bool, optional): streaming or not. Defaults to False.
Returns:
Tuple[paddle.Tensor, paddle.Tensor]:
encoder hiddens (B, Tmax, D),
encoder hiddens mask (B, 1, Tmax).
"""
# Let's assume B = batch_size
@@ -533,21 +481,21 @@ class U2STBaseModel(nn.Layer):
feats (Tensor): audio features, (B, T, D)
feats_lengths (Tensor): (B)
text_feature (TextFeaturizer): text feature object.
decoding_method (str): decoding mode, e.g.
'fullsentence',
'simultaneous'
beam_size (int): beam size for search
decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1.
<0: for decoding, use full chunk.
>0: for decoding, use fixed chunk size as set.
0: used for training, it's prohibited here.
num_decoding_left_chunks (int, optional):
number of left chunks for decoding. Defaults to -1.
simulate_streaming (bool, optional): simulate streaming inference. Defaults to False.
Raises:
ValueError: when not support decoding_method.
Returns:
List[List[int]]: transcripts.
"""
@@ -601,7 +549,7 @@ class U2STModel(U2STBaseModel):
ValueError: raise when using not support encoder type.
Returns:
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
if configs['cmvn_file'] is not None:
mean, istd = load_cmvn(configs['cmvn_file'],
