fix ds2 scripts and bugs

pull/578/head
Hui Zhang 4 years ago
parent 09ab9f717e
commit b69021f9e6

@@ -1,50 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inferer for DeepSpeech2 model."""
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils.utility import print_arguments
-
-
-# TODO(hui zhang): dynamic load
-def main_sp(config, args):
-    exp = Tester(config, args)
-    exp.setup()
-    exp.run_test()
-
-
-def main(config, args):
-    main_sp(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    args = parser.parse_args()
-    print_arguments(args, globals())
-
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-    config.freeze()
-    print(config)
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)

@@ -113,6 +113,7 @@ class DeepSpeech2Trainer(Trainer):
         if self.parallel:
             model = paddle.DataParallel(model)

+        logger.info(f"{model}")
         layer_tools.print_params(model, logger.info)

         grad_clip = ClipGradByGlobalNormWithLog(
@@ -192,7 +193,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, audio, texts, audio_len, texts_len):
+    def compute_metrics(self, audio, audio_len, texts, texts_len):
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -253,7 +254,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         msg = "Test: "
         msg += "epoch: {}, ".format(self.epoch)
         msg += "step: {}, ".format(self.iteration)
-        msg += ", Final error rate [%s] (%d/%d) = %f" % (
+        msg += "Final error rate [%s] (%d/%d) = %f" % (
             error_rate_type, num_ins, num_ins, errors_sum / len_refs)
         logger.info(msg)
@@ -319,8 +320,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         config.defrost()
         # return raw text
-        config.data.manifest = config.data.test_manifest
+        config.data.keep_transcription_text = True
         config.data.augmentation_config = ""
+        config.data.manifest = config.data.test_manifest
         test_dataset = ManifestDataset.from_config(config)

         # return text ord id

@@ -131,8 +131,8 @@ class FeatureNormalizer(object):
     def _read_mean_std_from_file(self, filepath, eps=1e-20):
         """Load mean and std from file."""
         mean, istd = load_cmvn(filepath, filetype='json')
-        self._mean = mean
-        self._istd = istd
+        self._mean = np.expand_dims(mean, axis=-1)
+        self._istd = np.expand_dims(istd, axis=-1)

     def write_to_file(self, filepath):
         """Write the mean and stddev to the file.

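The expand_dims fix is a broadcasting repair: load_cmvn returns per-feature statistics of shape [feat_dim], while the features carry a trailing time axis. A minimal numpy sketch of the intent (the [D, T] feature layout is an assumption, consistent with expanding the last axis):

    import numpy as np

    # hypothetical shapes: 161 linear-spectrogram bins, 50 frames
    mean = np.random.randn(161)            # [D] from load_cmvn
    istd = np.ones(161)                    # [D] inverse stddev
    feat = np.random.randn(161, 50)        # [D, T]

    mean = np.expand_dims(mean, axis=-1)   # [D] -> [D, 1]
    istd = np.expand_dims(istd, axis=-1)   # [D] -> [D, 1]
    normalized = (feat - mean) * istd      # [D, 1] broadcasts across all T frames
    assert normalized.shape == (161, 50)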
@@ -60,7 +60,7 @@ class SpeechCollator():
             # else text is string, convert to unicode ord
             tokens = []
             if self._keep_transcription_text:
-                assert isinstance(text, str), type(text)
+                assert isinstance(text, str), (type(text), text)
                 tokens = [ord(t) for t in text]
             else:
                 tokens = text  # token ids

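The stricter assert message guards the raw-text path that pairs with the tester above: with keep_transcription_text on, transcripts are encoded as Unicode code points here and decoded back with chr in the tester's id_to_text. The round trip, as a quick illustration:

    # ord/chr round trip shared by SpeechCollator and DeepSpeech2Tester
    text = "好的 ok"
    ids = [ord(t) for t in text]              # [22909, 30340, 32, 111, 107]
    assert ''.join(chr(i) for i in ids) == text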
@@ -154,9 +154,9 @@ class DeepSpeech2Model(nn.Layer):
         assert (self.encoder.output_size == rnn_size * 2)

         self.decoder = CTCDecoder(
-            odim=dict_size + 1,  # <blank> is append after vocab
+            odim=dict_size,  # <blank> is in vocab
             enc_n_units=self.encoder.output_size,
-            blank_id=dict_size,  # last token is <blank>
+            blank_id=0,  # first token is <blank>
             dropout_rate=0.0,
             reduction=True,  # sum
             batch_average=True)  # sum / batch_size

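This change moves the CTC <blank> from an extra index appended after the vocabulary to index 0 of the vocabulary itself, so odim no longer needs the +1. A toy illustration of the two layouts (the surrounding tokens are made up):

    # old layout: blank appended after the vocab
    vocab_old = ['a', 'b', 'c']                                # dict_size = 3
    odim_old, blank_old = len(vocab_old) + 1, len(vocab_old)   # 4, 3

    # new layout: <blank> is the first entry of the vocab file
    vocab_new = ['<blank>', 'a', 'b', 'c']                     # dict_size = 4
    odim_new, blank_new = len(vocab_new), 0                    # 4, 0

    assert odim_old == odim_new   # same output dim, different convention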
@@ -63,7 +63,7 @@ class U2BaseModel(nn.Module):
         default = CfgNode()
         # allow add new item when merge_with_file
         default.cmvn_file = ""
-        default.cmvn_file_type = "npz"
+        default.cmvn_file_type = "json"
         default.input_dim = 0
         default.output_dim = 0
         # encoder related

@@ -40,7 +40,8 @@ def sequence_mask(x_len, max_len=None, dtype='float32'):
         [[1., 1., 0., 0.],
          [1., 1., 1., 1.]]
     """
-    assert x_len.dim() == 1
+    # TODO(Hui Zhang): jit does not support Tensor.dim() and Tensor.ndim
+    # assert x_len.dim() == 1, (x_len.dim(), x_len)
     max_len = max_len or x_len.max()
     x_len = paddle.unsqueeze(x_len, -1)
     row_vector = paddle.arange(max_len)

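Only the first three statements of sequence_mask appear in this hunk; assembled with its docstring, the full function is small enough to sanity-check in isolation (the final comparison line is inferred from the docstring's expected output, not shown in the diff):

    import paddle

    def sequence_mask(x_len, max_len=None, dtype='float32'):
        max_len = max_len or x_len.max()
        x_len = paddle.unsqueeze(x_len, -1)    # [B] -> [B, 1]
        row_vector = paddle.arange(max_len)    # [T]
        mask = row_vector < x_len              # broadcasts to [B, T]
        return mask.astype(dtype)

    print(sequence_mask(paddle.to_tensor([2, 4])))
    # [[1., 1., 0., 0.],
    #  [1., 1., 1., 1.]]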
@@ -127,7 +127,12 @@ class Trainer():
     @mp_tools.rank_zero_only
     def save(self, tag=None, infos: dict=None):
         """Save checkpoint (model parameters and optimizer states).
+
+        Args:
+            tag (int or str, optional): None for step, else using tag, e.g. epoch. Defaults to None.
+            infos (dict, optional): meta data to save. Defaults to None.
+
         """
         infos = infos if infos else dict()
         infos.update({
             "step": self.iteration,
@@ -220,7 +225,7 @@ class Trainer():
                 'epoch', {'cv_loss': cv_loss,
                           'lr': self.lr_scheduler()}, self.epoch)
-            self.save(infos={'val_loss': cv_loss})
+            self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.lr_scheduler.step()
             self.new_epoch()

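Passing tag=self.epoch names the checkpoint after the epoch rather than the global step counter. A hypothetical sketch of the naming convention this implies (the checkpoint module's internals are not part of this diff):

    import os

    def checkpoint_path(checkpoint_dir, iteration, tag=None):
        # tag=None falls back to the step counter; an int/str tag (e.g. epoch) wins
        return os.path.join(checkpoint_dir, str(tag if tag is not None else iteration))

    print(checkpoint_path("exp/ckpt/checkpoints", 3284))          # .../3284
    print(checkpoint_path("exp/ckpt/checkpoints", 3284, tag=7))   # .../7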
@@ -7,14 +7,20 @@ data:
   vocab_filepath: data/vocab.txt
   augmentation_config: conf/augmentation.json
   batch_size: 64 # one gpu
-  max_duration: 27.0
-  min_duration: 0.0
+  min_input_len: 0.0
+  max_input_len: 27.0 # second
+  min_output_len: 0.0
+  max_output_len: 400.0
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
   specgram_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
   stride_ms: 10.0
   window_ms: 20.0
+  delta_delta: False
+  dither: 1.0
   use_dB_normalization: True
   target_dB: -20
   random_seed: 0
@@ -36,6 +42,7 @@ training:
   lr_decay: 0.83
   weight_decay: 1e-06
   global_grad_clip: 5.0
+  log_interval: 100

 decoding:
   batch_size: 128

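The old max_duration/min_duration pair only bounded audio length; the new keys also bound transcript length and the transcript-to-audio ratio. A sketch of the filtering rule they describe (argument names are illustrative, not the loader's exact schema):

    def keep_utterance(duration_s, num_tokens,
                       min_input_len=0.0, max_input_len=27.0,
                       min_output_len=0.0, max_output_len=400.0,
                       min_output_input_ratio=0.05, max_output_input_ratio=10.0):
        # drop utterances whose audio, transcript, or ratio falls out of range
        if not (min_input_len <= duration_s <= max_input_len):
            return False
        if not (min_output_len <= num_tokens <= max_output_len):
            return False
        ratio = num_tokens / duration_s
        return min_output_input_ratio <= ratio <= max_output_input_ratio

    print(keep_utterance(5.0, 40))    # True
    print(keep_utterance(30.0, 40))   # False: longer than max_input_len (27 s)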
@@ -0,0 +1,23 @@
+#! /usr/bin/env bash
+
+if [ $# != 2 ];then
+    echo "usage: ${0} ckpt_dir avg_num"
+    exit -1
+fi
+
+ckpt_dir=${1}
+average_num=${2}
+decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
+
+python3 -u ${MAIN_ROOT}/utils/avg_model.py \
+--dst_model ${decode_checkpoint} \
+--ckpt_dir ${ckpt_dir} \
+--num ${average_num} \
+--val_best
+
+if [ $? -ne 0 ]; then
+    echo "Failed in avg ckpt!"
+    exit 1
+fi
+
+exit 0

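avg_model.py itself is not part of this diff; the usual n-best averaging it implements is an element-wise mean over the parameters of the num best checkpoints (--val_best ranks them by validation loss). A minimal sketch under those assumptions:

    import paddle

    def average_checkpoints(ckpt_paths, dst_model):
        # element-wise mean of float parameters across checkpoints
        avg = None
        for path in ckpt_paths:
            state = paddle.load(path)    # dict: param name -> Tensor
            if avg is None:
                avg = {k: v.numpy().astype('float64') for k, v in state.items()}
            else:
                for k, v in state.items():
                    avg[k] += v.numpy().astype('float64')
        avg = {k: paddle.to_tensor((v / len(ckpt_paths)).astype('float32'))
               for k, v in avg.items()}
        paddle.save(avg, dst_model)      # e.g. .../avg_1.pdparams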
@@ -43,17 +43,17 @@ fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
+    num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
-    --feat_dim=80 \
+    --specgram_type="linear" \
     --delta_delta=false \
     --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --window_ms=20.0 \
     --sample_rate=16000 \
     --use_dB_normalization=False \
     --num_samples=-1 \
-    --num_workers=16 \
+    --num_workers=${num_workers} \
     --output_path="data/mean_std.json"

     if [ $? -ne 0 ]; then

@@ -1,18 +1,32 @@
 #! /usr/bin/env bash

-if [ $# != 2 ];then
-    echo "usage: export ckpt_path jit_model_path"
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
     exit -1
 fi

+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_path_prefix=$2
+jit_model_export_path=$3
+
+device=gpu
+if [ ${ngpu} == 0 ];then
+    device=cpu
+fi
+
 python3 -u ${BIN_DIR}/export.py \
---config conf/deepspeech2.yaml \
---checkpoint_path ${1} \
---export_path ${2}
+--device ${device} \
+--nproc ${ngpu} \
+--config ${config_path} \
+--checkpoint_path ${ckpt_path_prefix} \
+--export_path ${jit_model_export_path}

 if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
+    echo "Failed in export!"
     exit 1
 fi

@@ -1,25 +1,35 @@
 #! /usr/bin/env bash

-if [[ $# != 1 ]]; then
-    echo "usage: $0 ckpt-path"
+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
     exit -1
 fi

+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+device=gpu
+if [ ${ngpu} == 0 ];then
+    device=cpu
+fi
+
+config_path=$1
+ckpt_prefix=$2
+
 # download language model
 bash local/download_lm_ch.sh
 if [ $? -ne 0 ]; then
     exit 1
 fi

-python3 -u ${BIN_DIR}/infer.py \
---device 'gpu' \
+python3 -u ${BIN_DIR}/test.py \
+--device ${device} \
 --nproc 1 \
---config conf/deepspeech2.yaml \
---checkpoint_path ${1}
+--config ${config_path} \
+--result_file ${ckpt_prefix}.rsl \
+--checkpoint_path ${ckpt_prefix}

 if [ $? -ne 0 ]; then
-    echo "Failed in inference!"
+    echo "Failed in evaluation!"
     exit 1
 fi

@@ -1,23 +1,32 @@
 #! /usr/bin/env bash

-# train model
-# if you wish to resume from an exists model, uncomment --init_from_pretrained_model
-#export FLAGS_sync_nccl_allreduce=0
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+    exit -1
+fi

-ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));')
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

+config_path=$1
+ckpt_name=$2
+
+device=gpu
+if [ ${ngpu} == 0 ];then
+    device=cpu
+fi
+
+mkdir -p exp
+
 python3 -u ${BIN_DIR}/train.py \
---device 'gpu' \
+--device ${device} \
 --nproc ${ngpu} \
---config conf/deepspeech2.yaml \
---output ckpt-${1}
+--config ${config_path} \
+--output exp/${ckpt_name}

 if [ $? -ne 0 ]; then
     echo "Failed in training!"
     exit 1
 fi

 exit 0

@ -1,19 +1,38 @@
#!/bin/bash #!/bin/bash
set -e
source path.sh source path.sh
# only demos
gpus=0
stage=0
stop_stage=100
conf_path=conf/deepspeech2.yaml
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
avg_num=1
avg_ckpt=avg_${avg_num}
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data # prepare data
bash ./local/data.sh bash ./local/data.sh || exit -1
fi
# train model if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh baseline # train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
# test model if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi
# infer model if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284 # test ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
# export model if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model # export ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi

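With parse_options.sh in the loop, each stage can be re-run in isolation, e.g. `bash run.sh --stage 2 --stop_stage 2` to redo only checkpoint averaging (assuming the usual Kaldi-style option parsing, which also lets `--gpus` and `--conf_path` be overridden from the command line).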
@@ -1 +0,0 @@
-../../s0/local/data.sh

@@ -0,0 +1,86 @@
+#! /usr/bin/env bash
+
+stage=-1
+stop_stage=100
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+mkdir -p data
+TARGET_DIR=${MAIN_ROOT}/examples/dataset
+mkdir -p ${TARGET_DIR}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # download data, generate manifests
+    python3 ${TARGET_DIR}/aishell/aishell.py \
+    --manifest_prefix="data/manifest" \
+    --target_dir="${TARGET_DIR}/aishell"
+
+    if [ $? -ne 0 ]; then
+        echo "Prepare Aishell failed. Terminated."
+        exit 1
+    fi
+
+    for dataset in train dev test; do
+        mv data/manifest.${dataset} data/manifest.${dataset}.raw
+    done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type="char" \
+    --count_threshold=0 \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths "data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # compute mean and stddev for normalizer
+    num_workers=$(nproc)
+    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+    --manifest_path="data/manifest.train.raw" \
+    --specgram_type="fbank" \
+    --feat_dim=80 \
+    --delta_delta=false \
+    --stride_ms=10.0 \
+    --window_ms=25.0 \
+    --sample_rate=16000 \
+    --use_dB_normalization=False \
+    --num_samples=-1 \
+    --num_workers=${num_workers} \
+    --output_path="data/mean_std.json"
+
+    if [ $? -ne 0 ]; then
+        echo "Compute mean and stddev failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    for dataset in train dev test; do
+        python3 ${MAIN_ROOT}/utils/format_data.py \
+        --feat_type "raw" \
+        --cmvn_path "data/mean_std.json" \
+        --unit_type "char" \
+        --vocab_path="data/vocab.txt" \
+        --manifest_path="data/manifest.${dataset}.raw" \
+        --output_path="data/manifest.${dataset}"
+    done
+
+    if [ $? -ne 0 ]; then
+        echo "Format manifest failed. Terminated."
+        exit 1
+    fi
+fi
+
+echo "Aishell data preparation done."
+exit 0

@@ -24,6 +24,7 @@ import distutils.util
 import io
 import json
 import os
+from multiprocessing.pool import Pool

 import soundfile
@@ -122,42 +123,36 @@ def main():
     if args.target_dir.startswith('~'):
         args.target_dir = os.path.expanduser(args.target_dir)

-    prepare_dataset(
-        url=URL_TEST_CLEAN,
-        md5sum=MD5_TEST_CLEAN,
-        target_dir=os.path.join(args.target_dir, "test-clean"),
-        manifest_path=args.manifest_prefix + ".test-clean")
-    prepare_dataset(
-        url=URL_DEV_CLEAN,
-        md5sum=MD5_DEV_CLEAN,
-        target_dir=os.path.join(args.target_dir, "dev-clean"),
-        manifest_path=args.manifest_prefix + ".dev-clean")
+    tasks = [
+        (URL_TEST_CLEAN, MD5_TEST_CLEAN,
+         os.path.join(args.target_dir, "test-clean"),
+         args.manifest_prefix + ".test-clean"),
+        (URL_DEV_CLEAN, MD5_DEV_CLEAN,
+         os.path.join(args.target_dir, "dev-clean"),
+         args.manifest_prefix + ".dev-clean"),
+    ]

     if args.full_download:
-        prepare_dataset(
-            url=URL_TRAIN_CLEAN_100,
-            md5sum=MD5_TRAIN_CLEAN_100,
-            target_dir=os.path.join(args.target_dir, "train-clean-100"),
-            manifest_path=args.manifest_prefix + ".train-clean-100")
-        prepare_dataset(
-            url=URL_TEST_OTHER,
-            md5sum=MD5_TEST_OTHER,
-            target_dir=os.path.join(args.target_dir, "test-other"),
-            manifest_path=args.manifest_prefix + ".test-other")
-        prepare_dataset(
-            url=URL_DEV_OTHER,
-            md5sum=MD5_DEV_OTHER,
-            target_dir=os.path.join(args.target_dir, "dev-other"),
-            manifest_path=args.manifest_prefix + ".dev-other")
-        prepare_dataset(
-            url=URL_TRAIN_CLEAN_360,
-            md5sum=MD5_TRAIN_CLEAN_360,
-            target_dir=os.path.join(args.target_dir, "train-clean-360"),
-            manifest_path=args.manifest_prefix + ".train-clean-360")
-        prepare_dataset(
-            url=URL_TRAIN_OTHER_500,
-            md5sum=MD5_TRAIN_OTHER_500,
-            target_dir=os.path.join(args.target_dir, "train-other-500"),
-            manifest_path=args.manifest_prefix + ".train-other-500")
+        tasks.extend([
+            (URL_TRAIN_CLEAN_100, MD5_TRAIN_CLEAN_100,
+             os.path.join(args.target_dir, "train-clean-100"),
+             args.manifest_prefix + ".train-clean-100"),
+            (URL_TEST_OTHER, MD5_TEST_OTHER,
+             os.path.join(args.target_dir, "test-other"),
+             args.manifest_prefix + ".test-other"),
+            (URL_DEV_OTHER, MD5_DEV_OTHER,
+             os.path.join(args.target_dir, "dev-other"),
+             args.manifest_prefix + ".dev-other"),
+            (URL_TRAIN_CLEAN_360, MD5_TRAIN_CLEAN_360,
+             os.path.join(args.target_dir, "train-clean-360"),
+             args.manifest_prefix + ".train-clean-360"),
+            (URL_TRAIN_OTHER_500, MD5_TRAIN_OTHER_500,
+             os.path.join(args.target_dir, "train-other-500"),
+             args.manifest_prefix + ".train-other-500"),
+        ])
+
+    with Pool(7) as pool:
+        pool.starmap(prepare_dataset, tasks)
+
+    print("Data download and manifest prepare done!")


 if __name__ == '__main__':

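Pool(7) sizes the worker pool to the seven LibriSpeech subsets, so with --full_download every download and unpack runs concurrently instead of serially; starmap unpacks each 4-tuple into prepare_dataset's positional parameters, which is why the tuples preserve the (url, md5sum, target_dir, manifest_path) argument order of the old keyword calls.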
@@ -23,6 +23,7 @@ import codecs
 import io
 import json
 import os
+from multiprocessing.pool import Pool

 import soundfile
@@ -103,16 +104,18 @@ def main():
     if args.target_dir.startswith('~'):
         args.target_dir = os.path.expanduser(args.target_dir)

-    prepare_dataset(
-        url=URL_TRAIN_CLEAN,
-        md5sum=MD5_TRAIN_CLEAN,
-        target_dir=os.path.join(args.target_dir, "train-clean"),
-        manifest_path=args.manifest_prefix + ".train-clean")
-    prepare_dataset(
-        url=URL_DEV_CLEAN,
-        md5sum=MD5_DEV_CLEAN,
-        target_dir=os.path.join(args.target_dir, "dev-clean"),
-        manifest_path=args.manifest_prefix + ".dev-clean")
+    tasks = [
+        (URL_TRAIN_CLEAN, MD5_TRAIN_CLEAN,
+         os.path.join(args.target_dir, "train-clean"),
+         args.manifest_prefix + ".train-clean"),
+        (URL_DEV_CLEAN, MD5_DEV_CLEAN,
+         os.path.join(args.target_dir, "dev-clean"),
+         args.manifest_prefix + ".dev-clean"),
+    ]
+
+    with Pool(2) as pool:
+        pool.starmap(prepare_dataset, tasks)
+
+    print("Data download and manifest prepare done!")


 if __name__ == '__main__':

@@ -7,14 +7,20 @@ data:
   vocab_filepath: data/vocab.txt
   augmentation_config: conf/augmentation.json
   batch_size: 20
-  max_duration: 27.0
-  min_duration: 0.0
+  min_input_len: 0.0
+  max_input_len: 27.0 # second
+  min_output_len: 0.0
+  max_output_len: 400.0
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
   specgram_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
   stride_ms: 10.0
   window_ms: 20.0
+  delta_delta: False
+  dither: 1.0
   use_dB_normalization: True
   target_dB: -20
   random_seed: 0
@@ -22,18 +28,22 @@ data:
   sortagrad: True
   shuffle_method: batch_shuffle
   num_workers: 0
+
 model:
   num_conv_layers: 2
   num_rnn_layers: 3
   rnn_layer_size: 2048
   use_gru: False
   share_rnn_weights: True
+
 training:
   n_epoch: 50
   lr: 1e-3
   lr_decay: 0.83
   weight_decay: 1e-06
   global_grad_clip: 5.0
+  log_interval: 100
+
 decoding:
   batch_size: 128
   error_rate_type: wer

@@ -0,0 +1,23 @@
+#! /usr/bin/env bash
+
+if [ $# != 2 ];then
+    echo "usage: ${0} ckpt_dir avg_num"
+    exit -1
+fi
+
+ckpt_dir=${1}
+average_num=${2}
+decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
+
+python3 -u ${MAIN_ROOT}/utils/avg_model.py \
+--dst_model ${decode_checkpoint} \
+--ckpt_dir ${ckpt_dir} \
+--num ${average_num} \
+--val_best
+
+if [ $? -ne 0 ]; then
+    echo "Failed in avg ckpt!"
+    exit 1
+fi
+
+exit 0

@ -1,11 +1,19 @@
#! /usr/bin/env bash #! /usr/bin/env bash
stage=-1
stop_stage=100
unit_type=char
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests # download data, generate manifests
PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/librispeech/librispeech.py \ python3 ${TARGET_DIR}/librispeech/librispeech.py \
--manifest_prefix="data/manifest" \ --manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/librispeech" \ --target_dir="${TARGET_DIR}/librispeech" \
--full_download="True" --full_download="True"
@ -15,30 +23,80 @@ if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
cat data/manifest.train-* | shuf > data/manifest.train for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${set} data/manifest.${set}.raw
done
for set in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw
done
for set in dev-clean dev-other; do
cat data/manifest.${set}.raw >> data/manifest.dev.raw
done
for set in test-clean test-other; do
cat data/manifest.${set}.raw >> data/manifest.test.raw
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# build vocabulary # build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \ python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type ${unit_type} \
--count_threshold=0 \ --count_threshold=0 \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \
--manifest_paths="data/manifest.train" --manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated." echo "Build vocabulary failed. Terminated."
exit 1 exit 1
fi fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# compute mean and stddev for normalizer # compute mean and stddev for normalizer
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train" \ --manifest_path="data/manifest.train.raw" \
--num_samples=2000 \ --num_samples=-1 \
--specgram_type="linear" \ --specgram_type="linear" \
--output_path="data/mean_std.npz" --delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=20.0 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated." echo "Compute mean and stddev failed. Terminated."
exit 1 exit 1
fi fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest.${set} failed. Terminated."
exit 1
fi
}&
done
wait
fi
echo "LibriSpeech Data preparation done." echo "LibriSpeech Data preparation done."
exit 0 exit 0

@@ -0,0 +1,3 @@
+data
+exp
+*log

@@ -3,18 +3,24 @@ data:
   train_manifest: data/manifest.tiny
   dev_manifest: data/manifest.tiny
   test_manifest: data/manifest.tiny
-  mean_std_filepath: data/mean_std.npz
+  mean_std_filepath: data/mean_std.json
   vocab_filepath: data/vocab.txt
   augmentation_config: conf/augmentation.json
   batch_size: 4
-  max_duration: 27.0
-  min_duration: 0.0
+  min_input_len: 0.0
+  max_input_len: 27.0
+  min_output_len: 0.0
+  max_output_len: 400.0
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
   specgram_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
   stride_ms: 10.0
   window_ms: 20.0
+  delta_delta: False
+  dither: 1.0
   use_dB_normalization: True
   target_dB: -20
   random_seed: 0
@@ -22,18 +28,22 @@ data:
   sortagrad: True
   shuffle_method: batch_shuffle
   num_workers: 0
+
 model:
   num_conv_layers: 2
   num_rnn_layers: 3
   rnn_layer_size: 2048
   use_gru: False
   share_rnn_weights: True
+
 training:
   n_epoch: 20
   lr: 1e-5
   lr_decay: 1.0
   weight_decay: 1e-06
   global_grad_clip: 5.0
+  log_interval: 1
+
 decoding:
   batch_size: 128
   error_rate_type: wer

@@ -0,0 +1,23 @@
+#! /usr/bin/env bash
+
+if [ $# != 2 ];then
+    echo "usage: ${0} ckpt_dir avg_num"
+    exit -1
+fi
+
+ckpt_dir=${1}
+average_num=${2}
+decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
+
+python3 -u ${MAIN_ROOT}/utils/avg_model.py \
+--dst_model ${decode_checkpoint} \
+--ckpt_dir ${ckpt_dir} \
+--num ${average_num} \
+--val_best
+
+if [ $? -ne 0 ]; then
+    echo "Failed in avg ckpt!"
+    exit 1
+fi
+
+exit 0

@@ -3,10 +3,7 @@
 stage=-1
 stop_stage=100

-# bpemode (unigram or bpe)
-nbpe=200
-bpemode=unigram
-bpeprefix="data/bpe_${bpemode}_${nbpe}"
+unit_type=char

 source ${MAIN_ROOT}/utils/parse_options.sh

@@ -32,10 +29,8 @@ fi
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # build vocabulary
     python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
+    --unit_type ${unit_type} \
+    --count_threshold=0 \
     --vocab_path="data/vocab.txt" \
     --manifest_paths="data/manifest.tiny.raw"

@@ -51,12 +46,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="fbank" \
-    --feat_dim=80 \
+    --specgram_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --window_ms=20.0 \
     --use_dB_normalization=False \
     --num_workers=2 \
     --output_path="data/mean_std.json"

@@ -73,8 +67,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${MAIN_ROOT}/utils/format_data.py \
     --feat_type "raw" \
     --cmvn_path "data/mean_std.json" \
-    --unit_type "spm" \
-    --spm_model_prefix ${bpeprefix} \
+    --unit_type ${unit_type} \
     --vocab_path="data/vocab.txt" \
     --manifest_path="data/manifest.tiny.raw" \
     --output_path="data/manifest.tiny"

@@ -2,6 +2,7 @@
 set -e
 source path.sh

+gpus=0
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml

@@ -18,7 +19,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=0 ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

@@ -28,10 +29,10 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi

@@ -1 +0,0 @@
-../../s0/local/data.sh

@@ -0,0 +1,90 @@
+#! /usr/bin/env bash
+
+stage=-1
+stop_stage=100
+
+# bpemode (unigram or bpe)
+nbpe=200
+bpemode=unigram
+bpeprefix="data/bpe_${bpemode}_${nbpe}"
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+mkdir -p data
+TARGET_DIR=${MAIN_ROOT}/examples/dataset
+mkdir -p ${TARGET_DIR}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # download data, generate manifests
+    python3 ${TARGET_DIR}/librispeech/librispeech.py \
+    --manifest_prefix="data/manifest" \
+    --target_dir="${TARGET_DIR}/librispeech" \
+    --full_download="False"
+
+    if [ $? -ne 0 ]; then
+        echo "Prepare LibriSpeech failed. Terminated."
+        exit 1
+    fi
+
+    head -n 64 data/manifest.dev-clean > data/manifest.tiny.raw
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type "spm" \
+    --spm_vocab_size=${nbpe} \
+    --spm_mode ${bpemode} \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths="data/manifest.tiny.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # compute mean and stddev for normalizer
+    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+    --manifest_path="data/manifest.tiny.raw" \
+    --num_samples=64 \
+    --specgram_type="fbank" \
+    --feat_dim=80 \
+    --delta_delta=false \
+    --sample_rate=16000 \
+    --stride_ms=10.0 \
+    --window_ms=25.0 \
+    --use_dB_normalization=False \
+    --num_workers=2 \
+    --output_path="data/mean_std.json"
+
+    if [ $? -ne 0 ]; then
+        echo "Compute mean and stddev failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    python3 ${MAIN_ROOT}/utils/format_data.py \
+    --feat_type "raw" \
+    --cmvn_path "data/mean_std.json" \
+    --unit_type "spm" \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --manifest_path="data/manifest.tiny.raw" \
+    --output_path="data/manifest.tiny"
+
+    if [ $? -ne 0 ]; then
+        echo "Format manifest failed. Terminated."
+        exit 1
+    fi
+fi
+
+echo "LibriSpeech Data preparation done."
+exit 0
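
For reference, the core of what build_vocab.py's "spm" mode trains, sketched with the standard sentencepiece Python API (the input file and flags mirror the script's nbpe=200/bpemode=unigram defaults but are illustrative):

    import sentencepiece as spm

    # train a 200-piece unigram model from one transcript per line
    spm.SentencePieceTrainer.Train(
        '--input=data/transcripts.txt '
        '--model_prefix=data/bpe_unigram_200 '
        '--vocab_size=200 '
        '--model_type=unigram')

    sp = spm.SentencePieceProcessor()
    sp.Load('data/bpe_unigram_200.model')
    print(sp.EncodeAsPieces('speech recognition'))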