Merge pull request #2518 from Zth9730/wav2vec2.0
[ASR] wav2vec2 ASR, pre-trained wav2vec2 based CTC for librispeech (pull/2544/head)
commit
f1ca564731
@ -0,0 +1,8 @@
|
|||||||
|
# LibriSpeech
|
||||||
|
|
||||||
|
## Wav2VecASR
|
||||||
|
train: Epoch 1, 1*V100-32G, batchsize:10
|
||||||
|
|
||||||
|
| Model | Params | Config | Augmentation| Test set | Decode method | WER |
|
||||||
|
| --- | --- | --- | --- | --- | --- | --- |
|
||||||
|
| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018887 |
|
@ -0,0 +1,4 @@
|
|||||||
|
process:
|
||||||
|
# use raw audio
|
||||||
|
- type: wav_process
|
||||||
|
dither: 0.0
|
@ -0,0 +1,4 @@
|
|||||||
|
decode_batch_size: 1
|
||||||
|
error_rate_type: wer
|
||||||
|
decoding_method: ctc_greedy_search # 'ctc_greedy_search', 'ctc_prefix_beam_search'
|
||||||
|
beam_size: 10
|
@ -0,0 +1,120 @@
|
|||||||
|
############################################
|
||||||
|
# Network Architecture #
|
||||||
|
############################################
|
||||||
|
freeze_wav2vec2: True
|
||||||
|
normalize_wav: True
|
||||||
|
output_norm: True
|
||||||
|
dnn_blocks: 2
|
||||||
|
dnn_neurons: 1024
|
||||||
|
blank_id: 0
|
||||||
|
ctc_dropout_rate: 0.0
|
||||||
|
wav2vec2_params_path: "exp/wav2vec2/wav2vec2-large-960h-lv60-self.pdparams"
|
||||||
|
|
||||||
|
############################################
|
||||||
|
# Wav2Vec2.0 #
|
||||||
|
############################################
|
||||||
|
vocab_size: 32
|
||||||
|
hidden_size: 1024
|
||||||
|
num_hidden_layers: 24
|
||||||
|
num_attention_heads: 16
|
||||||
|
intermediate_size: 4096
|
||||||
|
hidden_act: "gelu"
|
||||||
|
hidden_dropout: 0.1
|
||||||
|
activation_dropout: 0.1
|
||||||
|
attention_dropout: 0.1
|
||||||
|
feat_proj_dropout: 0.1
|
||||||
|
feat_quantizer_dropout: 0.0
|
||||||
|
final_dropout: 0.1
|
||||||
|
layerdrop: 0.1
|
||||||
|
initializer_range: 0.02
|
||||||
|
layer_norm_eps: 1e-5
|
||||||
|
feat_extract_norm: "layer"
|
||||||
|
feat_extract_activation: "gelu"
|
||||||
|
conv_dim: [512, 512, 512, 512, 512, 512, 512]
|
||||||
|
conv_stride: [5, 2, 2, 2, 2, 2, 2]
|
||||||
|
conv_kernel: [10, 3, 3, 3, 3, 2, 2]
|
||||||
|
conv_bias: True
|
||||||
|
num_conv_pos_embeddings: 128
|
||||||
|
num_conv_pos_embedding_groups: 16
|
||||||
|
do_stable_layer_norm: True
|
||||||
|
apply_spec_augment: False
|
||||||
|
mask_time_prob: 0.05
|
||||||
|
mask_time_length: 10
|
||||||
|
mask_time_min_masks: 2
|
||||||
|
mask_feature_prob: 0.0
|
||||||
|
mask_feature_length: 10
|
||||||
|
mask_feature_min_masks: 0
|
||||||
|
num_codevectors_per_group: 320
|
||||||
|
num_codevector_groups: 2
|
||||||
|
contrastive_logits_temperature: 0.1
|
||||||
|
num_negatives: 100
|
||||||
|
codevector_dim: 256
|
||||||
|
proj_codevector_dim: 256
|
||||||
|
diversity_loss_weight: 0.1
|
||||||
|
ctc_loss_reduction: "sum"
|
||||||
|
ctc_zero_infinity: False
|
||||||
|
use_weighted_layer_sum: False
|
||||||
|
pad_token_id: 0
|
||||||
|
bos_token_id: 1
|
||||||
|
eos_token_id: 2
|
||||||
|
add_adapter: False
|
||||||
|
adapter_kernel_size: 3
|
||||||
|
adapter_stride: 2
|
||||||
|
num_adapter_layers: 3
|
||||||
|
output_hidden_size: None
|
||||||
|
|
||||||
|
###########################################
|
||||||
|
# Data #
|
||||||
|
###########################################
|
||||||
|
train_manifest: data/manifest.train
|
||||||
|
dev_manifest: data/manifest.dev
|
||||||
|
test_manifest: data/manifest.test-clean
|
||||||
|
|
||||||
|
|
||||||
|
###########################################
|
||||||
|
# Dataloader #
|
||||||
|
###########################################
|
||||||
|
vocab_filepath: data/lang_char/vocab.txt
|
||||||
|
unit_type: 'char'
|
||||||
|
mean_std_filepath: ""
|
||||||
|
preprocess_config: conf/preprocess.yaml
|
||||||
|
sortagrad: -1 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
|
||||||
|
batch_size: 10 # Different batch_size may cause large differences in results
|
||||||
|
maxlen_in: 51200000000 # if input length > maxlen_in, the batch size is automatically reduced
|
||||||
|
maxlen_out: 1500000 # if output length > maxlen_out, the batch size is automatically reduced
|
||||||
|
minibatches: 0 # for debug
|
||||||
|
batch_count: auto
|
||||||
|
batch_bins: 0
|
||||||
|
batch_frames_in: 0
|
||||||
|
batch_frames_out: 0
|
||||||
|
batch_frames_inout: 0
|
||||||
|
num_workers: 0
|
||||||
|
subsampling_factor: 1
|
||||||
|
num_encs: 1
|
||||||
|
dist_sampler: True
|
||||||
|
shortest_first: True
|
||||||
|
return_lens_rate: True
|
||||||
|
|
||||||
|
|
||||||
|
###########################################
|
||||||
|
# Training #
|
||||||
|
###########################################
|
||||||
|
n_epoch: 1
|
||||||
|
accum_grad: 1
|
||||||
|
global_grad_clip: 3.0
|
||||||
|
model_optim: adadelta
|
||||||
|
model_optim_conf:
|
||||||
|
lr: 0.9
|
||||||
|
epsilon: 1.0e-6
|
||||||
|
rho: 0.95
|
||||||
|
scheduler: constantlr
|
||||||
|
scheduler_conf:
|
||||||
|
warmup_steps: 25000
|
||||||
|
lr_decay: 1.0
|
||||||
|
log_interval: 1
|
||||||
|
checkpoint:
|
||||||
|
kbest_n: 50
|
||||||
|
latest_n: 5
|
||||||
|
augment: True
|
||||||
|
|
||||||
|
|
@ -0,0 +1,110 @@
|
|||||||
|
#!/bin/bash
# LibriSpeech data preparation for the wav2vec2 ASR recipe.
#   stage -1: download the corpus and build the raw manifests
#   stage  0: compute mean/stddev for the normalizer
#   stage  1: build the vocabulary
#   stage  2: format the manifests (token ids, vocab size)
#   stage  3: download the pretrained wav2vec2 parameters

stage=-1
stop_stage=100

unit_type=char
dict_dir=data/lang_char

# NOTE(review): presumably lets command-line options override the variables
# defined above -- confirm against utils/parse_options.sh.
source ${MAIN_ROOT}/utils/parse_options.sh

mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    # download data, generate manifests
    python3 ${TARGET_DIR}/librispeech/librispeech.py \
        --manifest_prefix="data/manifest" \
        --target_dir="${TARGET_DIR}/librispeech" \
        --full_download="True"

    if [ $? -ne 0 ]; then
        echo "Prepare LibriSpeech failed. Terminated."
        exit 1
    fi

    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
        mv data/manifest.${set} data/manifest.${set}.raw
    done

    # start from empty merged manifests, the loops below append to them
    rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
    for set in train-clean-100 train-clean-360 train-other-500; do
        cat data/manifest.${set}.raw >> data/manifest.train.raw
    done

    for set in dev-clean dev-other; do
        cat data/manifest.${set}.raw >> data/manifest.dev.raw
    done

    for set in test-clean test-other; do
        cat data/manifest.${set}.raw >> data/manifest.test.raw
    done
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # compute mean and stddev for normalizer
    num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
        --manifest_path="data/manifest.train.raw" \
        --num_samples=2000 \
        --spectrum_type="fbank" \
        --feat_dim=161 \
        --delta_delta=false \
        --sample_rate=16000 \
        --stride_ms=10 \
        --window_ms=25 \
        --use_dB_normalization=False \
        --num_workers=${num_workers} \
        --output_path="data/mean_std.json"

    if [ $? -ne 0 ]; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # build vocabulary
    python3 ${MAIN_ROOT}/utils/build_vocab.py \
        --unit_type ${unit_type} \
        --count_threshold=0 \
        --vocab_path="${dict_dir}/vocab.txt" \
        --manifest_paths="data/manifest.train.raw"

    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
    # Each set is formatted in a background subshell. `exit 1` inside the
    # subshell only ends that job, so the job pids are collected and waited
    # on one by one to propagate a failure to this script.
    format_pids=()
    for set in train dev test dev-clean dev-other test-clean test-other; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
            --cmvn_path "data/mean_std.json" \
            --unit_type ${unit_type} \
            --vocab_path="${dict_dir}/vocab.txt" \
            --manifest_path="data/manifest.${set}.raw" \
            --output_path="data/manifest.${set}"

        if [ $? -ne 0 ]; then
            echo "Format manifest.${set} failed. Terminated."
            exit 1
        fi
    }&
    format_pids+=($!)
    done

    # a bare `wait` always returns 0; `wait <pid>` returns the job's status
    format_failed=0
    for pid in "${format_pids[@]}"; do
        if ! wait "${pid}"; then
            format_failed=1
        fi
    done
    if [ ${format_failed} -ne 0 ]; then
        exit 1
    fi
fi

echo "LibriSpeech Data preparation done."

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    mkdir -p exp/wav2vec2
    echo "Pretrained wav2vec2 model download"
    wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/wav2vec2-large-960h-lv60-self.pdparams
fi

exit 0
|
@ -0,0 +1,84 @@
|
|||||||
|
#!/bin/bash
# Decode LibriSpeech test-clean with a trained checkpoint and score the result.
# positional arguments: config_path decode_config_path ckpt_prefix

set -e

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

expdir=exp
datadir=data

train_set=train_960
# only test-clean is scored by the commands below
recog_set="test-clean"

config_path=$1
decode_config_path=$2
ckpt_prefix=$3

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

# download language model
#bash local/download_lm_en.sh
#if [ $? -ne 0 ]; then
#    exit 1
#fi

# reference transcripts in the text format expected by compute-wer.py
python3 utils/format_rsl.py \
    --origin_ref data/manifest.test-clean.raw \
    --trans_ref data/manifest.test-clean.text

# Decode with one CTC method, convert the hypotheses to text and score them.
#   $1: value passed as decode.decoding_method
#   $2: value passed as decode.decode_batch_size
decode_and_score() {
    local method=$1
    local decode_batch_size=$2

    echo "decoding ${method}"
    # The command is tested directly: with `set -e` a separate `$?` check
    # placed after it would never run when the command fails.
    if ! python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${method}.rsl \
        --checkpoint_path ${ckpt_prefix} \
        --opts decode.decoding_method ${method} \
        --opts decode.decode_batch_size ${decode_batch_size}; then
        echo "Failed in evaluation!"
        exit 1
    fi

    python3 utils/format_rsl.py \
        --origin_hyp ${ckpt_prefix}.${method}.rsl \
        --trans_hyp ${ckpt_prefix}.${method}.rsl.text

    python3 utils/compute-wer.py --char=1 --v=1 \
        data/manifest.test-clean.text ${ckpt_prefix}.${method}.rsl.text > ${ckpt_prefix}.${method}.error
    echo "decoding ${method} done."
}

decode_and_score ctc_greedy_search 16
decode_and_score ctc_prefix_beam_search 1

echo "Finished"

exit 0
|
@ -0,0 +1,58 @@
|
|||||||
|
#!/bin/bash
# Transcribe a single audio file with a trained checkpoint.

if [ $# != 4 ];then
    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
decode_config_path=$2
ckpt_prefix=$3
audio_file=$4

# fetch the demo recording (-nc: skip when it is already present)
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
if [ $? -ne 0 ]; then
    exit 1
fi

# quoted so that an empty path or a path containing spaces is tested as one word
if [ ! -f "${audio_file}" ]; then
    echo "Please input the right audio_file path"
    exit 1
fi

chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
fi

# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
#    exit 1
#fi

for type in ctc_greedy_search; do
    echo "decoding ${type}"
    batch_size=1
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test_wav.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${output_dir}/${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
        --opts decode.decoding_method ${type} \
        --opts decode.decode_batch_size ${batch_size} \
        --audio_file "${audio_file}"

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
done
exit 0
|
@ -0,0 +1,55 @@
|
|||||||
|
#!/bin/bash
# Train the wav2vec2 ASR model on CPU (no visible GPU) or through
# paddle.distributed.launch on the GPUs listed in CUDA_VISIBLE_DEVICES.

# Accept 2 or 3 arguments. The two tests must be OR-ed: an argument count
# cannot be below 2 and above 3 at the same time.
if [ $# -lt 2 ] || [ $# -gt 3 ];then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_name=$2
ips=$3

if [ ! $ips ];then
    ips_config=
else
    ips_config="--ips="${ips}
fi

mkdir -p exp

# seed may break model convergence
seed=1998
if [ ${seed} != 0 ]; then
    export FLAGS_cudnn_deterministic=True
fi

# export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then
    python3 -u ${BIN_DIR}/train.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --output exp/${ckpt_name} \
        --seed ${seed}
else
    python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --output exp/${ckpt_name} \
        --seed ${seed}
fi
# Save the training status now: the `unset` block below would overwrite `$?`.
train_status=$?

if [ ${seed} != 0 ]; then
    unset FLAGS_cudnn_deterministic
fi

if [ ${train_status} -ne 0 ]; then
    echo "Failed in training!"
    exit 1
fi

exit 0
|
@ -0,0 +1,15 @@
|
|||||||
|
# Environment for the recipe scripts; run.sh sources this file.

# three directory levels above the current working directory
export MAIN_ROOT=$(realpath ${PWD}/../../../)

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

# directory holding the train.py / test.py / test_wav.py entry points
MODEL=wav2vec2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
|
@ -0,0 +1,47 @@
|
|||||||
|
#!/bin/bash
# Recipe driver: runs the stages in [stage, stop_stage].
#   0: data preparation   1: training   2: checkpoint averaging
#   3: decoding on the test set   4: decoding of a single wav file
set -e

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

gpus=0
stage=0
stop_stage=0
conf_path=conf/wav2vec2ASR.yaml
ips=            #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=1
dict_path=data/lang_char/vocab.txt

. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

audio_file=data/demo_002_en.wav

avg_ckpt=avg_${avg_num}
# checkpoint name = config file name up to its first dot
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # greedy search decoder
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi
|
@ -0,0 +1 @@
|
|||||||
|
../../../utils
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,64 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Evaluation for wav2vec2.0 model."""
|
||||||
|
import cProfile
|
||||||
|
|
||||||
|
from yacs.config import CfgNode
|
||||||
|
|
||||||
|
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
|
||||||
|
from paddlespeech.s2t.training.cli import default_argument_parser
|
||||||
|
from paddlespeech.s2t.utils.utility import print_arguments
|
||||||
|
|
||||||
|
|
||||||
|
def main_sp(config, args):
    """Build the tester and run the test pass inside its ``eval()`` context."""
    tester = Tester(config, args)
    with tester.eval():
        tester.setup()
        tester.run_test()
|
||||||
|
|
||||||
|
|
||||||
|
def main(config, args):
    """Entry point handed to the profiler; delegates to ``main_sp``."""
    main_sp(config, args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = default_argument_parser()
    # extra options: dictionary path and where to save the ASR result
    parser.add_argument(
        '--dict-path', type=str, default=None, help='dict path.')
    parser.add_argument(
        "--result_file", type=str, help="path of save the asr result")
    args = parser.parse_args()
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
    config = CfgNode(new_allowed=True)
    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        # decoding options are loaded separately and attached as `config.decode`
        decode_node = CfgNode(new_allowed=True)
        decode_node.merge_from_file(args.decode_cfg)
        config.decode = decode_node
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as dump_fp:
            print(config, file=dump_fp)

    # Run under cProfile and write the statistics to the working directory
    profiler = cProfile.Profile()
    profiler.runcall(main, config, args)
    profiler.dump_stats('test.profile')
|
@ -0,0 +1,118 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Evaluation for wav2vec2.0 model."""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import soundfile
|
||||||
|
from yacs.config import CfgNode
|
||||||
|
|
||||||
|
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
|
||||||
|
from paddlespeech.s2t.training.cli import default_argument_parser
|
||||||
|
from paddlespeech.s2t.utils.log import Log
|
||||||
|
from paddlespeech.s2t.utils.utility import UpdateConfig
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2vec2Infer():
    """Transcribe one audio file with a Wav2vec2ASR checkpoint."""

    def __init__(self, config, args):
        """Build the text featurizer and the model, then load the checkpoint.

        Args:
            config: configuration node. ``unit_type``, ``vocab_filepath`` and
                (later, in ``run``) ``decode`` are read from it; ``output_dim``
                is written to it.
            args: parsed command-line arguments. ``audio_file``, ``ngpu`` and
                ``checkpoint_path`` are read from it.
        """
        self.args = args
        self.config = config
        self.audio_file = args.audio_file

        self.text_feature = TextFeaturizer(
            unit_type=config.unit_type, vocab=config.vocab_filepath)
        paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')

        # model
        model_conf = config
        with UpdateConfig(model_conf):
            model_conf.output_dim = self.text_feature.vocab_size
        model = Wav2vec2ASR.from_config(model_conf)
        self.model = model
        self.model.eval()

        # load model: parameters are stored in "<checkpoint_path>.pdparams"
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)

    def run(self):
        """Validate the audio file, decode it and return the transcript.

        Returns:
            The first entry of the transcripts returned by ``model.decode``.
        """
        # Use the path stored on the instance; the module-level `args` only
        # exists when this file is executed as a script.
        check(self.audio_file)

        with paddle.no_grad():
            # read
            audio, _ = soundfile.read(
                self.audio_file, dtype="int16", always_2d=True)
            logger.info(f"audio shape: {audio.shape}")

            # add a leading axis in front of the samples read from the file
            xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
            decode_config = self.config.decode
            result_transcripts, result_tokenids = self.model.decode(
                xs,
                text_feature=self.text_feature,
                decoding_method=decode_config.decoding_method,
                beam_size=decode_config.beam_size)
            rsl = result_transcripts[0]
            utt = Path(self.audio_file).name
            logger.info(f"hyp: {utt} {rsl}")
            return rsl
|
||||||
|
|
||||||
|
|
||||||
|
def check(audio_file):
    """Check that *audio_file* exists, can be read and is sampled at 16 kHz.

    Every failure is reported and ends the process with ``sys.exit(-1)``.

    Args:
        audio_file: path of the audio file to validate.
    """
    if not os.path.isfile(audio_file):
        print("Please input the right audio file path")
        sys.exit(-1)

    logger.info("checking the audio file format......")
    try:
        # only the sample rate is needed, the samples are discarded
        _, sample_rate = soundfile.read(audio_file)
    except Exception as e:
        logger.error(str(e))
        logger.error(
            "can not open the wav file, please check the audio file format")
        sys.exit(-1)
    logger.info("The sample rate is %d" % sample_rate)
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if sample_rate != 16000:
        logger.error(
            "the sample rate must be 16000, but got %d" % sample_rate)
        sys.exit(-1)
    logger.info("The audio file format is right")
|
||||||
|
|
||||||
|
|
||||||
|
def main(config, args):
    """Create a ``Wav2vec2Infer`` for the given config/args and run it."""
    infer = Wav2vec2Infer(config, args)
    infer.run()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = default_argument_parser()
    # extra options: where to save the ASR result and which file to decode
    parser.add_argument(
        "--result_file", type=str, help="path of save the asr result")
    parser.add_argument(
        "--audio_file", type=str, help="path of the input audio file")
    args = parser.parse_args()

    config = CfgNode(new_allowed=True)

    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        # decoding options are loaded separately and attached as `config.decode`
        decode_node = CfgNode(new_allowed=True)
        decode_node.merge_from_file(args.decode_cfg)
        config.decode = decode_node
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    main(config, args)
|
@ -0,0 +1,54 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Trainer for wav2vec2.0 model."""
|
||||||
|
import cProfile
|
||||||
|
import os
|
||||||
|
|
||||||
|
from yacs.config import CfgNode
|
||||||
|
|
||||||
|
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer
|
||||||
|
from paddlespeech.s2t.training.cli import default_argument_parser
|
||||||
|
from paddlespeech.s2t.utils.utility import print_arguments
|
||||||
|
|
||||||
|
|
||||||
|
def main_sp(config, args):
    """Build the trainer, set it up and run it."""
    trainer = Trainer(config, args)
    trainer.setup()
    trainer.run()
|
||||||
|
|
||||||
|
|
||||||
|
def main(config, args):
    """Entry point handed to the profiler; delegates to ``main_sp``."""
    main_sp(config, args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = default_argument_parser()
    args = parser.parse_args()
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
    config = CfgNode(new_allowed=True)
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    if args.dump_config:
        with open(args.dump_config, 'w') as dump_fp:
            print(config, file=dump_fp)

    # Run under cProfile and store the statistics in the output directory
    profiler = cProfile.Profile()
    profiler.runcall(main, config, args)
    profile_path = os.path.join(args.output, 'train.profile')
    profiler.dump_stats(profile_path)
|
@ -0,0 +1,459 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Contains wav2vec2 model."""
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from collections import defaultdict
|
||||||
|
from collections import OrderedDict
|
||||||
|
from contextlib import nullcontext
|
||||||
|
|
||||||
|
import jsonlines
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
from paddle import distributed as dist
|
||||||
|
|
||||||
|
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
|
||||||
|
from paddlespeech.s2t.io.dataloader import DataLoaderFactory
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
|
||||||
|
from paddlespeech.s2t.training.optimizer import OptimizerFactory
|
||||||
|
from paddlespeech.s2t.training.reporter import ObsScope
|
||||||
|
from paddlespeech.s2t.training.reporter import report
|
||||||
|
from paddlespeech.s2t.training.scheduler import LRSchedulerFactory
|
||||||
|
from paddlespeech.s2t.training.timer import Timer
|
||||||
|
from paddlespeech.s2t.training.trainer import Trainer
|
||||||
|
from paddlespeech.s2t.utils import error_rate
|
||||||
|
from paddlespeech.s2t.utils import layer_tools
|
||||||
|
from paddlespeech.s2t.utils import mp_tools
|
||||||
|
from paddlespeech.s2t.utils.log import Log
|
||||||
|
from paddlespeech.s2t.utils.utility import UpdateConfig
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2ASRTrainer(Trainer):
|
||||||
|
def __init__(self, config, args):
    """Initialise the base trainer and reset the running training loss."""
    super().__init__(config, args)
    # running average of the training loss, updated in train_batch
    self.avg_train_loss = 0
|
||||||
|
|
||||||
|
def update_average(self, batch_index, loss, avg_loss):
    """Fold the loss of one batch into the running average.

    Arguments
    ---------
    batch_index : int
        index of the current batch; ``batch_index + 1`` is the divisor.
    loss : paddle.tensor
        detached loss, a single float value.
    avg_loss : float
        running average before this batch.

    Returns
    -------
    avg_loss : float
        The updated average, or the input ``avg_loss`` unchanged when
        ``loss`` is not finite.
    """
    # a non-finite loss is left out of the average
    if not paddle.isfinite(loss):
        return avg_loss

    n_seen = batch_index + 1
    avg_loss -= avg_loss / n_seen
    avg_loss += float(loss) / n_seen
    return avg_loss
|
||||||
|
|
||||||
|
def train_batch(self, batch_index, batch, msg):
|
||||||
|
train_conf = self.config
|
||||||
|
start = time.time()
|
||||||
|
|
||||||
|
# forward
|
||||||
|
utt, wav, wavs_lens, target, target_lens = batch
|
||||||
|
wavs_lens_rate = wavs_lens / wav.shape[1]
|
||||||
|
target_lens_rate = target_lens / target.shape[1]
|
||||||
|
wav = wav[:, :, 0]
|
||||||
|
wav = self.speech_augmentation(wav, wavs_lens_rate)
|
||||||
|
loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
|
||||||
|
# loss div by `batch_size * accum_grad`
|
||||||
|
loss /= train_conf.accum_grad
|
||||||
|
|
||||||
|
self.avg_train_loss = self.update_average(batch_index, loss,
|
||||||
|
self.avg_train_loss)
|
||||||
|
|
||||||
|
# loss backward
|
||||||
|
if (batch_index + 1) % train_conf.accum_grad != 0:
|
||||||
|
# Disable gradient synchronizations across DDP processes.
|
||||||
|
# Within this context, gradients will be accumulated on module
|
||||||
|
# variables, which will later be synchronized.
|
||||||
|
# When using cpu w/o DDP, model does not have `no_sync`
|
||||||
|
context = self.model.no_sync if (hasattr(self.model, "no_sync") and
|
||||||
|
self.parallel) else nullcontext
|
||||||
|
else:
|
||||||
|
# Used for single gpu training and DDP gradient synchronization
|
||||||
|
# processes.
|
||||||
|
context = nullcontext
|
||||||
|
with context():
|
||||||
|
loss.backward()
|
||||||
|
layer_tools.print_grads(self.model, print_func=None)
|
||||||
|
|
||||||
|
# optimizer step old
|
||||||
|
if (batch_index + 1) % train_conf.accum_grad == 0:
|
||||||
|
self.optimizer.step()
|
||||||
|
self.optimizer.clear_grad()
|
||||||
|
self.lr_scheduler.step()
|
||||||
|
self.iteration += 1
|
||||||
|
|
||||||
|
losses_np = {'loss': float(self.avg_train_loss) * train_conf.accum_grad}
|
||||||
|
iteration_time = time.time() - start
|
||||||
|
for k, v in losses_np.items():
|
||||||
|
report(k, v)
|
||||||
|
report("batch_size", self.config.batch_size)
|
||||||
|
report("accum", train_conf.accum_grad)
|
||||||
|
report("step_cost", iteration_time)
|
||||||
|
|
||||||
|
if (batch_index + 1) % train_conf.accum_grad == 0:
|
||||||
|
if dist.get_rank() == 0 and self.visualizer:
|
||||||
|
losses_np_v = losses_np.copy()
|
||||||
|
losses_np_v.update({"lr": self.lr_scheduler()})
|
||||||
|
for key, val in losses_np_v.items():
|
||||||
|
self.visualizer.add_scalar(
|
||||||
|
tag='train/' + key, value=val, step=self.iteration - 1)
|
||||||
|
|
||||||
|
@paddle.no_grad()
|
||||||
|
def valid(self):
|
||||||
|
self.model.eval()
|
||||||
|
if not self.use_streamdata:
|
||||||
|
logger.info(
|
||||||
|
f"Valid Total Examples: {len(self.valid_loader.dataset)}")
|
||||||
|
valid_losses = defaultdict(list)
|
||||||
|
num_seen_utts = 1
|
||||||
|
total_loss = 0.0
|
||||||
|
for i, batch in enumerate(self.valid_loader):
|
||||||
|
utt, wav, wavs_lens, target, target_lens = batch
|
||||||
|
wavs_lens_rate = wavs_lens / wav.shape[1]
|
||||||
|
target_lens_rate = target_lens / target.shape[1]
|
||||||
|
wav = wav[:, :, 0]
|
||||||
|
loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
|
||||||
|
|
||||||
|
if paddle.isfinite(loss):
|
||||||
|
num_utts = batch[1].shape[0]
|
||||||
|
num_seen_utts += num_utts
|
||||||
|
total_loss += float(loss) * num_utts
|
||||||
|
valid_losses['val_loss'].append(float(loss))
|
||||||
|
|
||||||
|
if (i + 1) % self.config.log_interval == 0:
|
||||||
|
valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
|
||||||
|
valid_dump['val_history_loss'] = total_loss / num_seen_utts
|
||||||
|
|
||||||
|
# logging
|
||||||
|
msg = f"Valid: Rank: {dist.get_rank()}, "
|
||||||
|
msg += "epoch: {}, ".format(self.epoch)
|
||||||
|
msg += "step: {}, ".format(self.iteration)
|
||||||
|
if not self.use_streamdata:
|
||||||
|
msg += "batch: {}/{}, ".format(i + 1,
|
||||||
|
len(self.valid_loader))
|
||||||
|
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||||
|
for k, v in valid_dump.items())
|
||||||
|
logger.info(msg)
|
||||||
|
|
||||||
|
logger.info('Rank {} Val info val_loss {}'.format(
|
||||||
|
dist.get_rank(), total_loss / num_seen_utts))
|
||||||
|
return total_loss, num_seen_utts
|
||||||
|
|
||||||
|
def do_train(self):
|
||||||
|
"""The training process control by step."""
|
||||||
|
# !!!IMPORTANT!!!
|
||||||
|
# Try to export the model by script, if fails, we should refine
|
||||||
|
# the code to satisfy the script export requirements
|
||||||
|
# script_model = paddle.jit.to_static(self.model)
|
||||||
|
# script_model_path = str(self.checkpoint_dir / 'init')
|
||||||
|
# paddle.jit.save(script_model, script_model_path)
|
||||||
|
|
||||||
|
self.before_train()
|
||||||
|
|
||||||
|
if not self.use_streamdata:
|
||||||
|
logger.info(
|
||||||
|
f"Train Total Examples: {len(self.train_loader.dataset)}")
|
||||||
|
while self.epoch < self.config.n_epoch:
|
||||||
|
with Timer("Epoch-Train Time Cost: {}"):
|
||||||
|
self.model.train()
|
||||||
|
try:
|
||||||
|
data_start_time = time.time()
|
||||||
|
for batch_index, batch in enumerate(self.train_loader):
|
||||||
|
dataload_time = time.time() - data_start_time
|
||||||
|
msg = "Train:"
|
||||||
|
observation = OrderedDict()
|
||||||
|
with ObsScope(observation):
|
||||||
|
report("Rank", dist.get_rank())
|
||||||
|
report("epoch", self.epoch)
|
||||||
|
report('step', self.iteration)
|
||||||
|
report("lr", self.lr_scheduler())
|
||||||
|
self.train_batch(batch_index, batch, msg)
|
||||||
|
self.after_train_batch()
|
||||||
|
report('iter', batch_index + 1)
|
||||||
|
if not self.use_streamdata:
|
||||||
|
report('total', len(self.train_loader))
|
||||||
|
report('reader_cost', dataload_time)
|
||||||
|
observation['batch_cost'] = observation[
|
||||||
|
'reader_cost'] + observation['step_cost']
|
||||||
|
observation['samples'] = observation['batch_size']
|
||||||
|
observation['ips,samples/s'] = observation[
|
||||||
|
'batch_size'] / observation['batch_cost']
|
||||||
|
for k, v in observation.items():
|
||||||
|
msg += f" {k.split(',')[0]}: "
|
||||||
|
msg += f"{v:>.8f}" if isinstance(v,
|
||||||
|
float) else f"{v}"
|
||||||
|
msg += f" {k.split(',')[1]}" if len(
|
||||||
|
k.split(',')) == 2 else ""
|
||||||
|
msg += ","
|
||||||
|
msg = msg[:-1] # remove the last ","
|
||||||
|
if (batch_index + 1) % self.config.log_interval == 0:
|
||||||
|
logger.info(msg)
|
||||||
|
data_start_time = time.time()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(e)
|
||||||
|
raise e
|
||||||
|
with Timer("Eval Time Cost: {}"):
|
||||||
|
total_loss, num_seen_utts = self.valid()
|
||||||
|
if dist.get_world_size() > 1:
|
||||||
|
num_seen_utts = paddle.to_tensor(num_seen_utts)
|
||||||
|
# the default operator in all_reduce function is sum.
|
||||||
|
dist.all_reduce(num_seen_utts)
|
||||||
|
total_loss = paddle.to_tensor(total_loss)
|
||||||
|
dist.all_reduce(total_loss)
|
||||||
|
cv_loss = total_loss / num_seen_utts
|
||||||
|
cv_loss = float(cv_loss)
|
||||||
|
else:
|
||||||
|
cv_loss = total_loss / num_seen_utts
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
|
||||||
|
if self.visualizer:
|
||||||
|
self.visualizer.add_scalar(
|
||||||
|
tag='eval/cv_loss', value=cv_loss, step=self.epoch)
|
||||||
|
self.visualizer.add_scalar(
|
||||||
|
tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
|
||||||
|
|
||||||
|
self.save(tag=self.epoch, infos={'val_loss': cv_loss})
|
||||||
|
self.new_epoch()
|
||||||
|
|
||||||
|
def setup_dataloader(self):
|
||||||
|
config = self.config.clone()
|
||||||
|
self.use_streamdata = config.get("use_stream_data", False)
|
||||||
|
if self.train:
|
||||||
|
self.train_loader = DataLoaderFactory.get_dataloader(
|
||||||
|
'train', config, self.args)
|
||||||
|
self.valid_loader = DataLoaderFactory.get_dataloader(
|
||||||
|
'valid', config, self.args)
|
||||||
|
logger.info("Setup train/valid Dataloader!")
|
||||||
|
else:
|
||||||
|
decode_batch_size = config.get('decode', dict()).get(
|
||||||
|
'decode_batch_size', 1)
|
||||||
|
self.test_loader = DataLoaderFactory.get_dataloader('test', config,
|
||||||
|
self.args)
|
||||||
|
self.align_loader = DataLoaderFactory.get_dataloader(
|
||||||
|
'align', config, self.args)
|
||||||
|
logger.info("Setup test/align Dataloader!")
|
||||||
|
|
||||||
|
def setup_model(self):
|
||||||
|
config = self.config
|
||||||
|
model_conf = config
|
||||||
|
|
||||||
|
with UpdateConfig(model_conf):
|
||||||
|
if self.train:
|
||||||
|
model_conf.input_dim = self.train_loader.feat_dim
|
||||||
|
model_conf.output_dim = self.train_loader.vocab_size
|
||||||
|
else:
|
||||||
|
model_conf.input_dim = self.test_loader.feat_dim
|
||||||
|
model_conf.output_dim = self.test_loader.vocab_size
|
||||||
|
|
||||||
|
model = Wav2vec2ASR.from_config(model_conf)
|
||||||
|
|
||||||
|
if self.parallel:
|
||||||
|
model = paddle.DataParallel(model, find_unused_parameters=True)
|
||||||
|
|
||||||
|
logger.info(f"{model}")
|
||||||
|
layer_tools.print_params(model, logger.info)
|
||||||
|
self.model = model
|
||||||
|
logger.info("Setup model!")
|
||||||
|
|
||||||
|
# setup speech augmentation for wav2vec2
|
||||||
|
self.speech_augmentation = TimeDomainSpecAugment()
|
||||||
|
|
||||||
|
if not self.train:
|
||||||
|
return
|
||||||
|
|
||||||
|
train_config = config
|
||||||
|
optim_type = train_config.model_optim
|
||||||
|
optim_conf = train_config.model_optim_conf
|
||||||
|
scheduler_type = train_config.scheduler
|
||||||
|
scheduler_conf = train_config.scheduler_conf
|
||||||
|
|
||||||
|
scheduler_args = {
|
||||||
|
"learning_rate": optim_conf.lr,
|
||||||
|
"verbose": False,
|
||||||
|
"warmup_steps": scheduler_conf.warmup_steps,
|
||||||
|
"gamma": scheduler_conf.lr_decay,
|
||||||
|
"d_model": model_conf.dnn_neurons,
|
||||||
|
}
|
||||||
|
lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
|
||||||
|
scheduler_args)
|
||||||
|
|
||||||
|
def optimizer_args(
|
||||||
|
config,
|
||||||
|
parameters,
|
||||||
|
lr_scheduler=None, ):
|
||||||
|
train_config = config
|
||||||
|
optim_type = train_config.model_optim
|
||||||
|
optim_conf = train_config.model_optim_conf
|
||||||
|
scheduler_type = train_config.scheduler
|
||||||
|
scheduler_conf = train_config.scheduler_conf
|
||||||
|
return {
|
||||||
|
"grad_clip": train_config.global_grad_clip,
|
||||||
|
"learning_rate": lr_scheduler
|
||||||
|
if lr_scheduler else optim_conf.lr,
|
||||||
|
"epsilon": optim_conf.epsilon,
|
||||||
|
"rho": optim_conf.rho,
|
||||||
|
"parameters": parameters,
|
||||||
|
"beta1": 0.9 if optim_type == 'noam' else None,
|
||||||
|
"beat2": 0.98 if optim_type == 'noam' else None,
|
||||||
|
}
|
||||||
|
|
||||||
|
optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
|
||||||
|
optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
|
||||||
|
|
||||||
|
self.optimizer = optimizer
|
||||||
|
self.lr_scheduler = lr_scheduler
|
||||||
|
logger.info("Setup optimizer/lr_scheduler!")
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer):
    """Decodes the test set and reports error rate and real-time factor."""

    def __init__(self, config, args):
        super().__init__(config, args)
        self.text_featurizer = TextFeaturizer(
            unit_type=config.unit_type, vocab=config.vocab_filepath)
        self.vocab_list = self.text_featurizer.vocab_list

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or NaN for a zero denominator."""
        return numerator / denominator if denominator else float("nan")

    def id2token(self, texts, texts_len):
        """Convert padded token-id tensors back to transcripts.

        Each row of ``texts`` is cut to its length from ``texts_len`` and
        passed to the text featurizer's ``defeaturize``.
        """
        trans = []
        for text, n in zip(texts, texts_len):
            n = n.numpy().item()
            ids = text[:n]
            trans.append(self.text_featurizer.defeaturize(ids.numpy().tolist()))
        return trans

    def compute_metrics(self,
                        utts,
                        audio,
                        audio_len,
                        texts,
                        texts_len,
                        fout=None):
        """Decode one batch and accumulate its error statistics.

        When ``fout`` is given, one record per utterance (reference,
        hypothesis and hypothesis token ids) is written to it.

        Returns a dict with ``errors_sum``, ``len_refs``, ``num_ins``,
        ``error_rate``, ``error_rate_type``, ``num_frames`` and
        ``decode_time``. ``error_rate`` is NaN when the batch contains no
        reference tokens.
        """
        decode_cfg = self.config.decode
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
        error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer

        # Only the model's decode call is timed.
        start_time = time.time()
        target_transcripts = self.id2token(texts, texts_len)
        result_transcripts, result_tokenids = self.model.decode(
            audio,
            text_feature=self.text_featurizer,
            decoding_method=decode_cfg.decoding_method,
            beam_size=decode_cfg.beam_size)
        decode_time = time.time() - start_time

        for utt, target, result, rec_tids in zip(
                utts, target_transcripts, result_transcripts, result_tokenids):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
            if fout:
                fout.write({
                    "utt": utt,
                    "refs": [target],
                    "hyps": [result],
                    "hyps_tokenid": [rec_tids],
                })
            logger.info(f"Utt: {utt}")
            logger.info(f"Ref: {target}")
            logger.info(f"Hyp: {result}")
            logger.info("One example error rate [%s] = %f" % (
                decode_cfg.error_rate_type, error_rate_func(target, result)))

        return dict(
            errors_sum=errors_sum,
            len_refs=len_refs,
            num_ins=num_ins,  # num examples
            error_rate=self._ratio(errors_sum, len_refs),
            error_rate_type=decode_cfg.error_rate_type,
            num_frames=audio_len.sum().numpy().item(),
            decode_time=decode_time)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
        """Decode the whole test set and write results plus a summary file.

        Per-utterance records go to ``args.result_file`` (JSON lines); the
        aggregate summary goes to the same path with a ``.err`` extension.
        """
        logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
        self.model.eval()

        error_rate_type = None
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        num_frames = 0.0
        num_time = 0.0
        # Defined up front so an empty test loader cannot leave it unbound.
        rtf = float("nan")

        with jsonlines.open(self.args.result_file, 'w') as fout:
            for batch in self.test_loader:
                metrics = self.compute_metrics(*batch, fout=fout)
                num_frames += metrics['num_frames']
                num_time += metrics["decode_time"]
                errors_sum += metrics['errors_sum']
                len_refs += metrics['len_refs']
                num_ins += metrics['num_ins']
                error_rate_type = metrics['error_rate_type']
                rtf = self._ratio(num_time, num_frames)
                logger.info(
                    "RTF: %f, Error rate [%s] (%d/?) = %f" %
                    (rtf, error_rate_type, num_ins,
                     self._ratio(errors_sum, len_refs)))

        final_error_rate = self._ratio(errors_sum, len_refs)

        # logging
        msg = "Test: "
        msg += "epoch: {}, ".format(self.epoch)
        msg += "step: {}, ".format(self.iteration)
        msg += "Final error rate [%s] (%d/%d) = %f" % (
            error_rate_type, num_ins, num_ins, final_error_rate)
        logger.info(msg)

        err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err'
        with open(err_meta_path, 'w') as f:
            data = json.dumps({
                "epoch":
                self.epoch,
                "step":
                self.iteration,
                "rtf":
                rtf,
                error_rate_type:
                final_error_rate,
                "dataset_hour": (num_frames) / 1000.0 / 3600.0,
                "process_hour":
                num_time / 1000.0 / 3600.0,
                "num_examples":
                num_ins,
                "err_sum":
                errors_sum,
                "ref_len":
                len_refs,
                "decode_method":
                self.config.decode.decoding_method,
            })
            f.write(data + '\n')
|
@ -0,0 +1,44 @@
|
|||||||
|
"""Vanilla Neural Network for simple tests.
|
||||||
|
Authors
|
||||||
|
* Elena Rastorgueva 2020
|
||||||
|
"""
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.modules import containers
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.modules import linear
|
||||||
|
|
||||||
|
|
||||||
|
class VanillaNN(containers.Sequential):
    """Stack of fully connected blocks, each followed by an activation.

    Arguments
    ---------
    input_shape : iterable
        Expected shape of the input, forwarded to the ``Sequential`` base.
    activation : paddle class
        Class instantiated (without arguments) after every linear layer.
    dnn_blocks : int
        How many linear + activation pairs to stack.
    dnn_neurons : int
        Output width of every linear layer.

    Example
    -------
    >>> inputs = paddle.rand([10, 120, 60])
    >>> model = VanillaNN(input_shape=inputs.shape)
    >>> outputs = model(inputs)
    >>> outputs.shape
    paddle.shape([10, 120, 512])
    """

    def __init__(
            self,
            input_shape,
            activation=paddle.nn.LeakyReLU,
            dnn_blocks=2,
            dnn_neurons=512, ):
        super().__init__(input_shape=input_shape)

        linear_kwargs = dict(n_neurons=dnn_neurons, bias=True)
        for _ in range(dnn_blocks):
            # Names repeat across blocks; `append` is left to disambiguate.
            self.append(linear.Linear, layer_name="linear", **linear_kwargs)
            self.append(activation(), layer_name="act")
|
@ -0,0 +1,180 @@
|
|||||||
|
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import math

import paddle
from paddle import nn
from paddle import Tensor

from paddlespeech.s2t.utils.log import Log
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
|
||||||
|
class NewGELUActivation(nn.Layer):
    """
    Tanh-based GELU as currently implemented in the Google BERT repo
    (identical to OpenAI GPT). See the Gaussian Error Linear Units paper:
    https://arxiv.org/abs/1606.08415
    """

    def forward(self, input: Tensor) -> Tensor:
        # 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
        cubic_term = input + 0.044715 * paddle.pow(input, 3.0)
        squashed = paddle.tanh(math.sqrt(2.0 / math.pi) * cubic_term)
        return 0.5 * input * (1.0 + squashed)
|
||||||
|
|
||||||
|
|
||||||
|
class GELUActivation(nn.Layer):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    paddle.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * paddle.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415

    Arguments
    ---------
    use_gelu_python : bool
        If True, use the erf-based Python implementation (``_gelu_python``)
        instead of ``nn.functional.gelu``.
    """

    def __init__(self, use_gelu_python: bool=False):
        super().__init__()
        # The flag used to be ignored, which left `_gelu_python` unreachable
        # and made the "gelu_python" registry entry identical to "gelu".
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input: Tensor) -> Tensor:
        """Erf-based GELU: x * 0.5 * (1 + erf(x / sqrt(2)))."""
        return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0)))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)
|
||||||
|
|
||||||
|
|
||||||
|
class FastGELUActivation(nn.Layer):
    """
    GELU approximation that is slower than QuickGELU but more accurate.
    See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        # 0.7978845608 is the constant multiplying the input inside tanh.
        tanh_arg = input * 0.7978845608 * (1.0 + 0.044715 * input * input)
        return 0.5 * input * (1.0 + paddle.tanh(tanh_arg))
|
||||||
|
|
||||||
|
|
||||||
|
class QuickGELUActivation(nn.Layer):
    """
    Sigmoid-based GELU approximation: fast but somewhat inaccurate.
    See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        gate = paddle.sigmoid(1.702 * input)
        return input * gate
|
||||||
|
|
||||||
|
|
||||||
|
class ClippedGELUActivation(nn.Layer):
    """
    GELU whose output is clipped to the range [min, max]. This is especially
    useful for quantization, as it allows mapping negative values in the GELU
    spectrum. For more information on this trick, see
    https://arxiv.org/abs/2004.09602.

    The GELU applied before clipping is the module-level ``gelu`` activation
    of this file. For information: OpenAI GPT's gelu is slightly different
    (and gives slightly different results): 0.5 * x * (1 +
    paddle.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * paddle.pow(x, 3)))).
    See https://arxiv.org/abs/1606.08415

    Raises
    ------
    ValueError
        If ``min`` is greater than ``max``.
    """

    def __init__(self, min: float, max: float):
        # Validate the bounds before the layer is initialised.
        if min > max:
            raise ValueError(
                f"min should be < max (got min: {min}, max: {max})")

        super().__init__()
        self.min, self.max = min, max

    def forward(self, x: Tensor) -> Tensor:
        activated = gelu(x)
        return paddle.clip(activated, self.min, self.max)
|
||||||
|
|
||||||
|
|
||||||
|
class SiLUActivation(nn.Layer):
    """
    Sigmoid Linear Unit (SiLU). Introduced and named in Gaussian Error Linear
    Units (Hendrycks et al., https://arxiv.org/abs/1606.08415); later studied
    in Sigmoid-Weighted Linear Units for Neural Network Function Approximation
    in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118)
    and Swish: a Self-Gated Activation Function (Ramachandran et al.,
    https://arxiv.org/abs/1710.05941v1).
    """

    def __init__(self):
        super().__init__()
        # forward() always dispatches to the built-in functional op.
        self.act = nn.functional.silu

    def _silu_python(self, input: Tensor) -> Tensor:
        """Plain-Python SiLU, x * sigmoid(x); not used by ``forward``."""
        gate = paddle.sigmoid(input)
        return input * gate

    def forward(self, input: Tensor) -> Tensor:
        activation_fn = self.act
        return activation_fn(input)
|
||||||
|
|
||||||
|
|
||||||
|
class MishActivation(nn.Layer):
    """
    Mish activation. See Mish: A Self-Regularized Non-Monotonic Activation
    Function (Misra., https://arxiv.org/abs/1908.08681) and the official
    repository for the paper: https://github.com/digantamisra98/Mish
    """

    def __init__(self):
        super().__init__()
        # forward() always dispatches to the built-in functional op.
        self.act = nn.functional.mish

    def _mish_python(self, input: Tensor) -> Tensor:
        """Plain-Python Mish, x * tanh(softplus(x)); not used by ``forward``."""
        softened = nn.functional.softplus(input)
        return input * paddle.tanh(softened)

    def forward(self, input: Tensor) -> Tensor:
        activation_fn = self.act
        return activation_fn(input)
|
||||||
|
|
||||||
|
|
||||||
|
class LinearActivation(nn.Layer):
    """
    Identity activation: the input is returned as the output, unchanged.
    """

    def forward(self, input: Tensor) -> Tensor:
        # Nothing to compute; hand the very same tensor back.
        return input
|
||||||
|
|
||||||
|
|
||||||
|
# Registry mapping an activation name to a ready-to-use layer instance.
# The instances are created once, at import time, and shared by all callers.
ACT2FN = {
    "gelu": GELUActivation(),
    "gelu_10": ClippedGELUActivation(-10, 10),
    "gelu_fast": FastGELUActivation(),
    "gelu_new": NewGELUActivation(),
    "gelu_python": GELUActivation(use_gelu_python=True),
    "linear": LinearActivation(),
    "mish": MishActivation(),
    "quick_gelu": QuickGELUActivation(),
    "relu": nn.ReLU(),
    "sigmoid": nn.Sigmoid(),
    "silu": SiLUActivation(),
    "swish": SiLUActivation(),
    "tanh": nn.Tanh(),
}


def get_activation(activation_string):
    """Return the activation registered in ``ACT2FN`` under the given name.

    Raises
    ------
    KeyError
        If the name is not registered; the message lists the known names.
    """
    if activation_string not in ACT2FN:
        raise KeyError(
            f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
        )
    return ACT2FN[activation_string]


# For backwards compatibility with: from activations import gelu_python
gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")
|
@ -0,0 +1,129 @@
|
|||||||
|
import inspect
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
|
||||||
|
class Sequential(paddle.nn.LayerDict):
    """A sequence of modules with potentially inferring shape on construction.
    If layers are passed with names, these can be referenced with dot notation.

    Arguments
    ---------
    input_shape : iterable
        A list or tuple of ints or None, representing the expected shape of an
        input tensor. None represents a variable-length dimension. If no
        ``input_shape`` is passed, no shape inference will be performed.
    *layers, **named_layers
        The inputs are treated as a list of layers to be
        applied in sequence. The output shape of each layer is used to
        infer the shape of the following layer. If a tuple is returned,
        only the shape of the first element is used to determine input
        shape of the next layer (e.g. RNN returns output, hidden).

    Example
    -------
    >>> inputs = paddle.rand([10, 40, 50])
    >>> model = Sequential(input_shape=inputs.shape)
    >>> model.append(Linear, n_neurons=100, layer_name="layer1")
    >>> model.append(Linear, n_neurons=200, layer_name="layer2")
    >>> outputs = model(inputs)
    >>> outputs.shape
    paddle.shape([10, 40, 200])
    >>> outputs = model.layer1(inputs)
    >>> outputs.shape
    paddle.shape([10, 40, 100])
    """

    def __init__(self, *layers, input_shape=None, **named_layers):
        super().__init__()

        # Make sure either layers or input_shape is passed
        if not layers and input_shape is None and not named_layers:
            raise ValueError("Must pass either layers or input shape")

        # Keep track of what layers need "lengths" passed
        self.length_layers = []

        # Replace None dimensions with arbitrary value
        self.input_shape = input_shape
        if input_shape and None in input_shape:
            self.input_shape = list(input_shape)
            for i, dim in enumerate(self.input_shape):

                # To reduce size of dummy tensors, use 1 for batch dim
                if i == 0 and dim is None:
                    dim = 1

                # Use 256 as nice round arbitrary value, big enough that
                # halving this dimension a few times doesn't reach 1
                self.input_shape[i] = dim or 256

        # Append non-named layers
        for layer in layers:
            self.append(layer)

        # Append named layers
        for name, layer in named_layers.items():
            self.append(layer, layer_name=name)

    def append(self, layer, *args, layer_name=None, **kwargs):
        """Add a layer to the list of layers, inferring shape if necessary.

        Arguments
        ---------
        layer : A paddle.nn.Layer class or object
            If the layer is a class, it should accept an argument called
            ``input_shape`` which will be inferred and passed. If the layer
            is a layer object, it is added as-is.
        layer_name : str
            The name of the layer, for reference. If the name is in use,
            ``_{count}`` will be appended.
        *args, **kwargs
            These are passed to the layer if it is constructed.

        Raises
        ------
        ValueError
            If the value cannot be registered as a sub-layer, e.g. an
            unconstructed class when no ``input_shape`` is available.
        """

        # Compute layer_name
        if layer_name is None:
            layer_name = str(len(self))
        elif layer_name in self:
            index = 0
            while f"{layer_name}_{index}" in self:
                index += 1
            layer_name = f"{layer_name}_{index}"
        # Check if it needs to be constructed with input shape
        if self.input_shape:
            argspec = inspect.getfullargspec(layer)
            if "input_shape" in argspec.args + argspec.kwonlyargs:
                input_shape = self.get_output_shape()
                layer = layer(*args, input_shape=input_shape, **kwargs)

        # Finally, append the layer.
        try:
            self[layer_name] = layer
        except (TypeError, AssertionError) as err:
            # NOTE(review): Paddle's sub-layer registration appears to reject
            # non-Layer values with an assertion rather than a TypeError, so
            # both are translated - confirm against the Paddle version in use.
            raise ValueError(
                "Must pass `input_shape` at initialization and use "
                "modules that take `input_shape` to infer shape when "
                "using `append()`.") from err

    def get_output_shape(self):
        """Returns expected shape of the output.
        Computed by passing dummy input constructed with the
        ``self.input_shape`` attribute.
        """
        with paddle.no_grad():
            dummy_input = paddle.zeros(self.input_shape)
            dummy_output = self(dummy_input)
        return dummy_output.shape

    def forward(self, x):
        """Applies layers in sequence, passing only the first element of tuples.

        Arguments
        ---------
        x : paddle.Tensor
            The input tensor to run through the network.
        """
        for layer in self.values():
            x = layer(x)
            if isinstance(x, tuple):
                x = x[0]

        return x
|
@ -0,0 +1,72 @@
|
|||||||
|
"""Library implementing linear transformation.
|
||||||
|
Authors
|
||||||
|
* Mirco Ravanelli 2020
|
||||||
|
* Davide Borra 2021
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from paddlespeech.s2t.modules import align
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Linear(paddle.nn.Layer):
    """Computes a linear transformation y = wx + b.

    Arguments
    ---------
    n_neurons : int
        It is the number of output neurons (i.e, the dimensionality of the
        output).
    input_shape: tuple
        It is the shape of the input tensor.
    input_size: int
        Size of the input tensor.
    bias : bool
        If True, the additive bias b is adopted.
    combine_dims : bool
        If True and the input is 4D, combine 3rd and 4th dimensions of input.

    Raises
    ------
    ValueError
        If neither ``input_shape`` nor ``input_size`` is given.

    Example
    -------
    >>> inputs = paddle.rand([10, 50, 40])
    >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100)
    >>> output = lin_t(inputs)
    >>> output.shape
    paddle.shape([10, 50, 100])
    """

    def __init__(
            self,
            n_neurons,
            input_shape=None,
            input_size=None,
            bias=True,
            combine_dims=False, ):
        super().__init__()
        self.combine_dims = combine_dims

        if input_shape is None and input_size is None:
            raise ValueError("Expected one of input_shape or input_size")

        # An explicit input_size wins; otherwise derive it from input_shape.
        if input_size is None:
            input_size = input_shape[-1]
            if len(input_shape) == 4 and self.combine_dims:
                input_size = input_shape[2] * input_shape[3]

        # Weights are initialized following paddle approach
        self.w = align.Linear(input_size, n_neurons, bias_attr=bias)

    def forward(self, x):
        """Returns the linear transformation of input tensor.

        Arguments
        ---------
        x : paddle.Tensor
            Input to transform linearly.
        """
        # `ndim` is the number of dimensions. The previous `x.rank == 4`
        # compared a method object with an int, so this branch never ran.
        if x.ndim == 4 and self.combine_dims:
            # Paddle's reshape takes the target shape as a single list.
            x = x.reshape([x.shape[0], x.shape[1], x.shape[2] * x.shape[3]])

        wx = self.w(x)

        return wx
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,236 @@
|
|||||||
|
"""
|
||||||
|
Low level signal processing utilities
|
||||||
|
Authors
|
||||||
|
* Peter Plantinga 2020
|
||||||
|
* Francois Grondin 2020
|
||||||
|
* William Aris 2020
|
||||||
|
* Samuele Cornell 2020
|
||||||
|
* Sarthak Yadav 2022
|
||||||
|
"""
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
|
||||||
|
def blackman_window(window_length, periodic=True):
    """Blackman window function.

    Arguments
    ---------
    window_length : int
        Controlling the returned window size.
    periodic : bool
        Determines whether the returned window trims off the
        last duplicate value from the symmetric window

    Returns
    -------
    A 1-D tensor of size (window_length) containing the window.
    Note that a ``window_length`` of 0 yields an empty Python list.
    """
    # Degenerate sizes are answered directly.
    if window_length == 0:
        return []
    if window_length == 1:
        return paddle.ones([1])

    # A periodic window is the symmetric window of one extra point with
    # the trailing point dropped.
    n_points = window_length + 1 if periodic else window_length
    phase = paddle.arange(n_points) * (np.pi / (n_points - 1))
    symmetric = (
        0.08 * paddle.cos(phase * 4) - 0.5 * paddle.cos(phase * 2) + 0.42)

    if periodic:
        return symmetric[:-1]
    return symmetric
|
||||||
|
|
||||||
|
|
||||||
|
def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
    """Compute amplitude of a batch of waveforms.

    Arguments
    ---------
    waveforms : tensor
        The waveforms used for computing amplitude.
        Shape should be `[time]` or `[batch, time]` or
        `[batch, time, channels]`. A 1-D input is treated as a batch of one.
    lengths : tensor
        The lengths of the waveforms excluding the padding.
        Shape should be a single dimension, `[batch]`. When given, the
        "avg" amplitude is the absolute sum divided by ``lengths``.
    amp_type : str
        Whether to compute "avg" average or "peak" amplitude.
        Choose between ["avg", "peak"].
    scale : str
        Whether to compute amplitude in "dB" or "linear" scale.
        Choose between ["linear", "dB"].

    Returns
    -------
    The average (or peak) amplitude of the waveforms, reduced over axis 1
    with that axis kept.

    Example
    -------
    >>> signal = paddle.sin(paddle.arange(16000.0)).unsqueeze(0)
    >>> amplitude = compute_amplitude(signal, signal.shape[1])
    >>> # amplitude is a [1, 1] tensor holding roughly 0.6366
    """
    if len(waveforms.shape) == 1:
        waveforms = waveforms.unsqueeze(0)

    assert amp_type in ["avg", "peak"]
    assert scale in ["linear", "dB"]

    if amp_type == "avg":
        if lengths is None:
            out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
        else:
            wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
            out = wav_sum / lengths
    elif amp_type == "peak":
        # paddle.max returns only the maxima (no indices), so the result is
        # used as-is; indexing it with [0] would select batch item 0.
        out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)
    else:
        # Only reachable when assertions are disabled.
        raise NotImplementedError

    if scale == "linear":
        return out
    elif scale == "dB":
        return paddle.clip(20 * paddle.log10(out), min=-80)  # clamp zeros
    else:
        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
def convolve1d(
        waveform,
        kernel,
        padding=0,
        pad_type="constant",
        stride=1,
        groups=1,
        use_fft=False,
        rotation_index=0, ):
    """Pad and convolve a batch of signals along the time axis.

    Arguments
    ---------
    waveform : tensor
        The tensor to perform operations on. Must be 3-dimensional, with
        time on the second axis.
    kernel : tensor
        The filter to apply during convolution, laid out like `waveform`.
    padding : int or tuple
        The padding (pad_left, pad_right) to apply.
        If an integer is passed instead, this is passed
        to the conv1d function and pad_type is ignored.
    pad_type : str
        The type of padding to use. Passed directly to
        `paddle.nn.functional.pad`, see Paddle documentation
        for available options.
    stride : int
        The number of units to move each time convolution is applied.
        Passed to conv1d. Has no effect if `use_fft` is True.
    groups : int
        This option is passed to `conv1d` to split the input into groups for
        convolution. Input channels should be divisible by the number of groups.
    use_fft : bool
        When `use_fft` is passed `True`, then compute the convolution in the
        spectral domain using complex multiply. WARNING: without padding,
        circular convolution occurs.
    rotation_index : int
        This option only applies if `use_fft` is true. If so, the kernel is
        rolled by this amount before convolution to shift the output location.

    Returns
    -------
    The convolved waveform, with time back on the second axis.

    Raises
    ------
    ValueError
        If `waveform` is not 3-dimensional.
    """
    if len(waveform.shape) != 3:
        raise ValueError("Convolve1D expects a 3-dimensional tensor")

    # pad / fft / conv all operate on the last axis, so put time there.
    signal = waveform.transpose([0, 2, 1])
    filt = kernel.transpose([0, 2, 1])

    # A tuple means explicit (left, right) padding applied up front.
    explicit_padding = isinstance(padding, tuple)
    if explicit_padding:
        signal = paddle.nn.functional.pad(
            x=signal, pad=padding, mode=pad_type, data_format='NCL')

    if not use_fft:
        # Direct convolution; an integer padding is handed to conv1d itself.
        conv_padding = 0 if explicit_padding else padding
        convolved = paddle.nn.functional.conv1d(
            x=signal,
            weight=filt,
            stride=stride,
            groups=groups,
            padding=conv_padding, )
        return convolved.transpose([0, 2, 1])

    # Spectral path: bring the kernel to the signal length.
    zero_length = signal.shape[-1] - filt.shape[-1]
    if zero_length < 0:
        # Signal shorter than kernel: truncate the kernel instead.
        filt = filt[..., :zero_length]
        zero_length = 0

    # Insert the zeros between the two rotated kernel halves.
    filler = paddle.zeros(
        [filt.shape[0], filt.shape[1], zero_length], dtype=filt.dtype)
    rotated_head = filt[..., rotation_index:]
    rotated_tail = filt[..., :rotation_index]
    filt = paddle.concat((rotated_head, filler, rotated_tail), axis=-1)

    # Multiplication in the frequency domain == convolution in time.
    import paddle.fft as fft

    spectrum = fft.rfft(signal) * fft.rfft(filt)
    convolved = fft.irfft(spectrum, n=signal.shape[-1])
    return convolved.transpose([0, 2, 1])
|
||||||
|
|
||||||
|
|
||||||
|
def notch_filter(notch_freq, filter_width=101, notch_width=0.05):
    """Returns a notch filter constructed from a high-pass and low-pass filter.

    (from https://tomroelandts.com/articles/
    how-to-create-simple-band-pass-and-band-reject-filters)

    Arguments
    ---------
    notch_freq : float
        frequency to put notch as a fraction of the
        sampling rate / 2. The range of possible inputs is 0 to 1
        (0 itself is rejected).
    filter_width : int
        Filter width in samples; must be odd. Longer filters have
        smaller transition bands, but are more inefficient.
    notch_width : float
        Width of the notch, as a fraction of the sampling_rate / 2.

    Returns
    -------
    The filter as a tensor of shape `[1, filter_width, 1]`.
    """

    # Check inputs
    assert 0 < notch_freq <= 1
    assert filter_width % 2 != 0
    pad = filter_width // 2
    inputs = paddle.arange(filter_width) - pad

    # Avoid frequencies that are too low. Rebinding (rather than `+=`)
    # guarantees a tensor passed by the caller is never modified.
    notch_freq = notch_freq + notch_width

    # Define sinc function, avoiding division by zero
    def sinc(x):
        "Computes the sinc function."

        def _sinc(x):
            return paddle.sin(x) / x

        # The zero is at the middle index, where the value 1 is substituted
        return paddle.concat(
            [_sinc(x[:pad]), paddle.ones([1]), _sinc(x[pad + 1:])])

    # Compute a low-pass filter with cutoff frequency notch_freq.
    hlpf = sinc(3 * (notch_freq - notch_width) * inputs)
    hlpf *= blackman_window(filter_width)
    hlpf /= paddle.sum(hlpf)

    # Compute a high-pass filter with cutoff frequency notch_freq.
    hhpf = sinc(3 * (notch_freq + notch_width) * inputs)
    hhpf *= blackman_window(filter_width)
    hhpf /= -paddle.sum(hhpf)
    hhpf[pad] += 1

    # Adding filters creates notch filter. reshape with a shape list is
    # used instead of the torch-style `.view(1, -1, 1)` call.
    return (hlpf + hhpf).reshape([1, -1, 1])
|
@ -0,0 +1,716 @@
|
|||||||
|
import math
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddle.nn as nn
|
||||||
|
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.processing.signal_processing import compute_amplitude
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.processing.signal_processing import convolve1d
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.processing.signal_processing import notch_filter
|
||||||
|
|
||||||
|
|
||||||
|
class SpeedPerturb(nn.Layer):
    """Slightly speed up or slow down an audio signal.

    Resample the audio signal at a rate that is similar to the original rate,
    to achieve a slightly slower or slightly faster signal. This technique is
    outlined in the paper: "Audio Augmentation for Speech Recognition"

    Arguments
    ---------
    orig_freq : int
        The frequency of the original signal.
    speeds : list
        The speeds that the signal should be changed to, as a percentage of the
        original signal (i.e. `speeds` is divided by 100 to get a ratio).
        Any sequence is accepted; it is copied on construction.
    perturb_prob : float
        The chance that the batch will be speed-
        perturbed. By default, every batch is perturbed.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> perturbator = SpeedPerturb(orig_freq=16000, speeds=[90])
    >>> clean = signal.unsqueeze(0)
    >>> perturbed = perturbator(clean)
    >>> clean.shape
    [1, 52173]
    >>> perturbed.shape
    [1, 46956]
    """

    def __init__(
            self,
            orig_freq,
            speeds=(90, 100, 110),
            perturb_prob=1.0, ):
        super().__init__()
        self.orig_freq = orig_freq
        # Copy the sequence so neither a shared default nor later changes
        # to the caller's list can get out of sync with `self.resamplers`.
        self.speeds = list(speeds)
        self.perturb_prob = perturb_prob

        # Initialize index of perturbation
        self.samp_index = 0

        # Initialize resamplers: one per speed, targeting the scaled rate
        self.resamplers = [
            Resample(
                orig_freq=self.orig_freq,
                new_freq=self.orig_freq * speed // 100)
            for speed in self.speeds
        ]

    def forward(self, waveform):
        """Resample the batch with a randomly chosen speed.

        Arguments
        ---------
        waveform : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`
        (an unmodified copy when the batch is not perturbed).
        """

        # Don't perturb (return early) 1-`perturb_prob` portion of the batches
        if paddle.rand([1]) > self.perturb_prob:
            return waveform.clone()

        # Perform a random perturbation. The drawn index is converted to a
        # plain int before being used to index the Python list.
        self.samp_index = int(
            paddle.randint(len(self.speeds), shape=(1, ))[0])
        return self.resamplers[self.samp_index](waveform)
|
||||||
|
|
||||||
|
|
||||||
|
class Resample(nn.Layer):
    """This class resamples an audio signal using sinc-based interpolation.

    It is a modification of the `resample` function from torchaudio
    (https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html)

    The filter bank (`self.weights`, `self.first_indices`) is built on the
    first call to `forward` and reused afterwards.

    Arguments
    ---------
    orig_freq : int
        the sampling frequency of the input signal.
    new_freq : int
        the new sampling frequency after this operation is performed.
    lowpass_filter_width : int
        Controls the sharpness of the filter, larger numbers result in a
        sharper filter, but they are less efficient. Values from 4 to 10 are allowed.
    """

    def __init__(
            self,
            orig_freq=16000,
            new_freq=16000,
            lowpass_filter_width=6, ):
        super().__init__()
        self.orig_freq = orig_freq
        self.new_freq = new_freq
        self.lowpass_filter_width = lowpass_filter_width

        # Compute rate for striding
        self._compute_strides()
        # Sanity checks: each stride must divide its own frequency.
        assert self.orig_freq % self.conv_stride == 0
        assert self.new_freq % self.conv_transpose_stride == 0

    def _compute_strides(self):
        """Compute the phases in polyphase filter.

        Sets `self.output_samples`, `self.conv_stride` and
        `self.conv_transpose_stride` from the two frequencies reduced by
        their greatest common divisor.

        (almost directly from torchaudio.compliance.kaldi)
        """

        # Compute new unit based on ratio of in/out frequencies
        base_freq = math.gcd(self.orig_freq, self.new_freq)
        input_samples_in_unit = self.orig_freq // base_freq
        self.output_samples = self.new_freq // base_freq

        # Store the appropriate stride based on the new units
        self.conv_stride = input_samples_in_unit
        self.conv_transpose_stride = self.output_samples

    def forward(self, waveforms):
        """Resample a batch of waveforms from `orig_freq` to `new_freq`.

        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        The input itself is returned when both frequencies are equal.

        Raises
        ------
        ValueError
            If the input is neither 2- nor 3-dimensional.
        """

        # Build the filters lazily, once per instance.
        if not hasattr(self, "first_indices"):
            self._indices_and_weights(waveforms)

        # Don't do anything if the frequencies are the same
        if self.orig_freq == self.new_freq:
            return waveforms
        # Bring the input to [batch, channels, time] for the convolutions.
        unsqueezed = False
        if len(waveforms.shape) == 2:
            waveforms = waveforms.unsqueeze(1)
            unsqueezed = True
        elif len(waveforms.shape) == 3:
            waveforms = waveforms.transpose([0, 2, 1])
        else:
            raise ValueError("Input must be 2 or 3 dimensions")

        # Do resampling
        resampled_waveform = self._perform_resample(waveforms)

        # Undo the layout change made above.
        if unsqueezed:
            resampled_waveform = resampled_waveform.squeeze(1)
        else:
            resampled_waveform = resampled_waveform.transpose([0, 2, 1])

        return resampled_waveform

    def _perform_resample(self, waveforms):
        """Resamples the waveform at the new frequency.

        This matches Kaldi's OfflineFeatureTpl ResampleWaveform which uses a
        LinearResample (resample a signal at linearly spaced intervals to
        up/downsample a signal). LinearResample (LR) means that the output
        signal is at linearly spaced intervals (i.e the output signal has a
        frequency of `new_freq`). It uses sinc/bandlimited interpolation to
        upsample/downsample the signal.

        (almost directly from torchaudio.compliance.kaldi)

        https://ccrma.stanford.edu/~jos/resample/
        Theory_Ideal_Bandlimited_Interpolation.html

        https://github.com/kaldi-asr/kaldi/blob/master/src/feat/resample.h#L56

        Arguments
        ---------
        waveforms : tensor
            The batch of audio signals to resample, laid out as
            `[batch, channels, time]`.

        Returns
        -------
        The waveforms at the new frequency.
        """

        # Compute output size and initialize
        batch_size, num_channels, wave_len = waveforms.shape
        window_size = self.weights.shape[1]
        tot_output_samp = self._output_samples(wave_len)
        resampled_waveform = paddle.zeros(
            (batch_size, num_channels, tot_output_samp))

        # eye size: (num_channels, num_channels, 1)
        eye = paddle.eye(num_channels).unsqueeze(2)

        # Iterate over the phases in the polyphase filter
        for i in range(self.first_indices.shape[0]):
            wave_to_conv = waveforms
            first_index = int(self.first_indices[i].item())
            if first_index >= 0:
                # trim the signal as the filter will not be applied
                # before the first_index
                wave_to_conv = wave_to_conv[..., first_index:]

            # pad the right of the signal to allow partial convolutions
            # meaning compute values for partial windows (e.g. end of the
            # window is outside the signal length)
            max_index = (tot_output_samp - 1) // self.output_samples
            end_index = max_index * self.conv_stride + window_size
            current_wave_len = wave_len - first_index
            right_padding = max(0, end_index + 1 - current_wave_len)
            # A negative first_index is realised as left padding instead.
            left_padding = max(0, -first_index)
            wave_to_conv = paddle.nn.functional.pad(
                wave_to_conv, (left_padding, right_padding), data_format='NCL')
            # NOTE(review): `.repeat(...)` is not a stock paddle call shape;
            # presumably provided by a project-level Tensor patch - confirm.
            conv_wave = paddle.nn.functional.conv1d(
                x=wave_to_conv,
                weight=self.weights[i].repeat(num_channels, 1, 1),
                stride=self.conv_stride,
                groups=num_channels, )

            # we want conv_wave[:, i] to be at
            # output[:, i + n*conv_transpose_stride]
            dilated_conv_wave = paddle.nn.functional.conv1d_transpose(
                conv_wave, eye, stride=self.conv_transpose_stride)

            # pad dilated_conv_wave so it reaches the output length if needed.
            left_padding = i
            previous_padding = left_padding + dilated_conv_wave.shape[-1]
            right_padding = max(0, tot_output_samp - previous_padding)
            dilated_conv_wave = paddle.nn.functional.pad(
                dilated_conv_wave, (left_padding, right_padding),
                data_format='NCL')
            # Drop anything beyond the expected output length.
            dilated_conv_wave = dilated_conv_wave[..., :tot_output_samp]

            # Each phase fills its own interleaved output positions.
            resampled_waveform += dilated_conv_wave

        return resampled_waveform

    def _output_samples(self, input_num_samp):
        """Based on LinearResample::GetNumOutputSamples.

        LinearResample (LR) means that the output signal is at
        linearly spaced intervals (i.e the output signal has a
        frequency of ``new_freq``). It uses sinc/bandlimited
        interpolation to upsample/downsample the signal.

        (almost directly from torchaudio.compliance.kaldi)

        Arguments
        ---------
        input_num_samp : int
            The number of samples in each example in the batch.

        Returns
        -------
        Number of samples in the output waveform (0 for empty input).
        """

        # For exact computation, we measure time in "ticks" of 1.0 / tick_freq,
        # where tick_freq is the least common multiple of samp_in and
        # samp_out.
        samp_in = int(self.orig_freq)
        samp_out = int(self.new_freq)

        tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out)
        ticks_per_input_period = tick_freq // samp_in

        # work out the number of ticks in the time interval
        # [ 0, input_num_samp/samp_in ).
        interval_length = input_num_samp * ticks_per_input_period
        if interval_length <= 0:
            return 0
        ticks_per_output_period = tick_freq // samp_out

        # Get the last output-sample in the closed interval,
        # i.e. replacing [ ) with [ ]. Note: integer division rounds down.
        # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an
        # explanation of the notation.
        last_output_samp = interval_length // ticks_per_output_period

        # We need the last output-sample in the open interval, so if it
        # takes us to the end of the interval exactly, subtract one.
        if last_output_samp * ticks_per_output_period == interval_length:
            last_output_samp -= 1

        # First output-sample index is zero, so the number of output samples
        # is the last output-sample plus one.
        num_output_samp = last_output_samp + 1

        return num_output_samp

    def _indices_and_weights(self, waveforms):
        """Based on LinearResample::SetIndexesAndWeights

        Computes the weights for resampling as well as the indices in which
        they are valid. LinearResample (LR) means that the output signal is at
        linearly spaced intervals (i.e the output signal has a frequency
        of ``new_freq``). It uses sinc/bandlimited interpolation to
        upsample/downsample the signal.

        Nothing is returned; the results are stored on the instance:

        - ``self.first_indices``: the place where each filter should start
          being applied
        - ``self.weights``: the filters to be applied to the signal for
          resampling, of size (output_samples, max_weight_width)

        Arguments
        ---------
        waveforms : tensor
            Not read by this method.
        """

        # Lowpass filter frequency depends on smaller of two frequencies
        min_freq = min(self.orig_freq, self.new_freq)
        lowpass_cutoff = 0.99 * 0.5 * min_freq

        assert lowpass_cutoff * 2 <= min_freq
        window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff)

        assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2
        # Time of each output sample within one unit.
        output_t = paddle.arange(start=0.0, end=self.output_samples)
        output_t /= self.new_freq
        min_t = output_t - window_width
        max_t = output_t + window_width

        # Input-sample index range covered by each output sample's window.
        min_input_index = paddle.ceil(min_t * self.orig_freq)
        max_input_index = paddle.floor(max_t * self.orig_freq)
        num_indices = max_input_index - min_input_index + 1

        max_weight_width = num_indices.max()
        j = paddle.arange(max_weight_width)
        input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0)
        # Time offset between each covered input sample and its output sample.
        delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1)

        weights = paddle.zeros_like(delta_t)

        inside_window_indices = delta_t.abs() < (window_width)
        # raised-cosine (Hanning) window with width `window_width`
        weights[inside_window_indices] = 0.5 * (1 + paddle.cos(
            2 * math.pi * lowpass_cutoff / self.lowpass_filter_width *
            delta_t[inside_window_indices]))
        t_eq_zero_indices = delta_t == 0.0
        t_not_eq_zero_indices = ~t_eq_zero_indices

        # sinc filter function
        weights[t_not_eq_zero_indices] *= paddle.sin(
            2 * math.pi * lowpass_cutoff * delta_t[t_not_eq_zero_indices]) / (
                math.pi * delta_t[t_not_eq_zero_indices])

        # limit of the function at t = 0
        weights[t_eq_zero_indices] *= 2 * lowpass_cutoff

        # size (output_samples, max_weight_width)
        weights /= self.orig_freq

        self.first_indices = min_input_index
        self.weights = weights
|
||||||
|
|
||||||
|
|
||||||
|
class DropFreq(nn.Layer):
    """This class drops a random frequency from the signal.

    The purpose of this class is to teach models to learn to rely on all parts
    of the signal, not just a few frequency bands.

    Arguments
    ---------
    drop_freq_low : float
        The low end of frequencies that can be dropped,
        as a fraction of the sampling rate / 2.
    drop_freq_high : float
        The high end of frequencies that can be
        dropped, as a fraction of the sampling rate / 2.
    drop_count_low : int
        The low end of number of frequencies that could be dropped.
    drop_count_high : int
        The high end of number of frequencies that could be dropped.
    drop_width : float
        The width of the frequency band to drop, as
        a fraction of the sampling_rate / 2.
    drop_prob : float
        The probability that the batch of signals will have a frequency
        dropped. By default, every batch has frequencies dropped.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> dropper = DropFreq()
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> dropped_signal = dropper(signal.unsqueeze(0))
    """

    def __init__(
            self,
            drop_freq_low=1e-14,
            drop_freq_high=1,
            drop_count_low=1,
            drop_count_high=2,
            drop_width=0.05,
            drop_prob=1, ):
        super().__init__()
        self.drop_freq_low = drop_freq_low
        self.drop_freq_high = drop_freq_high
        self.drop_count_low = drop_count_low
        self.drop_count_high = drop_count_high
        self.drop_width = drop_width
        self.drop_prob = drop_prob

    def forward(self, waveforms):
        """Filter a random set of frequency bands out of the batch.

        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`,
        matching the shape of the input.
        """

        # Don't drop (return early) 1-`drop_prob` portion of the batches
        dropped_waveform = waveforms.clone()
        if paddle.rand([1]) > self.drop_prob:
            return dropped_waveform

        # Add channels dimension, remembering whether we did so that only
        # an added dimension is removed again at the end.
        added_channel_dim = len(waveforms.shape) == 2
        if added_channel_dim:
            dropped_waveform = dropped_waveform.unsqueeze(-1)

        # Pick number of frequencies to drop (upper bound is inclusive)
        drop_count = paddle.randint(
            low=self.drop_count_low,
            high=self.drop_count_high + 1,
            shape=(1, ), )

        # Pick the frequencies to drop, uniformly within the allowed range
        drop_range = self.drop_freq_high - self.drop_freq_low
        drop_frequency = (
            paddle.rand(drop_count) * drop_range + self.drop_freq_low)

        # Filter parameters
        filter_length = 101
        pad = filter_length // 2

        # Start with delta function
        drop_filter = paddle.zeros([1, filter_length, 1])
        drop_filter[0, pad, 0] = 1

        # Fold one notch per chosen frequency into the combined filter
        for frequency in drop_frequency:
            notch_kernel = notch_filter(
                frequency,
                filter_length,
                self.drop_width, )
            drop_filter = convolve1d(drop_filter, notch_kernel, pad)

        # Apply filter
        dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad)

        # Remove channels dimension if added. An unconditional squeeze
        # would also collapse a caller-supplied `[batch, time, 1]` input.
        if added_channel_dim:
            dropped_waveform = dropped_waveform.squeeze(-1)
        return dropped_waveform
|
||||||
|
|
||||||
|
|
||||||
|
class DropChunk(nn.Layer):
    """This class drops portions of the input signal.

    Using `DropChunk` as an augmentation strategy helps a model learn to rely
    on all parts of the signal, since it can't expect a given part to be
    present.

    Arguments
    ---------
    drop_length_low : int
        The low end of lengths for which to set the
        signal to zero, in samples.
    drop_length_high : int
        The high end of lengths for which to set the
        signal to zero, in samples.
    drop_count_low : int
        The low end of number of times that the signal
        can be dropped to zero.
    drop_count_high : int
        The high end of number of times that the signal
        can be dropped to zero.
    drop_start : int
        The first index for which dropping will be allowed.
        A negative value is counted from the end of each utterance.
    drop_end : int
        The last index for which dropping will be allowed.
        ``None`` means the end of each utterance; a negative value is
        counted from the end of each utterance.
    drop_prob : float
        The probability that the batch of signals will
        have a portion dropped. By default, every batch
        has portions dropped.
    noise_factor : float
        The factor relative to average amplitude of an utterance
        to use for scaling the white noise inserted. 1 keeps
        the average amplitude the same, while 0 inserts all 0's.

    Example
    -------
    >>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.)
    >>> signal = paddle.rand([1, 16000])  # [batch, time]
    >>> length = paddle.ones([1])
    >>> dropped_signal = dropper(signal, length)
    >>> float(dropped_signal[:, 150])
    0.0
    """

    def __init__(
            self,
            drop_length_low=100,
            drop_length_high=1000,
            drop_count_low=1,
            drop_count_high=10,
            drop_start=0,
            drop_end=None,
            drop_prob=1,
            noise_factor=0.0, ):
        super().__init__()
        self.drop_length_low = drop_length_low
        self.drop_length_high = drop_length_high
        self.drop_count_low = drop_count_low
        self.drop_count_high = drop_count_high
        self.drop_start = drop_start
        self.drop_end = drop_end
        self.drop_prob = drop_prob
        self.noise_factor = noise_factor

        # Validate low <= high
        if drop_length_low > drop_length_high:
            raise ValueError("Low limit must not be more than high limit")
        if drop_count_low > drop_count_high:
            raise ValueError("Low limit must not be more than high limit")

        # Make sure the length doesn't exceed end - start
        if drop_end is not None and drop_end >= 0:
            if drop_start > drop_end:
                raise ValueError("Low limit must not be more than high limit")

            drop_range = drop_end - drop_start
            self.drop_length_low = min(drop_length_low, drop_range)
            self.drop_length_high = min(drop_length_high, drop_range)

    def forward(self, waveforms, lengths):
        """Return a copy of `waveforms` with random chunks zeroed or noised.

        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.
            NOTE(review): noise insertion (`noise_factor != 0`) builds a 1-D
            noise vector per chunk, so it presumably expects `[batch, time]`
            input -- confirm before using it with multi-channel audio.
        lengths : tensor
            Shape should be a single dimension, `[batch]`. Values are
            relative lengths; they are multiplied by the time dimension
            to obtain sample counts.

        Returns
        -------
        Tensor of shape `[batch, time]` or
        `[batch, time, channels]`
        """
        n_time = waveforms.shape[1]

        # Relative lengths -> absolute sample counts. `astype` is used rather
        # than `.long()` so this does not depend on a patched Tensor method.
        lengths = (lengths * n_time).astype(paddle.int64)
        batch_size = waveforms.shape[0]
        dropped_waveform = waveforms.clone()

        # Don't drop (return early) 1-`drop_prob` portion of the batches
        if paddle.rand([1]) > self.drop_prob:
            return dropped_waveform

        # The original amplitude is only needed to scale inserted noise.
        clean_amplitude = None
        if self.noise_factor:
            clean_amplitude = compute_amplitude(waveforms,
                                                lengths.unsqueeze(1))

        # Pick a number of times to drop, per utterance
        drop_times = paddle.randint(
            low=self.drop_count_low,
            high=self.drop_count_high + 1,
            shape=(batch_size, ), ).tolist()
        utt_lens = lengths.tolist()

        # Iterate batch to set mask
        for i, n_drops in enumerate(drop_times):
            if n_drops == 0:
                continue
            utt_len = utt_lens[i]

            # Pick lengths
            chunk_lens = paddle.randint(
                low=self.drop_length_low,
                high=self.drop_length_high + 1,
                shape=(n_drops, ), ).tolist()

            # Compute range of starting locations
            start_min = self.drop_start
            if start_min < 0:
                start_min += utt_len
            # A negative start would wrap around when used as a slice bound.
            start_min = max(0, start_min)

            start_max = utt_len if self.drop_end is None else self.drop_end
            if start_max < 0:
                start_max += utt_len
            # Keep the range non-empty: randint requires low < high.
            start_max = max(start_min, start_max - max(chunk_lens))

            # Pick starting locations
            starts = paddle.randint(
                low=start_min,
                high=start_max + 1,
                shape=(n_drops, ), ).tolist()

            if self.noise_factor:
                # Uniform distribution of -2 to +2 * avg amplitude should
                # preserve the average for normalization
                noise_max = 2 * clean_amplitude[i] * self.noise_factor

            # Update waveform
            for begin, chunk_len in zip(starts, chunk_lens):
                # Truncate chunks that would run past the time axis.
                end = min(begin + chunk_len, n_time)
                if end <= begin:
                    continue
                if not self.noise_factor:
                    dropped_waveform[i, begin:end] = 0.0
                else:
                    # zero-center the noise distribution
                    noise_vec = paddle.rand([end - begin])
                    noise_vec = 2 * noise_max * noise_vec - noise_max
                    dropped_waveform[i, begin:end] = noise_vec

        return dropped_waveform
|
||||||
|
|
||||||
|
|
||||||
|
class TimeDomainSpecAugment(nn.Layer):
    """A time-domain approximation of the SpecAugment algorithm.

    This augmentation module implements three augmentations in
    the time-domain.

    1. Drop chunks of the audio (zero amplitude or white noise)
    2. Drop frequency bands (with band-drop filters)
    3. Speed perturbation (via resampling to slightly different rate)

    Arguments
    ---------
    perturb_prob : float from 0 to 1
        The probability that a batch will have speed perturbation applied.
    drop_freq_prob : float from 0 to 1
        The probability that a batch will have frequencies dropped.
    drop_chunk_prob : float from 0 to 1
        The probability that a batch will have chunks dropped.
    speeds : list of ints
        A set of different speeds to use to perturb each batch.
        Defaults to ``[95, 100, 105]`` when ``None``. See ``SpeedPerturb``.
    sample_rate : int
        Sampling rate of the input waveforms.
    drop_freq_count_low : int
        Lowest number of frequencies that could be dropped.
    drop_freq_count_high : int
        Highest number of frequencies that could be dropped.
    drop_chunk_count_low : int
        Lowest number of chunks that could be dropped.
    drop_chunk_count_high : int
        Highest number of chunks that could be dropped.
    drop_chunk_length_low : int
        Lowest length of chunks that could be dropped.
    drop_chunk_length_high : int
        Highest length of chunks that could be dropped.
    drop_chunk_noise_factor : float
        The noise factor used to scale the white noise inserted, relative to
        the average amplitude of the utterance. Default 0 (no noise inserted).

    Example
    -------
    >>> inputs = paddle.randn([10, 16000])
    >>> feature_maker = TimeDomainSpecAugment(speeds=[80])
    >>> feats = feature_maker(inputs, paddle.ones([10]))
    >>> feats.shape
    [10, 12800]
    """

    def __init__(
            self,
            perturb_prob=1.0,
            drop_freq_prob=1.0,
            drop_chunk_prob=1.0,
            speeds=None,
            sample_rate=16000,
            drop_freq_count_low=0,
            drop_freq_count_high=3,
            drop_chunk_count_low=0,
            drop_chunk_count_high=5,
            drop_chunk_length_low=1000,
            drop_chunk_length_high=2000,
            drop_chunk_noise_factor=0, ):
        super().__init__()
        # Build the default per instance so no list is shared between
        # instances through the signature.
        if speeds is None:
            speeds = [95, 100, 105]
        self.speed_perturb = SpeedPerturb(
            perturb_prob=perturb_prob, orig_freq=sample_rate, speeds=speeds)
        self.drop_freq = DropFreq(
            drop_prob=drop_freq_prob,
            drop_count_low=drop_freq_count_low,
            drop_count_high=drop_freq_count_high, )
        self.drop_chunk = DropChunk(
            drop_prob=drop_chunk_prob,
            drop_count_low=drop_chunk_count_low,
            drop_count_high=drop_chunk_count_high,
            drop_length_low=drop_chunk_length_low,
            drop_length_high=drop_chunk_length_high,
            noise_factor=drop_chunk_noise_factor, )

    def forward(self, waveforms, lengths):
        """Returns the distorted waveforms.

        Speed perturbation, frequency dropping and chunk dropping are applied
        in that order, with gradient tracking disabled.

        Arguments
        ---------
        waveforms : tensor
            The waveforms to distort
        lengths : tensor
            Per-utterance lengths, forwarded to the chunk-dropping step only.

        Returns
        -------
        tensor
            The augmented waveforms.
        """
        # Augmentation
        with paddle.no_grad():
            waveforms = self.speed_perturb(waveforms)
            waveforms = self.drop_freq(waveforms)
            waveforms = self.drop_chunk(waveforms, lengths)
        return waveforms
|
@ -0,0 +1,228 @@
|
|||||||
|
from collections import defaultdict
|
||||||
|
from typing import Dict
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddle.nn as nn
|
||||||
|
import paddle.nn.functional as F
|
||||||
|
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.modules.modeling_wav2vec2 import Wav2Vec2ConfigPure
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.modules.modeling_wav2vec2 import Wav2Vec2Model
|
||||||
|
from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN
|
||||||
|
from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC
|
||||||
|
from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
|
||||||
|
from paddlespeech.s2t.utils.utility import log_add
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2vec2ASR(nn.Layer):
    """CTC speech recognizer on top of a pre-trained wav2vec 2.0 encoder.

    The waveform is passed through ``Wav2Vec2Model``, then a ``VanillaNN``
    block, and finally a CTC head used for both the training loss and
    decoding.
    """

    def __init__(self, config: dict):
        """Build the model and load the pre-trained wav2vec2 weights.

        Args:
            config: configuration object. It is read through attribute
                access (``config.wav2vec2_params_path``, ``config.dnn_blocks``,
                ...), so a plain ``dict`` is presumably not sufficient --
                TODO confirm the expected config type.
        """
        super().__init__()

        wav2vec2_config = Wav2Vec2ConfigPure(config)
        wav2vec2 = Wav2Vec2Model(wav2vec2_config)
        model_dict = paddle.load(config.wav2vec2_params_path)
        wav2vec2.set_state_dict(model_dict)
        self.normalize_wav = config.normalize_wav
        self.output_norm = config.output_norm
        if config.freeze_wav2vec2:
            # Frozen encoder: eval mode and no trainable parameters.
            wav2vec2.eval()
            for parm in wav2vec2.parameters():
                parm.trainable = False
        self.wav2vec2 = wav2vec2
        self.enc = VanillaNN(
            input_shape=[None, None, wav2vec2_config.hidden_size],
            activation=nn.LeakyReLU,
            dnn_blocks=config.dnn_blocks,
            dnn_neurons=config.dnn_neurons)
        self.ctc = CTC(odim=config.output_dim,
                       enc_n_units=config.dnn_neurons,
                       blank_id=config.blank_id,
                       dropout_rate=config.ctc_dropout_rate,
                       reduction='mean')

    def _encode(self, wav):
        """Run the shared waveform -> wav2vec2 -> DNN pipeline.

        Args:
            wav (paddle.Tensor): waveform batch; layer-normalised over all
                non-batch dimensions when ``self.normalize_wav`` is set.

        Returns:
            paddle.Tensor: output of ``self.enc``, the input to the CTC head.
        """
        if self.normalize_wav:
            wav = F.layer_norm(wav, wav.shape[1:])
        # Extract wav2vec output
        out = self.wav2vec2(wav)[0]
        # We normalize the output if required
        if self.output_norm:
            out = F.layer_norm(out, out.shape[1:])
        return self.enc(out)

    def forward(self, wav, wavs_lens_rate, target, target_lens_rate):
        """Compute the CTC loss for a batch.

        Args:
            wav (paddle.Tensor): waveform batch.
            wavs_lens_rate (paddle.Tensor): relative input lengths; scaled by
                the encoder output length to obtain frame counts.
            target (paddle.Tensor): padded target token ids.
            target_lens_rate (paddle.Tensor): relative target lengths; scaled
                by ``target.shape[1]`` to obtain token counts.

        Returns:
            The loss returned by the CTC module.
        """
        x = self._encode(wav)
        x_lens = (wavs_lens_rate * x.shape[1]).round().astype(paddle.int64)
        target_lens = (target_lens_rate *
                       target.shape[1]).round().astype(paddle.int64)

        ctc_loss = self.ctc(x, x_lens, target, target_lens)
        return ctc_loss

    @paddle.no_grad()
    def decode(self,
               feats: paddle.Tensor,
               text_feature,
               decoding_method: str,
               beam_size: int):
        """Decode a batch of waveforms into text and token ids.

        Args:
            feats (paddle.Tensor): input batch; channel 0 of the last axis
                is used as the waveform.
            text_feature: object exposing ``defeaturize(token_ids)``, used to
                turn token ids into text.
            decoding_method (str): ``'ctc_greedy_search'`` or
                ``'ctc_prefix_beam_search'``.
            beam_size (int): beam size for prefix beam search.

        Returns:
            Tuple ``(res, res_tokenids)``: decoded texts and the matching
            token-id sequences, one entry per utterance.

        Raises:
            ValueError: if the decoding method is unsupported, or if
                ``'ctc_prefix_beam_search'`` is requested with a batch size
                other than 1.
        """
        batch_size = feats.shape[0]

        if decoding_method == 'ctc_prefix_beam_search' and batch_size > 1:
            # Raised (instead of logging and exiting) because neither a
            # logger nor `sys` is available in this module.
            raise ValueError(
                f'decoding mode {decoding_method} must be running with '
                f'batch_size == 1, current batch_size is {batch_size}')

        if decoding_method == 'ctc_greedy_search':
            hyps = self.ctc_greedy_search(feats)
            res = [text_feature.defeaturize(hyp) for hyp in hyps]
            res_tokenids = list(hyps)
        # ctc_prefix_beam_search only returns one result in List[int];
        # wrap it in a list to stay compatible with batch decoding modes.
        elif decoding_method == 'ctc_prefix_beam_search':
            hyp = self.ctc_prefix_beam_search(feats, beam_size)
            res = [text_feature.defeaturize(hyp)]
            res_tokenids = [hyp]
        else:
            raise ValueError(
                f"wav2vec2 not support decoding method: {decoding_method}")

        return res, res_tokenids

    @classmethod
    def from_config(cls, config):
        """Alternate constructor: build the model from `config`."""
        model = cls(config)
        return model

    def ctc_greedy_search(self, wav) -> List[List[int]]:
        """ Apply CTC greedy search

        Args:
            wav (paddle.Tensor): (batch, max_len, channels); only channel 0
                is used.
        Returns:
            List[List[int]]: best path result
        """
        batch_size = wav.shape[0]
        x = self._encode(wav[:, :, 0])
        x_lens = x.shape[1]
        ctc_probs = self.ctc.log_softmax(x)  # (B, maxlen, vocab_size)
        _, topk_index = ctc_probs.topk(1, axis=2)  # (B, maxlen, 1)
        # `reshape` is the native Paddle call; `.view(a, b)` relies on a
        # patched Tensor method.
        topk_index = topk_index.reshape([batch_size, x_lens])  # (B, maxlen)

        hyps = [hyp.tolist() for hyp in topk_index]
        hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
        return hyps

    def _ctc_prefix_beam_search(
            self,
            wav,
            beam_size,
            blank_id: int=0, ) -> List[Tuple[Tuple[int, ...], float]]:
        """ CTC prefix beam search inner implementation

        Args:
            wav (paddle.Tensor): (1, max_len, channels); only channel 0 is
                used.
            beam_size (int): beam size for beam search
            blank_id (int): index of the CTC blank token
        Returns:
            List[Tuple[Tuple[int, ...], float]]: nbest results as
                (token prefix, log likelihood), best first
        """
        x = self._encode(wav[:, :, 0])
        maxlen = x.shape[1]
        ctc_probs = self.ctc.log_softmax(x)  # (1, maxlen, vocab_size)
        ctc_probs = ctc_probs.squeeze(0)
        # topk cannot return more candidates than there are tokens.
        n_candidates = min(beam_size, ctc_probs.shape[-1])

        # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
        # blank_ending_score and none_blank_ending_score in ln domain
        cur_hyps = [(tuple(), (0.0, -float('inf')))]
        # 2. CTC beam search step by step
        for t in range(0, maxlen):
            logp = ctc_probs[t]  # (vocab_size,)
            # key: prefix, value (pb, pnb), default value(-inf, -inf)
            next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
            # 2.1 First beam prune: select topk best
            top_k_logp, top_k_index = logp.topk(n_candidates)
            for s, ps in zip(top_k_index.tolist(), top_k_logp.tolist()):
                for prefix, (pb, pnb) in cur_hyps:
                    last = prefix[-1] if len(prefix) > 0 else None
                    if s == blank_id:  # blank
                        n_pb, n_pnb = next_hyps[prefix]
                        n_pb = log_add([n_pb, pb + ps, pnb + ps])
                        next_hyps[prefix] = (n_pb, n_pnb)
                    elif s == last:
                        # Update *ss -> *s;
                        n_pb, n_pnb = next_hyps[prefix]
                        n_pnb = log_add([n_pnb, pnb + ps])
                        next_hyps[prefix] = (n_pb, n_pnb)
                        # Update *s-s -> *ss, - is for blank
                        n_prefix = prefix + (s, )
                        n_pb, n_pnb = next_hyps[n_prefix]
                        n_pnb = log_add([n_pnb, pb + ps])
                        next_hyps[n_prefix] = (n_pb, n_pnb)
                    else:
                        n_prefix = prefix + (s, )
                        n_pb, n_pnb = next_hyps[n_prefix]
                        n_pnb = log_add([n_pnb, pb + ps, pnb + ps])
                        next_hyps[n_prefix] = (n_pb, n_pnb)

            # 2.2 Second beam prune
            next_hyps = sorted(
                next_hyps.items(),
                key=lambda x: log_add(list(x[1])),
                reverse=True)
            cur_hyps = next_hyps[:beam_size]

        hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps]
        return hyps

    def ctc_prefix_beam_search(self, wav, beam_size) -> List[int]:
        """ Apply CTC prefix beam search

        Args:
            wav (paddle.Tensor): (1, max_len, channels); only channel 0 is
                used.
            beam_size (int): beam size for beam search
        Returns:
            List[int]: token ids of the best CTC prefix beam search result
        """
        hyps = self._ctc_prefix_beam_search(wav, beam_size)
        # Prefixes are stored as tuples; return a list as annotated.
        return list(hyps[0][0])
|
Loading…
Reference in new issue