Merge branch 'develop' into server_asr

WilliamZhang06 3 years ago committed by GitHub
commit b8f16ac9b0

@ -0,0 +1,89 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The string to the left of "=", i.e. "JOB", is replaced by <N> (the Nth job) in the command and in the log file name,
# e.g. "echo JOB" becomes "echo 3" for the 3rd job and "echo 8" for the 8th job.
# Note that the range must start with a positive number, so you can't use "JOB=0:10", for example.
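# For illustration (a sketch of the expansion):
#   run.pl JOB=1:3 exp/log/echo.JOB.log echo JOB
# runs three jobs in parallel and writes exp/log/echo.1.log, exp/log/echo.2.log
# and exp/log/echo.3.log, each log containing its own job index.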
#
# run.pl, queue.pl, slurm.pl, and ssh.pl share a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, as configured in
# "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'
# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then
# Used for all the other jobs (e.g. data preparation and feature extraction)
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"
# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" to match a "queue" in your environment.
# To list the "queue" names, type "qhost -q".
# Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.
export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" to match the "partition" names in your environment.
# To list the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"
elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the hosts on which to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# It is assumed that you can log in to them without a password, i.e. you have to set up SSH keys.
export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"
# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then
export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"
else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi

@ -0,0 +1,2 @@
--sample-frequency=16000
--num-mel-bins=80
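# Note (an illustrative pairing, not stated in this file): 80 log-mel bins at
# 16 kHz plus the 3 pitch features added by make_fbank_pitch.sh give the 83-dim
# input matching "feat_dim: 83" in the transformer_*.yaml configs.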

@ -0,0 +1 @@
--sample-frequency=16000

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.de.train
dev_manifest: data/manifest.de.dev
test_manifest: data/manifest.de.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-de.de_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-de.de_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
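# Note (a sketch of the assumed Noam schedule, for orientation only):
#   effective_lr(step) = lr * output_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# so with lr: 2.5 and warmup_steps: 25000 the rate warms up linearly for 25k
# steps and then decays proportionally to step^-0.5.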

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.es.train
dev_manifest: data/manifest.es.dev
test_manifest: data/manifest.es.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-es.es_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-es.es_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.fr.train
dev_manifest: data/manifest.fr.dev
test_manifest: data/manifest.fr.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-fr.fr_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-fr.fr_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.it.train
dev_manifest: data/manifest.it.dev
test_manifest: data/manifest.it.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-it.it_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-it.it_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.nl.train
dev_manifest: data/manifest.nl.dev
test_manifest: data/manifest.nl.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-nl.nl_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-nl.nl_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.pt.train
dev_manifest: data/manifest.pt.dev
test_manifest: data/manifest.pt.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-pt.pt_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-pt.pt_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.ro.train
dev_manifest: data/manifest.ro.dev
test_manifest: data/manifest.ro.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-ro.ro_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-ro.ro_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.ru.train
dev_manifest: data/manifest.ru.dev
test_manifest: data/manifest.ru.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-ru.ru_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-ru.ru_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,19 @@
[
    {
        "type": "specaug",
        "params": {
            "W": 5,
            "warp_mode": "PIL",
            "F": 30,
            "n_freq_masks": 2,
            "T": 40,
            "n_time_masks": 2,
            "p": 1.0,
            "adaptive_number_ratio": 0,
            "adaptive_size_ratio": 0,
            "max_n_time_masks": 20,
            "replace_with_zero": false
        },
        "prob": 1.0
    }
]

@ -0,0 +1,201 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# 2021 PaddlePaddle
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
set -e
set -u
stage=-1
stop_stage=10
tgt_lang=
nbpe=8000
# bpemode (unigram or bpe)
bpemode=bpe
must_c=
dumpdir=data/dump
do_delta=false
tgt_case=tc
src_case=lc.rm
source ${MAIN_ROOT}/utils/parse_options.sh
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data
train_set=train_sp.en-${tgt_lang}.${tgt_lang}
train_dev=dev.en-${tgt_lang}.${tgt_lang}
trans_set=""
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
trans_set="${trans_set} tst-COMMON.en-${lang}.${lang}"
done
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
if [ ! -e ${must_c} ]; then
echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
echo "Link of Must-c v1, https://ict.fbk.eu/must-c/."
exit 1
fi
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data Preparation"
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
local/data_prep.sh ${must_c} ${lang}
done
fi
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### Task dependent. You have to design training and dev sets by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 1: Feature Generation"
fbankdir=fbank
# Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
for x in train.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
data/${x} data/make_fbank/${x} ${fbankdir}
done
done
# speed-perturbed
utils/perturb_data_dir_speed.sh 0.9 data/train.en-${tgt_lang} data/temp1.${tgt_lang}
utils/perturb_data_dir_speed.sh 1.0 data/train.en-${tgt_lang} data/temp2.${tgt_lang}
utils/perturb_data_dir_speed.sh 1.1 data/train.en-${tgt_lang} data/temp3.${tgt_lang}
utils/combine_data.sh --extra-files utt2uniq data/train_sp.en-${tgt_lang} \
data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
rm -r data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
utils/fix_data_dir.sh data/train_sp.en-${tgt_lang}
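# The three perturbed copies are distinguished by a prefix on the utterance id:
# "ted_00001_..." becomes "sp0.9-ted_00001_...", "sp1.0-ted_00001_..." and
# "sp1.1-ted_00001_..." (an illustrative sketch of the naming that the utt_map
# loop below relies on).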
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
data/train_sp.en-${tgt_lang} data/make_fbank/train_sp.en-${tgt_lang} ${fbankdir}
for lang in en ${tgt_lang}; do
awk -v p="sp0.9-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >data/train_sp.en-${tgt_lang}/text.tc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >data/train_sp.en-${tgt_lang}/text.lc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
awk -v p="sp1.0-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
awk -v p="sp1.1-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
done
# Divide into source and target languages
for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do
local/divide_lang.sh ${x} ${tgt_lang}
done
for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang}; do
# remove utt having more than 3000 frames
# remove utt having more than 400 characters
for lang in ${tgt_lang} en; do
remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${x}.${lang} data/${x}.${lang}.tmp
done
# Match the number of utterances between source and target languages
# extract common lines
cut -f 1 -d " " data/${x}.en.tmp/text > data/${x}.${tgt_lang}.tmp/reclist1
cut -f 1 -d " " data/${x}.${tgt_lang}.tmp/text > data/${x}.${tgt_lang}.tmp/reclist2
comm -12 data/${x}.${tgt_lang}.tmp/reclist1 data/${x}.${tgt_lang}.tmp/reclist2 > data/${x}.en.tmp/reclist
for lang in ${tgt_lang} en; do
reduce_data_dir.sh data/${x}.${lang}.tmp data/${x}.en.tmp/reclist data/${x}.${lang}
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${x}.${lang}
done
rm -rf data/${x}.*.tmp
done
# compute global CMVN
compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark
# dump features for training
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_set}/delta${do_delta}/storage \
${feat_tr_dir}/storage
fi
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_dev}/delta${do_delta}/storage \
${feat_dt_dir}/storage
fi
dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
data/${train_set}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_set} ${feat_tr_dir}
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
data/${train_dev}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_dev} ${feat_dt_dir}
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}; mkdir -p ${feat_trans_dir}
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
data/${ttask}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/trans/${ttask} \
${feat_trans_dir}
done
fi
dict=data/lang_1spm/${train_set}_${bpemode}${nbpe}_units_${tgt_case}.txt
nlsyms=data/lang_1spm/${train_set}_non_lang_syms_${tgt_case}.txt
bpemodel=data/lang_1spm/${train_set}_${bpemode}${nbpe}_${tgt_case}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_1spm/
export LC_ALL=C.UTF-8
echo "make a non-linguistic symbol list for all languages"
grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -o -P '&[^;]*;'| sort | uniq > ${nlsyms}
cat ${nlsyms}
echo "make a joint source and target dictionary"
echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
offset=$(wc -l < ${dict})
grep sp1.0 data/train_sp.en-${tgt_lang}.${tgt_lang}/text.${tgt_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' > data/lang_1spm/input_${tgt_lang}.txt
grep sp1.0 data/train_sp.en-${tgt_lang}.en/text.${src_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' >> data/lang_1spm/input_${tgt_lang}.txt
spm_train --user_defined_symbols="$(tr "\n" "," < ${nlsyms})" --input=data/lang_1spm/input_${tgt_lang}.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_1spm/input_${tgt_lang}.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
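# For illustration (a sketch of the resulting dict): after "<unk> 1", each BPE
# piece is appended with index NR+offset starting from 2, e.g.
#   <unk> 1
#   ▁aber 2   (hypothetical German piece)
#   ▁und 3    (hypothetical German piece)
# so that index 0 stays free for the CTC "blank".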
wc -l ${dict}
echo "make json files"
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
done
echo "update json (add source references)"
# update json (add source references)
for x in ${train_set} ${train_dev}; do
feat_dir=${dumpdir}/${x}/delta${do_delta}
data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-${tgt_lang}.en
update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json ${data_dir} ${dict}
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
x=(${train_set} ${train_dev} ${trans_set})
y=(train dev test)
for (( i=0; i<${#x[*]}; ++i)); do
echo ${x[$i]} ${y[$i]}
feat_dir=${dumpdir}/${x[$i]}/delta${do_delta}
data_dir=data/$(echo ${x[$i]} | cut -f 1 -d ".").en-${tgt_lang}.en
python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
--json-file ${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--manifest-file data/manifest.${tgt_lang}.${y[$i]}
echo "Process done for ${y[$i]} set from ${x[$i]}"
done
fi
echo "MuST-C ${tgt_lang} Data preparation done."
exit 0

@ -0,0 +1,163 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
export LC_ALL=C
source ${MAIN_ROOT}/utils/parse_options.sh
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir>"
echo "e.g.: $0 /n/rd11/corpora_8/MUSTC_v1.0 target_lang"
exit 1;
fi
tgt_lang=$2
for set in train dev tst-COMMON tst-HE; do
src=$1/en-${tgt_lang}/data/${set}
dst=data/local/en-${tgt_lang}/${set}
[ ! -d ${src} ] && echo "$0: no such directory ${src}" && exit 1;
wav_dir=${src}/wav
trans_dir=${src}/txt
yml=${trans_dir}/${set}.yaml
en=${trans_dir}/${set}.en
tgt=${trans_dir}/${set}.${tgt_lang}
mkdir -p ${dst} || exit 1;
[ ! -d ${wav_dir} ] && echo "$0: no such directory ${wav_dir}" && exit 1;
[ ! -d ${trans_dir} ] && echo "$0: no such directory ${trans_dir}" && exit 1;
[ ! -f ${yml} ] && echo "$0: expected file ${yml} to exist" && exit 1;
[ ! -f ${en} ] && echo "$0: expected file ${en} to exist" && exit 1;
[ ! -f ${tgt} ] && echo "$0: expected file ${tgt} to exist" && exit 1;
wav_scp=${dst}/wav.scp; [[ -f "${wav_scp}" ]] && rm ${wav_scp}
trans_en=${dst}/text.en; [[ -f "${trans_en}" ]] && rm ${trans_en}
trans_tgt=${dst}/text.${tgt_lang}; [[ -f "${trans_tgt}" ]] && rm ${trans_tgt}
utt2spk=${dst}/utt2spk; [[ -f "${utt2spk}" ]] && rm ${utt2spk}
spk2utt=${dst}/spk2utt; [[ -f "${spk2utt}" ]] && rm ${spk2utt}
segments=${dst}/segments; [[ -f "${segments}" ]] && rm ${segments}
# error check
n=$(cat ${yml} | grep duration | wc -l)
n_en=$(cat ${en} | wc -l)
n_tgt=$(cat ${tgt} | wc -l)
[ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} lines, found ${n_en}" && exit 1;
[ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} lines, found ${n_tgt}" && exit 1;
# (1a) Transcriptions and translations preparation
# make basic transcription file (add segments info)
cp ${yml} ${dst}/.yaml0
grep duration ${dst}/.yaml0 > ${dst}/.yaml1
awk '{
duration=$3; offset=$5; spkid=$7;
gsub(",","",duration);
gsub(",","",offset);
gsub(",","",spkid);
gsub("spk.","",spkid);
duration=sprintf("%.7f", duration);
if ( duration < 0.2 ) extendt=sprintf("%.7f", (0.2-duration)/2);
else extendt=0;
offset=sprintf("%.7f", offset);
startt=offset-extendt;
endt=offset+duration+extendt;
printf("ted_%05d_%07.0f_%07.0f\n", spkid, int(1000*startt+0.5), int(1000*endt+0.5));
}' ${dst}/.yaml1 > ${dst}/.yaml2
# NOTE: Extend the lengths of short utterances (< 0.2s) rather than exclude them
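# For illustration (assumed arithmetic): a 0.10 s segment gets
# extendt = (0.2 - 0.1) / 2 = 0.05 s added on each side, so every utterance is
# at least 0.2 s long; start/end are then encoded in milliseconds in the
# utterance id, e.g. spk 1, start 3.501 s, end 3.684 s -> ted_00001_0003501_0003684.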
cp ${en} ${dst}/en.org
cp ${tgt} ${dst}/${tgt_lang}.org
for lang in en ${tgt_lang}; do
# normalize punctuation
normalize-punctuation.perl -l ${lang} < ${dst}/${lang}.org > ${dst}/${lang}.norm
# lowercasing
lowercase.perl < ${dst}/${lang}.norm > ${dst}/${lang}.norm.lc
cp ${dst}/${lang}.norm ${dst}/${lang}.norm.tc
# remove punctuation
local/remove_punctuation.pl < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.rm
# tokenization
tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.tc > ${dst}/${lang}.norm.tc.tok
tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.tok
tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc.rm > ${dst}/${lang}.norm.lc.rm.tok
paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.tc.tok | sort > ${dst}/text.tc.${lang}
paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.tok | sort > ${dst}/text.lc.${lang}
paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.rm.tok | sort > ${dst}/text.lc.rm.${lang}
# save original and cleaned punctuation
lowercase.perl < ${dst}/${lang}.org | text2token.py -s 0 -n 1 | tr " " "\n" \
| sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.${lang}
lowercase.perl < ${dst}/${lang}.norm.tc | text2token.py -s 0 -n 1 | tr " " "\n" \
| sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.clean.${lang}
done
# error check
n=$(cat ${dst}/.yaml2 | wc -l)
n_en=$(cat ${dst}/en.norm.tc.tok | wc -l)
n_tgt=$(cat ${dst}/${tgt_lang}.norm.tc.tok | wc -l)
[ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} lines, found ${n_en}" && exit 1;
[ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} lines, found ${n_tgt}" && exit 1;
# (1c) Make segments files from transcript
# segments file format is: utt-id recording-id start-time end-time, e.g.:
# ted_00001_0003501_0003684 ted_00001 3.50 3.68
awk '{
segment=$1; split(segment,S,"[_]");
spkid=S[1] "_" S[2]; startf=S[3]; endf=S[4];
printf("%s %s %.2f %.2f\n", segment, spkid, startf/1000, endf/1000);
}' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/segments
awk '{
segment=$1; split(segment,S,"[_]");
spkid=S[1] "_" S[2];
printf("%s cat '${wav_dir}'/%s_%d.wav |\n", spkid, S[1], S[2]);
}' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/wav.scp
awk '{
segment=$1; split(segment,S,"[_]");
spkid=S[1] "_" S[2]; print $1 " " spkid
}' ${dst}/segments | uniq | sort > ${dst}/utt2spk
cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort > ${dst}/spk2utt
# error check
n_en=$(cat ${dst}/text.tc.en | wc -l)
n_tgt=$(cat ${dst}/text.tc.${tgt_lang} | wc -l)
[ ${n_en} -ne ${n_tgt} ] && echo "Warning: expected ${n_en} lines, found ${n_tgt}" && exit 1;
# Copy stuff into its final locations [this has been moved from the format_data script]
mkdir -p data/${set}.en-${tgt_lang}
# remove duplicated utterances (the same offset)
echo "remove duplicate lines..."
cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted' \
| sed 's/^[ \t]*//' > ${dst}/duplicate_lines
cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted' \
| cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
reduce_data_dir.sh ${dst} ${dst}/reclist data/${set}.en-${tgt_lang}
for l in en ${tgt_lang}; do
for case in tc lc lc.rm; do
cp ${dst}/text.${case}.${l} data/${set}.en-${tgt_lang}/text.${case}.${l}
done
done
utils/fix_data_dir.sh --utt_extra_files \
"text.tc.en text.lc.en text.lc.rm.en text.tc.${tgt_lang} text.lc.${tgt_lang} text.lc.rm.${tgt_lang}" \
data/${set}.en-${tgt_lang}
# error check
n_seg=$(cat data/${set}.en-${tgt_lang}/segments | wc -l)
n_text=$(cat data/${set}.en-${tgt_lang}/text.tc.${tgt_lang} | wc -l)
[ ${n_seg} -ne ${n_text} ] && echo "Warning: expected ${n_seg} lines, found ${n_text}" && exit 1;
echo "$0: successfully prepared data in ${dst}"
done

@ -0,0 +1,52 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
. ./path.sh
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <set> <lang>>"
echo "e.g.: $0 dev"
exit 1
fi
set=$1
lang=$2
export LC_ALL=en_US.UTF-8
# Copy stuff into its final locations [this has been moved from the format_data script]
# for En
mkdir -p data/${set}.en
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
if [ -f data/${set}/${f} ]; then
sort data/${set}/${f} > data/${set}.en/${f}
fi
done
sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text # dummy
sort data/${set}/text.tc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.tc
sort data/${set}/text.lc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc
sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc.rm
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.en
if [ -f data/${set}.en/feats.scp ]; then
utils/validate_data_dir.sh data/${set}.en || exit 1;
else
utils/validate_data_dir.sh --no-feats --no-wav data/${set}.en || exit 1;
fi
# for target language
mkdir -p data/${set}.${lang}
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
if [ -f data/${set}/${f} ]; then
sort data/${set}/${f} > data/${set}.${lang}/${f}
fi
done
sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text # dummy
sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.tc
sort data/${set}/text.lc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc
sort data/${set}/text.lc.rm.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc.rm
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.${lang}
if [ -f data/${set}.${lang}/feats.scp ]; then
utils/validate_data_dir.sh data/${set}.${lang} || exit 1;
else
utils/validate_data_dir.sh --no-feats --no-wav data/${set}.${lang} || exit 1;
fi

@ -0,0 +1,25 @@
#!/usr/bin/perl
use warnings;
use strict;
binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");
while(<STDIN>) {
$_ = " $_ ";
# remove punctuation except apostrophe
s/<space>/spacemark/g; # for scoring
s/'/apostrophe/g;
s/[[:punct:]]//g;
s/apostrophe/'/g;
s/spacemark/<space>/g; # for scoring
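# For illustration (assumed behavior): "don't stop, now!" becomes "don't stop now";
# the apostrophe and the <space> mark are shielded by placeholders before the
# [[:punct:]] removal and restored afterwards.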
# remove whitespace
s/\s+/ /g;
s/^\s+//;
s/\s+$//;
print "$_\n";
}

@ -0,0 +1,48 @@
#! /usr/bin/env bash
if [ $# != 4 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix lang"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
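# For illustration: CUDA_VISIBLE_DEVICES=0,1,2 yields ngpu=3; an empty value
# yields ngpu=0, i.e. CPU-only decoding (assumed awk field-count behavior).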
echo "using $ngpu gpus..."
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
tgt_lang=$4
for type in fullsentence; do
echo "decoding ${type}"
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
echo $PATH
python3 ${MAIN_ROOT}/utils/rsl2trn.py --rsl ${ckpt_prefix}.${type}.rsl \
--hyp ${ckpt_prefix}.${type}.hyp \
--ref ${ckpt_prefix}.${type}.ref
if ! which tokenizer.perl > /dev/null; then
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
exit 1
fi
detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.hyp > ${ckpt_prefix}.${type}.hyp.detok
detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.ref > ${ckpt_prefix}.${type}.ref.detok
echo "Detokenized BLEU:"
sacrebleu ${ckpt_prefix}.${type}.ref.detok -i ${ckpt_prefix}.${type}.hyp.detok
done
exit 0

@ -0,0 +1,40 @@
#!/bin/bash
if [ $# != 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ckpt_path=$3
mkdir -p exp
# seed may break model convergence
seed=0
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--checkpoint_path "${ckpt_path}" \
--seed ${seed}
status=$?
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi
if [ ${status} -ne 0 ]; then
echo "Failed in training!"
exit 1
fi
exit 0

@ -0,0 +1,29 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${MAIN_ROOT}/tools/moses/scripts/tokenizer:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
if ! which tokenizer.perl > /dev/null; then
echo "Error: moses is required in this example." >&2
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && git clone https://github.com/moses-smt/mosesdecoder.git moses" >&2
return 1
fi
MODEL=u2_st
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present; cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh

@ -0,0 +1,39 @@
#!/bin/bash
set -e
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
gpus=0,1,2,3
stage=0
stop_stage=3
conf_path=conf/transformer_es.yaml
decode_conf_path=conf/tuning/decode.yaml
must_c_path=
lang=es
avg_num=5
ckpt_path= # (finetune from FAT-ST or ASR pretrained model)
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh --tgt_lang ${lang} --must_c ${must_c_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${lang} || exit -1
fi

@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/steps

@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/utils

@ -198,10 +198,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
x=(${feat_tr_dir} ${feat_dt_dir} ${feat_trans_dir})
y=(train dev test)
echo "stage 3: Format the Json Data"
python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train
python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev
python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test
for (( i=0; i<${#x[*]}; ++i)); do
python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
--json-file ${x[$i]}/data_${bpemode}${nbpe}.json \
--manifest-file data/manifest.${y[$i]}
done
fi
echo "Ted En-Zh Data preparation done."
exit 0

@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
import sys
from collections import OrderedDict
from typing import List
from typing import Optional
from typing import Union
@ -130,7 +132,7 @@ class ASRExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog='paddlespeech.asr', add_help=True)
self.parser.add_argument(
'--input', type=str, required=True, help='Audio file to recognize.')
'--input', type=str, default=None, help='Audio file to recognize.')
self.parser.add_argument(
'--model',
type=str,
@ -180,6 +182,11 @@ class ASRExecutor(BaseExecutor):
type=str,
default=paddle.get_device(),
help='Choose device to execute model inference.')
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job result into file.')
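# Note: ast.literal_eval parses this flag, so pass a Python literal on the
# command line, e.g. --job_dump_result True (illustrative usage; the same
# applies to the other executors below).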
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -469,19 +476,31 @@ class ASRExecutor(BaseExecutor):
sample_rate = parser_args.sample_rate
config = parser_args.config
ckpt_path = parser_args.ckpt_path
audio_file = parser_args.input
decode_method = parser_args.decode_method
force_yes = parser_args.yes
device = parser_args.device
job_dump_result = parser_args.job_dump_result
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
try:
res = self(audio_file, model, lang, sample_rate, config, ckpt_path,
res = self(input_, model, lang, sample_rate, config, ckpt_path,
decode_method, force_yes, device)
logger.info('ASR Result: {}'.format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
job_dump_result)
if has_exceptions:
return False
else:
return True
@stats_wrapper
def __call__(self,

@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from collections import OrderedDict
from typing import List
from typing import Optional
from typing import Union
@ -77,7 +79,7 @@ class CLSExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog='paddlespeech.cls', add_help=True)
self.parser.add_argument(
'--input', type=str, required=True, help='Audio file to classify.')
'--input', type=str, default=None, help='Audio file to classify.')
self.parser.add_argument(
'--model',
type=str,
@ -109,6 +111,11 @@ class CLSExecutor(BaseExecutor):
type=str,
default=paddle.get_device(),
help='Choose device to execute model inference.')
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job result into file.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -214,7 +221,7 @@ class CLSExecutor(BaseExecutor):
ret = ''
for idx in topk_idx:
label, score = self._label_list[idx], result[idx]
ret += f'{label}: {score}\n'
ret += f'{label} {score} '
return ret
def postprocess(self, topk: int) -> Union[str, os.PathLike]:
@ -234,18 +241,30 @@ class CLSExecutor(BaseExecutor):
label_file = parser_args.label_file
cfg_path = parser_args.config
ckpt_path = parser_args.ckpt_path
audio_file = parser_args.input
topk = parser_args.topk
device = parser_args.device
job_dump_result = parser_args.job_dump_result
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
try:
res = self(audio_file, model_type, cfg_path, ckpt_path, label_file,
res = self(input_, model_type, cfg_path, ckpt_path, label_file,
topk, device)
logger.info('CLS Result:\n{}'.format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
job_dump_result)
if has_exceptions:
return False
else:
return True
@stats_wrapper
def __call__(self,
@ -259,7 +278,7 @@ class CLSExecutor(BaseExecutor):
"""
Python API to call an executor.
"""
audio_file = os.path.abspath(audio_file)
audio_file = os.path.abspath(os.path.expanduser(audio_file))
paddle.set_device(device)
self._init_from_path(model, config, ckpt_path, label_file)
self.preprocess(audio_file)

@ -12,14 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from abc import ABC
from abc import abstractmethod
from collections import OrderedDict
from typing import Any
from typing import Dict
from typing import List
from typing import Union
import paddle
from .log import logger
class BaseExecutor(ABC):
"""
@ -27,8 +32,8 @@ class BaseExecutor(ABC):
"""
def __init__(self):
self._inputs = dict()
self._outputs = dict()
self._inputs = OrderedDict()
self._outputs = OrderedDict()
@abstractmethod
def _get_pretrained_path(self, tag: str) -> os.PathLike:
@ -100,3 +105,107 @@ class BaseExecutor(ABC):
Python API to call an executor.
"""
pass
def get_task_source(self, input_: Union[str, os.PathLike, None]
) -> Dict[str, Union[str, os.PathLike]]:
"""
Get task input source from command line input.
Args:
input_ (Union[str, os.PathLike, None]): Input from command line.
Returns:
Dict[str, Union[str, os.PathLike]]: A dict with ids and inputs.
"""
if self._is_job_input(input_):
ret = self._get_job_contents(input_)
else:
ret = OrderedDict()
if input_ is None: # Take input from stdin
for i, line in enumerate(sys.stdin):
line = line.strip()
if len(line.split(' ')) == 1:
ret[str(i + 1)] = line
elif len(line.split(' ')) == 2:
id_, info = line.split(' ')
ret[id_] = info
else: # No valid input info from one line.
continue
else:
ret[1] = input_
return ret
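# For illustration (assumed stdin handling): piping a bare line "a.wav" maps to
# {'1': 'a.wav'}, while a line "utt1 a.wav" keeps its explicit id: {'utt1': 'a.wav'}.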
def process_task_results(self,
input_: Union[str, os.PathLike, None],
results: Dict[str, os.PathLike],
job_dump_result: bool=False):
"""
Handle task results and redirect stdout if needed.
Args:
input_ (Union[str, os.PathLike, None]): Input from command line.
results (Dict[str, os.PathLike]): Task outputs.
job_dump_result (bool, optional): if True, dumps job results into file. Defaults to False.
"""
raw_text = self._format_task_results(results)
print(raw_text, end='')
if self._is_job_input(input_) and job_dump_result:
try:
job_output_file = os.path.abspath(input_) + '.done'
sys.stdout = open(job_output_file, 'w')
print(raw_text, end='')
logger.info(f'Results have been saved to: {job_output_file}')
finally:
sys.stdout.close()
sys.stdout = sys.__stdout__  # restore stdout after redirecting it to the job file
def _is_job_input(self, input_: Union[str, os.PathLike]) -> bool:
"""
Check if current input file is a job input or not.
Args:
input_ (Union[str, os.PathLike]): Input file of current task.
Returns:
bool: return `True` for job input, `False` otherwise.
"""
return input_ and os.path.isfile(input_) and input_.endswith('.job')
def _get_job_contents(
self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]:
"""
Read a job input file and return its contents in a dictionary.
Args:
job_input (os.PathLike): The job input file.
Returns:
Dict[str, str]: Contents of job input.
"""
job_contents = OrderedDict()
with open(job_input) as f:
for line in f:
line = line.strip()
if not line:
continue
k, v = line.split(' ')
job_contents[k] = v
return job_contents
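# For illustration (a hypothetical job file): "inputs.job" containing
#   utt1 /data/a.wav
#   utt2 /data/b.wav
# is read as OrderedDict([('utt1', '/data/a.wav'), ('utt2', '/data/b.wav')]).
# Note that values must not contain spaces, since line.split(' ') expects
# exactly two fields per line.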
def _format_task_results(
self, results: Dict[str, Union[str, os.PathLike]]) -> str:
"""
Convert task results to raw text.
Args:
results (Dict[str, str]): A dictionary of task results.
Returns:
str: A string object containing the task results.
"""
ret = ''
for k, v in results.items():
ret += f'{k} {v}\n'
return ret

@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
import subprocess
from collections import OrderedDict
from typing import List
from typing import Optional
from typing import Union
@ -69,7 +71,7 @@ class STExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog="paddlespeech.st", add_help=True)
self.parser.add_argument(
"--input", type=str, required=True, help="Audio file to translate.")
"--input", type=str, default=None, help="Audio file to translate.")
self.parser.add_argument(
"--model",
type=str,
@ -107,6 +109,11 @@ class STExecutor(BaseExecutor):
type=str,
default=paddle.get_device(),
help="Choose device to execute model inference.")
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job result into file.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -319,17 +326,29 @@ class STExecutor(BaseExecutor):
sample_rate = parser_args.sample_rate
config = parser_args.config
ckpt_path = parser_args.ckpt_path
audio_file = parser_args.input
device = parser_args.device
job_dump_result = parser_args.job_dump_result
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
try:
res = self(audio_file, model, src_lang, tgt_lang, sample_rate,
res = self(input_, model, src_lang, tgt_lang, sample_rate,
config, ckpt_path, device)
logger.info("ST Result: {}".format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
job_dump_result)
if has_exceptions:
return False
else:
return True
@stats_wrapper
def __call__(self,

@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
import re
from collections import OrderedDict
from typing import List
from typing import Optional
from typing import Union
@ -80,7 +82,7 @@ class TextExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog='paddlespeech.text', add_help=True)
self.parser.add_argument(
'--input', type=str, required=True, help='Input text.')
'--input', type=str, default=None, help='Input text.')
self.parser.add_argument(
'--task',
type=str,
@ -119,6 +121,11 @@ class TextExecutor(BaseExecutor):
type=str,
default=paddle.get_device(),
help='Choose device to execute model inference.')
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job results into a file.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -256,7 +263,6 @@ class TextExecutor(BaseExecutor):
"""
parser_args = self.parser.parse_args(argv)
text = parser_args.input
task = parser_args.task
model_type = parser_args.model
lang = parser_args.lang
@ -264,15 +270,28 @@ class TextExecutor(BaseExecutor):
ckpt_path = parser_args.ckpt_path
punc_vocab = parser_args.punc_vocab
device = parser_args.device
job_dump_result = parser_args.job_dump_result
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
try:
res = self(text, task, model_type, lang, cfg_path, ckpt_path,
res = self(input_, task, model_type, lang, cfg_path, ckpt_path,
punc_vocab, device)
logger.info('Text Result:\n{}'.format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
job_dump_result)
if has_exceptions:
return False
else:
return True
@stats_wrapper
def __call__(

@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from collections import OrderedDict
from typing import Any
from typing import List
from typing import Optional
@ -298,7 +300,7 @@ class TTSExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog='paddlespeech.tts', add_help=True)
self.parser.add_argument(
'--input', type=str, required=True, help='Input text to generate.')
'--input', type=str, default=None, help='Input text to generate.')
# acoustic model
self.parser.add_argument(
'--am',
@ -397,6 +399,11 @@ class TTSExecutor(BaseExecutor):
self.parser.add_argument(
'--output', type=str, default='output.wav', help='output file name')
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job results into a file.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -671,7 +678,6 @@ class TTSExecutor(BaseExecutor):
args = self.parser.parse_args(argv)
text = args.input
am = args.am
am_config = args.am_config
am_ckpt = args.am_ckpt
@ -686,12 +692,24 @@ class TTSExecutor(BaseExecutor):
voc_stat = args.voc_stat
lang = args.lang
device = args.device
output = args.output
spk_id = args.spk_id
job_dump_result = args.job_dump_result
task_source = self.get_task_source(args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
if len(task_source) > 1:
assert isinstance(args.output,
str) and args.output.endswith('.wav')
output = args.output.replace('.wav', f'_{id_}.wav')
else:
output = args.output
try:
res = self(
text=text,
text=input_,
# acoustic model related
am=am,
am_config=am_config,
@ -710,11 +728,17 @@ class TTSExecutor(BaseExecutor):
lang=lang,
device=device,
output=output)
logger.info('Wave file has been generated: {}'.format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(args.input, task_results, job_dump_result)
if has_exceptions:
return False
else:
return True
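When a job contains several tasks, one `--output` name cannot hold every generated wave file, so the executor splices the task id into the name (`output.wav` becomes `output_utt1.wav`, and so on). A small sketch of that derivation, using the hypothetical helper `derive_output`:

```python
def derive_output(output: str, task_id: str, multi: bool) -> str:
    """Derive a per-task output path when running a multi-task job."""
    if not multi:
        return output
    assert isinstance(output, str) and output.endswith('.wav')
    return output.replace('.wav', f'_{task_id}.wav')

print(derive_output('output.wav', 'utt1', multi=True))   # output_utt1.wav
print(derive_output('output.wav', 'utt1', multi=False))  # output.wav
```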
@stats_wrapper
def __call__(self,

@ -0,0 +1,33 @@
# PaddleSpeech Server Command Line
([简体中文](./README_cn.md)|English)
This is the simplest way to use PaddleSpeech Server, covering both the server and the client.
## PaddleSpeech Server
### Help
```bash
paddlespeech_server help
```
### Start the server
First, set the service-related configuration parameters in a file similar to `./conf/application.yaml`, along with the model configuration for the speech task used by the service (e.g. `./conf/tts/tts.yaml`).
Then start the service:
```bash
paddlespeech_server start --config_file ./conf/application.yaml
```
## PaddleSpeech Client
### Help
```bash
paddlespeech_client help
```
### Access speech recognition services
```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./tests/16_audio.wav
```
### Access text to speech services
```bash
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```
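Both services can also be reached over plain HTTP without the client CLI. The sketch below posts to the ASR endpoint with the same payload fields the client sends; the server address and audio path are placeholders:

```python
import base64
import json

import requests

# Placeholder server address and audio path; adjust to your deployment.
url = "http://127.0.0.1:8090/paddlespeech/asr"
with open("./tests/16_audio.wav", "rb") as f:
    audio = base64.b64encode(f.read()).decode("utf-8")

data = {
    "audio": audio,
    "audio_format": "wav",
    "sample_rate": 16000,
    "lang": "zh_cn",
}
r = requests.post(url=url, data=json.dumps(data))
print(r.json())
```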

@ -0,0 +1,32 @@
# PaddleSpeech Server Command Line Tool
(Simplified Chinese|[English](./README.md))
It provides the simplest way to use the PaddleSpeech speech services: with a single command you can start the server and call a service.
## Server Command Line Usage
### Help
```bash
paddlespeech_server help
```
### Start the server
First, set up the service-related configuration file, similar to `./conf/application.yaml`, and the model configuration for the speech task used by the service, similar to `./conf/tts/tts.yaml`.
Then start the service:
```bash
paddlespeech_server start --config_file ./conf/application.yaml
```
## Client Command Line Usage
### Help
```bash
paddlespeech_client help
```
### Access the speech recognition service
```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
```
### Access the text-to-speech service
```bash
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```

@ -24,6 +24,7 @@ import numpy as np
import requests
import soundfile
from ..executor import BaseExecutor
from ..util import cli_client_register
from paddlespeech.server.utils.audio_process import wav2pcm
from paddlespeech.server.utils.util import wav2base64
@ -33,7 +34,7 @@ __all__ = ['TTSClientExecutor', 'ASRClientExecutor']
@cli_client_register(
name='paddlespeech_client.tts', description='visit tts service')
class TTSClientExecutor():
class TTSClientExecutor(BaseExecutor):
def __init__(self):
super().__init__()
self.parser = argparse.ArgumentParser()
@ -42,7 +43,7 @@ class TTSClientExecutor():
self.parser.add_argument(
'--port', type=int, default=8090, help='server port')
self.parser.add_argument(
'--text',
'--input',
type=str,
default="你好,欢迎使用语音合成服务",
help='A sentence to be synthesized')
@ -60,20 +61,20 @@ class TTSClientExecutor():
self.parser.add_argument(
'--output',
type=str,
default="./out.wav",
default="./output.wav",
help='Synthesized audio file')
# Request and response
def tts_client(self, args):
""" Request and response
Args:
text: A sentence to be synthesized
input: A sentence to be synthesized
output: Path of the synthesized audio file
"""
url = 'http://' + args.server_ip + ":" + str(
args.port) + '/paddlespeech/tts'
request = {
"text": args.text,
"text": args.input,
"spk_id": args.spk_id,
"speed": args.speed,
"volume": args.volume,
@ -119,7 +120,7 @@ class TTSClientExecutor():
@cli_client_register(
name='paddlespeech_client.asr', description='visit asr service')
class ASRClientExecutor():
class ASRClientExecutor(BaseExecutor):
def __init__(self):
super().__init__()
self.parser = argparse.ArgumentParser()
@ -128,29 +129,34 @@ class ASRClientExecutor():
self.parser.add_argument(
'--port', type=int, default=8090, help='server port')
self.parser.add_argument(
'--audio_file',
'--input',
type=str,
default="./paddlespeech/server/tests/16_audio.wav",
help='Audio file to be recognized')
self.parser.add_argument(
'--sample_rate', type=int, default=16000, help='audio sample rate')
self.parser.add_argument(
'--lang', type=str, default="zh_cn", help='language')
self.parser.add_argument(
'--audio_format', type=str, default="wav", help='audio format')
def execute(self, argv: List[str]) -> bool:
args = self.parser.parse_args(argv)
url = 'http://' + args.server_ip + ":" + str(
args.port) + '/paddlespeech/asr'
audio = wav2base64(args.audio_file)
audio = wav2base64(args.input)
data = {
"audio": audio,
"audio_format": "wav",
"audio_format": args.audio_format,
"sample_rate": args.sample_rate,
"lang": "zh_cn",
"lang": args.lang,
}
time_start = time.time()
try:
r = requests.post(url=url, data=json.dumps(data))
# end timestamp
time_end = time.time()
print(r.json())
print('time cost', time_end - time_start, 's')
except Exception as e:
print(f"Failed to access the speech recognition service: {e}")

@ -17,6 +17,7 @@ from typing import List
import uvicorn
from fastapi import FastAPI
from ..executor import BaseExecutor
from ..util import cli_server_register
from paddlespeech.server.engine.engine_factory import EngineFactory
from paddlespeech.server.restful.api import setup_router
@ -29,8 +30,8 @@ app = FastAPI(
@cli_server_register(
name='paddlespeech_server.server', description='Start the service')
class ServerExecutor():
name='paddlespeech_server.start', description='Start the service')
class ServerExecutor(BaseExecutor):
def __init__(self):
super().__init__()
self.parser = argparse.ArgumentParser()
@ -48,10 +49,8 @@ class ServerExecutor():
def init(self, config) -> bool:
"""system initialization
Args:
config (CfgNode): config object
Returns:
bool:
"""
@ -75,4 +74,4 @@ class ServerExecutor():
config = get_config(args.config_file)
if self.init(config):
uvicorn.run(app, host=config.host, port=config.port, debug=True)
uvicorn.run(app, host=config.host, port=config.port, debug=True)

@ -0,0 +1,38 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from abc import ABC
from abc import abstractmethod
from typing import List
class BaseExecutor(ABC):
"""
An abstract executor of paddlespeech server tasks.
"""
def __init__(self):
self.parser = argparse.ArgumentParser()
@abstractmethod
def execute(self, argv: List[str]) -> bool:
"""
Command line entry. This method can only be accessed by a command line such as `paddlespeech asr`.
Args:
argv (List[str]): Arguments from command line.
Returns:
int: Result of the command execution. `True` for a success and `False` for a failure.
"""
pass
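As a sketch of how a concrete executor would plug into this base class, assuming the `BaseExecutor` above is in scope; the subclass name and behavior are hypothetical:

```python
import argparse
from typing import List

class EchoExecutor(BaseExecutor):
    """Toy executor that prints its --input argument back."""

    def __init__(self):
        super().__init__()
        self.parser = argparse.ArgumentParser(prog='paddlespeech_server.echo')
        self.parser.add_argument('--input', type=str, default=None)

    def execute(self, argv: List[str]) -> bool:
        args = self.parser.parse_args(argv)
        print(args.input)
        return True

EchoExecutor().execute(['--input', 'hello'])  # prints "hello", returns True
```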

@ -0,0 +1,59 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import json
import time
import requests
def readwav2base64(wav_file):
"""
read wave file and covert to base64 string
"""
with open(wav_file, 'rb') as f:
base64_bytes = base64.b64encode(f.read())
base64_string = base64_bytes.decode('utf-8')
return base64_string
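On the receiving side the payload is decoded back to bytes with the standard-library inverse; a quick round-trip check with stand-in bytes:

```python
import base64

original = b'RIFF....WAVEfmt '  # stand-in for real wave-file bytes
encoded = base64.b64encode(original).decode('utf-8')
assert base64.b64decode(encoded) == original  # lossless round trip
```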
def main():
"""
main func
"""
url = "http://127.0.0.1:8090/paddlespeech/asr"
# start timestamp
time_start = time.time()
test_audio_file = "./16_audio.wav"
audio = readwav2base64(test_audio_file)
data = {
"audio": audio,
"audio_format": "wav",
"sample_rate": 16000,
"lang": "zh_cn",
}
r = requests.post(url=url, data=json.dumps(data))
# end timestamp
time_end = time.time()
print('time cost', time_end - time_start, 's')
print(r.json())
if __name__ == "__main__":
main()