commit b8f16ac9b0
@@ -0,0 +1,2 @@
--sample-frequency=16000
--num-mel-bins=80
@@ -0,0 +1 @@
--sample-frequency=16000
@@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.de.train
dev_manifest: data/manifest.de.dev
test_manifest: data/manifest.de.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-de.de_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-de.de_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1


############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
    linear_units: 2048 # the number of units of position-wise feed forward
    num_blocks: 12 # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false


###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
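
A note on two of the values above: `feat_dim: 83` matches the Kaldi features generated in `local/data.sh` (80-dimensional fbank, per the `--num-mel-bins=80` conf above, plus 3 pitch features from `steps/make_fbank_pitch.sh`), and under `scheduler: noam` the `lr: 2.5` entry acts as a scale factor rather than a literal learning rate. A minimal sketch of the standard Noam schedule, assuming PaddleSpeech's `noam` scheduler follows the usual Transformer formulation with `d_model = output_size = 256`:

```python
# Sketch of the standard Noam learning-rate schedule (assumption: the
# `scheduler: noam` entry above follows the original Transformer recipe).
def noam_lr(step: int, lr_factor: float = 2.5, d_model: int = 256,
            warmup_steps: int = 25000) -> float:
    """LR grows linearly for warmup_steps, then decays as step**-0.5."""
    step = max(step, 1)  # avoid 0**-0.5 at the very first step
    return lr_factor * d_model**-0.5 * min(step**-0.5,
                                           step * warmup_steps**-1.5)

print(noam_lr(25000))  # peak LR at the warmup boundary, ~9.9e-4 here
```

The remaining seven configs in this commit are identical apart from the target-language code.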
@@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.es.train
dev_manifest: data/manifest.es.dev
test_manifest: data/manifest.es.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-es.es_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-es.es_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1


############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
    linear_units: 2048 # the number of units of position-wise feed forward
    num_blocks: 12 # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false


###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.fr.train
dev_manifest: data/manifest.fr.dev
test_manifest: data/manifest.fr.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-fr.fr_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-fr.fr_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1


############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
    linear_units: 2048 # the number of units of position-wise feed forward
    num_blocks: 12 # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false


###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.it.train
dev_manifest: data/manifest.it.dev
test_manifest: data/manifest.it.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-it.it_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-it.it_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1


############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
    linear_units: 2048 # the number of units of position-wise feed forward
    num_blocks: 12 # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false


###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.nl.train
dev_manifest: data/manifest.nl.dev
test_manifest: data/manifest.nl.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-nl.nl_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-nl.nl_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1


############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
    linear_units: 2048 # the number of units of position-wise feed forward
    num_blocks: 12 # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false


###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.pt.train
dev_manifest: data/manifest.pt.dev
test_manifest: data/manifest.pt.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-pt.pt_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-pt.pt_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1


############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
    linear_units: 2048 # the number of units of position-wise feed forward
    num_blocks: 12 # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false


###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.ro.train
dev_manifest: data/manifest.ro.dev
test_manifest: data/manifest.ro.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-ro.ro_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-ro.ro_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1


############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
    linear_units: 2048 # the number of units of position-wise feed forward
    num_blocks: 12 # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false


###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.ru.train
dev_manifest: data/manifest.ru.dev
test_manifest: data/manifest.ru.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-ru.ru_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-ru.ru_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1


############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
    linear_units: 2048 # the number of units of position-wise feed forward
    num_blocks: 12 # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false


###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1,19 @@
[
    {
        "type": "specaug",
        "params": {
            "W": 5,
            "warp_mode": "PIL",
            "F": 30,
            "n_freq_masks": 2,
            "T": 40,
            "n_time_masks": 2,
            "p": 1.0,
            "adaptive_number_ratio": 0,
            "adaptive_size_ratio": 0,
            "max_n_time_masks": 20,
            "replace_with_zero": false
        },
        "prob": 1.0
    }
]
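
These `specaug` parameters are SpecAugment-style masking options. A NumPy sketch of the two masking operations, assuming the usual SpecAugment semantics for `F`/`n_freq_masks` and `T`/`n_time_masks` (the `W`/`warp_mode` time-warping step and the adaptive options are omitted):

```python
import numpy as np

def mask_spectrogram(x: np.ndarray, F: int = 30, n_freq_masks: int = 2,
                     T: int = 40, n_time_masks: int = 2,
                     replace_with_zero: bool = False) -> np.ndarray:
    """Apply SpecAugment-style masks to a (time, freq) spectrogram."""
    x = x.copy()
    fill = 0.0 if replace_with_zero else x.mean()
    n_frames, n_freq = x.shape
    for _ in range(n_freq_masks):      # frequency masks of width [0, F)
        f = np.random.randint(0, F)
        f0 = np.random.randint(0, max(1, n_freq - f))
        x[:, f0:f0 + f] = fill
    for _ in range(n_time_masks):      # time masks of width [0, T)
        t = np.random.randint(0, T)
        t0 = np.random.randint(0, max(1, n_frames - t))
        x[t0:t0 + t, :] = fill
    return x
```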
@@ -0,0 +1,201 @@
#!/bin/bash

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#           2021 PaddlePaddle
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

set -e
set -u

stage=-1
stop_stage=10

tgt_lang=
# bpemode (unigram or bpe)
nbpe=8000
bpemode=bpe
must_c=
dumpdir=data/dump
do_delta=false
tgt_case=tc
src_case=lc.rm
source ${MAIN_ROOT}/utils/parse_options.sh

TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data

train_set=train_sp.en-${tgt_lang}.${tgt_lang}
train_dev=dev.en-${tgt_lang}.${tgt_lang}
trans_set=""
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
    trans_set="${trans_set} tst-COMMON.en-${lang}.${lang}"
done


if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    if [ ! -e ${must_c} ]; then
        echo "Error: Dataset is not available. Please download and unzip the dataset."
        echo "MuST-C v1 link: https://ict.fbk.eu/must-c/."
        exit 1
    fi
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: Data Preparation"
    for lang in $(echo ${tgt_lang} | tr '_' ' '); do
        local/data_prep.sh ${must_c} ${lang}
    done
fi

feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    ### Task dependent. You have to design training and dev sets by yourself.
    ### But you can utilize Kaldi recipes in most cases.
    echo "stage 1: Feature Generation"
    fbankdir=fbank
    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
    for lang in $(echo ${tgt_lang} | tr '_' ' '); do
        for x in train.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do
            steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
                data/${x} data/make_fbank/${x} ${fbankdir}
        done
    done

    # speed perturbation
    utils/perturb_data_dir_speed.sh 0.9 data/train.en-${tgt_lang} data/temp1.${tgt_lang}
    utils/perturb_data_dir_speed.sh 1.0 data/train.en-${tgt_lang} data/temp2.${tgt_lang}
    utils/perturb_data_dir_speed.sh 1.1 data/train.en-${tgt_lang} data/temp3.${tgt_lang}
    utils/combine_data.sh --extra-files utt2uniq data/train_sp.en-${tgt_lang} \
        data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
    rm -r data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
    utils/fix_data_dir.sh data/train_sp.en-${tgt_lang}
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
        data/train_sp.en-${tgt_lang} data/make_fbank/train_sp.en-${tgt_lang} ${fbankdir}
    for lang in en ${tgt_lang}; do
        awk -v p="sp0.9-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >data/train_sp.en-${tgt_lang}/text.tc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >data/train_sp.en-${tgt_lang}/text.lc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
        awk -v p="sp1.0-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
        awk -v p="sp1.1-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
    done

    # Divide into source and target languages
    for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do
        local/divide_lang.sh ${x} ${tgt_lang}
    done

    for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang}; do
        # remove utterances having more than 3000 frames
        # remove utterances having more than 400 characters
        for lang in ${tgt_lang} en; do
            remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${x}.${lang} data/${x}.${lang}.tmp
        done

        # Match the number of utterances between source and target languages:
        # extract common lines
        cut -f 1 -d " " data/${x}.en.tmp/text > data/${x}.${tgt_lang}.tmp/reclist1
        cut -f 1 -d " " data/${x}.${tgt_lang}.tmp/text > data/${x}.${tgt_lang}.tmp/reclist2
        comm -12 data/${x}.${tgt_lang}.tmp/reclist1 data/${x}.${tgt_lang}.tmp/reclist2 > data/${x}.en.tmp/reclist

        for lang in ${tgt_lang} en; do
            reduce_data_dir.sh data/${x}.${lang}.tmp data/${x}.en.tmp/reclist data/${x}.${lang}
            utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${x}.${lang}
        done
        rm -rf data/${x}.*.tmp
    done

    # compute global CMVN
    compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark

    # dump features for training
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then
        utils/create_split_dir.pl \
            /export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_set}/delta${do_delta}/storage \
            ${feat_tr_dir}/storage
    fi
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then
        utils/create_split_dir.pl \
            /export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_dev}/delta${do_delta}/storage \
            ${feat_dt_dir}/storage
    fi
    dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
        data/${train_set}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_set} ${feat_tr_dir}
    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
        data/${train_dev}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_dev} ${feat_dt_dir}
    for ttask in ${trans_set}; do
        feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}; mkdir -p ${feat_trans_dir}
        dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
            data/${ttask}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/trans/${ttask} \
            ${feat_trans_dir}
    done
fi

dict=data/lang_1spm/${train_set}_${bpemode}${nbpe}_units_${tgt_case}.txt
nlsyms=data/lang_1spm/${train_set}_non_lang_syms_${tgt_case}.txt
bpemodel=data/lang_1spm/${train_set}_${bpemode}${nbpe}_${tgt_case}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 2: Dictionary and Json Data Preparation"
    mkdir -p data/lang_1spm/
    export LC_ALL=C.UTF-8

    echo "make a non-linguistic symbol list for all languages"
    grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -o -P '&[^;]*;'| sort | uniq > ${nlsyms}
    cat ${nlsyms}

    echo "make a joint source and target dictionary"
    echo "<unk> 1" > ${dict} # <unk> must be 1; 0 will be used for "blank" in CTC
    offset=$(wc -l < ${dict})
    grep sp1.0 data/train_sp.en-${tgt_lang}.${tgt_lang}/text.${tgt_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' > data/lang_1spm/input_${tgt_lang}.txt
    grep sp1.0 data/train_sp.en-${tgt_lang}.en/text.${src_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' >> data/lang_1spm/input_${tgt_lang}.txt
    spm_train --user_defined_symbols="$(tr "\n" "," < ${nlsyms})" --input=data/lang_1spm/input_${tgt_lang}.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
    spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_1spm/input_${tgt_lang}.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
    wc -l ${dict}

    echo "make json files"
    data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
        data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
    data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
        data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
    for ttask in ${trans_set}; do
        feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
        data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
            data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
    done
    echo "update json (add source references)"
    for x in ${train_set} ${train_dev}; do
        feat_dir=${dumpdir}/${x}/delta${do_delta}
        data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-${tgt_lang}.en
        update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
            ${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json ${data_dir} ${dict}
    done
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    x=(${train_set} ${train_dev} ${trans_set})
    y=(train dev test)
    for (( i=0; i<${#x[*]}; ++i )); do
        echo ${x[$i]} ${y[$i]}
        feat_dir=${dumpdir}/${x[$i]}/delta${do_delta}
        data_dir=data/$(echo ${x[$i]} | cut -f 1 -d ".").en-${tgt_lang}.en
        python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
            --json-file ${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
            --manifest-file data/manifest.${tgt_lang}.${y[$i]}
        echo "Process done for the ${y[$i]} set from ${x[$i]}"
    done
fi


echo "MuST-C ${tgt_lang} data preparation done."
exit 0
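
Stage 2 above builds a joint source/target BPE vocabulary with `spm_train`/`spm_encode`, writing `<unk>` as id 1 and reserving id 0 for the CTC blank. A rough Python equivalent of the dictionary-numbering step, assuming the `sentencepiece` package and placeholder file names:

```python
import sentencepiece as spm

# Assumption: bpe.model stands in for the model trained by spm_train above.
sp = spm.SentencePieceProcessor(model_file="bpe.model")

pieces = set()
with open("input.txt", encoding="utf-8") as f:      # joint en + target text
    for line in f:
        pieces.update(sp.encode(line.strip(), out_type=str))

with open("dict.txt", "w", encoding="utf-8") as f:
    f.write("<unk> 1\n")                 # id 0 stays reserved for CTC blank
    for i, piece in enumerate(sorted(pieces), start=2):
        f.write(f"{piece} {i}\n")        # mirrors awk's NR+offset numbering
```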
@@ -0,0 +1,163 @@
#!/bin/bash

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

export LC_ALL=C

source ${MAIN_ROOT}/utils/parse_options.sh

if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <src-dir> <target-lang>"
    echo "e.g.: $0 /n/rd11/corpora_8/MUSTC_v1.0 target_lang"
    exit 1;
fi

tgt_lang=$2

for set in train dev tst-COMMON tst-HE; do
    src=$1/en-${tgt_lang}/data/${set}
    dst=data/local/en-${tgt_lang}/${set}

    [ ! -d ${src} ] && echo "$0: no such directory ${src}" && exit 1;

    wav_dir=${src}/wav
    trans_dir=${src}/txt
    yml=${trans_dir}/${set}.yaml
    en=${trans_dir}/${set}.en
    tgt=${trans_dir}/${set}.${tgt_lang}

    mkdir -p ${dst} || exit 1;

    [ ! -d ${wav_dir} ] && echo "$0: no such directory ${wav_dir}" && exit 1;
    [ ! -d ${trans_dir} ] && echo "$0: no such directory ${trans_dir}" && exit 1;
    [ ! -f ${yml} ] && echo "$0: expected file ${yml} to exist" && exit 1;
    [ ! -f ${en} ] && echo "$0: expected file ${en} to exist" && exit 1;
    [ ! -f ${tgt} ] && echo "$0: expected file ${tgt} to exist" && exit 1;

    wav_scp=${dst}/wav.scp; [[ -f "${wav_scp}" ]] && rm ${wav_scp}
    trans_en=${dst}/text.en; [[ -f "${trans_en}" ]] && rm ${trans_en}
    trans_tgt=${dst}/text.${tgt_lang}; [[ -f "${trans_tgt}" ]] && rm ${trans_tgt}
    utt2spk=${dst}/utt2spk; [[ -f "${utt2spk}" ]] && rm ${utt2spk}
    spk2utt=${dst}/spk2utt; [[ -f "${spk2utt}" ]] && rm ${spk2utt}
    segments=${dst}/segments; [[ -f "${segments}" ]] && rm ${segments}

    # error check
    n=$(cat ${yml} | grep duration | wc -l)
    n_en=$(cat ${en} | wc -l)
    n_tgt=$(cat ${tgt} | wc -l)
    [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} lines, found ${n_en}" && exit 1;
    [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} lines, found ${n_tgt}" && exit 1;

    # (1a) Transcriptions and translations preparation
    # make basic transcription file (add segments info)
    cp ${yml} ${dst}/.yaml0
    grep duration ${dst}/.yaml0 > ${dst}/.yaml1
    awk '{
        duration=$3; offset=$5; spkid=$7;
        gsub(",","",duration);
        gsub(",","",offset);
        gsub(",","",spkid);
        gsub("spk.","",spkid);
        duration=sprintf("%.7f", duration);
        if ( duration < 0.2 ) extendt=sprintf("%.7f", (0.2-duration)/2);
        else extendt=0;
        offset=sprintf("%.7f", offset);
        startt=offset-extendt;
        endt=offset+duration+extendt;
        printf("ted_%05d_%07.0f_%07.0f\n", spkid, int(1000*startt+0.5), int(1000*endt+0.5));
    }' ${dst}/.yaml1 > ${dst}/.yaml2
    # NOTE: Extend the lengths of short utterances (< 0.2s) rather than exclude them

    cp ${en} ${dst}/en.org
    cp ${tgt} ${dst}/${tgt_lang}.org

    for lang in en ${tgt_lang}; do
        # normalize punctuation
        normalize-punctuation.perl -l ${lang} < ${dst}/${lang}.org > ${dst}/${lang}.norm

        # lowercasing
        lowercase.perl < ${dst}/${lang}.norm > ${dst}/${lang}.norm.lc
        cp ${dst}/${lang}.norm ${dst}/${lang}.norm.tc

        # remove punctuation
        local/remove_punctuation.pl < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.rm

        # tokenization
        tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.tc > ${dst}/${lang}.norm.tc.tok
        tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.tok
        tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc.rm > ${dst}/${lang}.norm.lc.rm.tok

        paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.tc.tok | sort > ${dst}/text.tc.${lang}
        paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.tok | sort > ${dst}/text.lc.${lang}
        paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.rm.tok | sort > ${dst}/text.lc.rm.${lang}

        # save original and cleaned punctuation
        lowercase.perl < ${dst}/${lang}.org | text2token.py -s 0 -n 1 | tr " " "\n" \
            | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.${lang}
        lowercase.perl < ${dst}/${lang}.norm.tc | text2token.py -s 0 -n 1 | tr " " "\n" \
            | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.clean.${lang}
    done

    # error check
    n=$(cat ${dst}/.yaml2 | wc -l)
    n_en=$(cat ${dst}/en.norm.tc.tok | wc -l)
    n_tgt=$(cat ${dst}/${tgt_lang}.norm.tc.tok | wc -l)
    [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} lines, found ${n_en}" && exit 1;
    [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} lines, found ${n_tgt}" && exit 1;

    # (1c) Make segments files from transcript
    # segments file format: utt-id spk-id start-time end-time, e.g.:
    # ted_00001_0003501_0003684 ted_00001 3.50 3.68
    awk '{
        segment=$1; split(segment,S,"[_]");
        spkid=S[1] "_" S[2]; startf=S[3]; endf=S[4];
        printf("%s %s %.2f %.2f\n", segment, spkid, startf/1000, endf/1000);
    }' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/segments

    awk '{
        segment=$1; split(segment,S,"[_]");
        spkid=S[1] "_" S[2];
        printf("%s cat '${wav_dir}'/%s_%d.wav |\n", spkid, S[1], S[2]);
    }' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/wav.scp

    awk '{
        segment=$1; split(segment,S,"[_]");
        spkid=S[1] "_" S[2]; print $1 " " spkid
    }' ${dst}/segments | uniq | sort > ${dst}/utt2spk

    cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort > ${dst}/spk2utt

    # error check
    n_en=$(cat ${dst}/text.tc.en | wc -l)
    n_tgt=$(cat ${dst}/text.tc.${tgt_lang} | wc -l)
    [ ${n_en} -ne ${n_tgt} ] && echo "Warning: expected ${n_en} lines, found ${n_tgt}" && exit 1;

    # Copy stuff into its final locations [this has been moved from the format_data script]
    mkdir -p data/${set}.en-${tgt_lang}

    # remove duplicated utterances (the same offset)
    echo "remove duplicate lines..."
    cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted' \
        | sed 's/^[ \t]*//' > ${dst}/duplicate_lines
    cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted' \
        | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
    reduce_data_dir.sh ${dst} ${dst}/reclist data/${set}.en-${tgt_lang}
    for l in en ${tgt_lang}; do
        for case in tc lc lc.rm; do
            cp ${dst}/text.${case}.${l} data/${set}.en-${tgt_lang}/text.${case}.${l}
        done
    done
    utils/fix_data_dir.sh --utt_extra_files \
        "text.tc.en text.lc.en text.lc.rm.en text.tc.${tgt_lang} text.lc.${tgt_lang} text.lc.rm.${tgt_lang}" \
        data/${set}.en-${tgt_lang}

    # error check
    n_seg=$(cat data/${set}.en-${tgt_lang}/segments | wc -l)
    n_text=$(cat data/${set}.en-${tgt_lang}/text.tc.${tgt_lang} | wc -l)
    [ ${n_seg} -ne ${n_text} ] && echo "Warning: expected ${n_seg} lines, found ${n_text}" && exit 1;

    echo "$0: successfully prepared data in ${dst}"
done
@@ -0,0 +1,52 @@
#!/bin/bash

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh

if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <set> <lang>"
    echo "e.g.: $0 dev es"
    exit 1
fi

set=$1
lang=$2
export LC_ALL=en_US.UTF-8
# Copy stuff into its final locations [this has been moved from the format_data script]
# for En
mkdir -p data/${set}.en
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
    if [ -f data/${set}/${f} ]; then
        sort data/${set}/${f} > data/${set}.en/${f}
    fi
done
sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text # dummy
sort data/${set}/text.tc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.tc
sort data/${set}/text.lc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc
sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc.rm
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.en
if [ -f data/${set}.en/feats.scp ]; then
    utils/validate_data_dir.sh data/${set}.en || exit 1;
else
    utils/validate_data_dir.sh --no-feats --no-wav data/${set}.en || exit 1;
fi

# for target language
mkdir -p data/${set}.${lang}
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
    if [ -f data/${set}/${f} ]; then
        sort data/${set}/${f} > data/${set}.${lang}/${f}
    fi
done
sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text # dummy
sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.tc
sort data/${set}/text.lc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc
sort data/${set}/text.lc.rm.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc.rm
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.${lang}
if [ -f data/${set}.${lang}/feats.scp ]; then
    utils/validate_data_dir.sh data/${set}.${lang} || exit 1;
else
    utils/validate_data_dir.sh --no-feats --no-wav data/${set}.${lang} || exit 1;
fi
@@ -0,0 +1,25 @@
#!/usr/bin/perl

use warnings;
use strict;

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

while (<STDIN>) {
    $_ = " $_ ";

    # remove punctuation except apostrophe
    s/<space>/spacemark/g;  # for scoring
    s/'/apostrophe/g;
    s/[[:punct:]]//g;
    s/apostrophe/'/g;
    s/spacemark/<space>/g;  # for scoring

    # collapse and trim whitespace
    s/\s+/ /g;
    s/^\s+//;
    s/\s+$//;

    print "$_\n";
}
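
The trick in this script is to shelter apostrophes and the literal `<space>` scoring token behind placeholder words before stripping the `[[:punct:]]` class, then restore them. A line-for-line Python sketch of the same logic:

```python
import re
import string

def remove_punctuation(line: str) -> str:
    """Strip punctuation except apostrophes, preserving <space> marks."""
    line = line.replace("<space>", "spacemark")   # shelter scoring token
    line = line.replace("'", "apostrophe")        # shelter apostrophes
    line = re.sub(f"[{re.escape(string.punctuation)}]", "", line)
    line = line.replace("apostrophe", "'")
    line = line.replace("spacemark", "<space>")
    return re.sub(r"\s+", " ", line).strip()      # collapse and trim spaces
```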
@@ -0,0 +1,48 @@
#!/usr/bin/env bash

if [ $# != 4 ]; then
    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix lang"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
decode_config_path=$2
ckpt_prefix=$3
tgt_lang=$4

for type in fullsentence; do
    echo "decoding ${type}"
    python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
        --opts decode.decoding_method ${type}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
    python3 ${MAIN_ROOT}/utils/rsl2trn.py --rsl ${ckpt_prefix}.${type}.rsl \
        --hyp ${ckpt_prefix}.${type}.hyp \
        --ref ${ckpt_prefix}.${type}.ref
    if ! which tokenizer.perl > /dev/null; then
        echo "Error: it seems that moses is not installed." >&2
        echo "Error: please install moses as follows." >&2
        echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
        exit 1
    fi
    detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.hyp > ${ckpt_prefix}.${type}.hyp.detok
    detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.ref > ${ckpt_prefix}.${type}.ref.detok
    echo "Detokenized BLEU:"
    sacrebleu ${ckpt_prefix}.${type}.ref.detok -i ${ckpt_prefix}.${type}.hyp.detok
done

exit 0
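
The final step shells out to `sacrebleu` to score detokenized hypotheses against detokenized references. The same number can be reproduced from Python; a small sketch, assuming the `sacrebleu` package and placeholder file names with one sentence per line:

```python
import sacrebleu

with open("decode.hyp.detok", encoding="utf-8") as f:
    hyps = [line.strip() for line in f]
with open("decode.ref.detok", encoding="utf-8") as f:
    refs = [line.strip() for line in f]

# corpus_bleu takes the hypothesis list plus a list of reference streams.
bleu = sacrebleu.corpus_bleu(hyps, [refs])
print(bleu.score)
```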
@@ -0,0 +1,40 @@
#!/bin/bash

if [ $# != 3 ]; then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_name=$2
ckpt_path=$3

mkdir -p exp

# seed may break model convergence
seed=0
if [ ${seed} != 0 ]; then
    export FLAGS_cudnn_deterministic=True
fi

python3 -u ${BIN_DIR}/train.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --output exp/${ckpt_name} \
    --checkpoint_path "${ckpt_path}" \
    --seed ${seed}
status=$?  # capture the training exit code before it is clobbered below

if [ ${seed} != 0 ]; then
    unset FLAGS_cudnn_deterministic
fi

if [ ${status} -ne 0 ]; then
    echo "Failed in training!"
    exit 1
fi

exit 0
@@ -0,0 +1,29 @@
export MAIN_ROOT=$(realpath ${PWD}/../../../)

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${MAIN_ROOT}/tools/moses/scripts/tokenizer:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

if ! which tokenizer.perl > /dev/null; then
    echo "Error: moses is required in this example." >&2
    echo "Error: it seems that moses is not installed." >&2
    echo "Error: please install moses as follows." >&2
    echo "Error: cd ${MAIN_ROOT}/tools && git clone https://github.com/moses-smt/mosesdecoder.git moses" >&2
    return 1
fi

MODEL=u2_st
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present; cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
@@ -0,0 +1,39 @@
#!/bin/bash
set -e
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

gpus=0,1,2,3
stage=0
stop_stage=3
conf_path=conf/transformer_es.yaml
decode_conf_path=conf/tuning/decode.yaml
must_c_path=
lang=es
avg_num=5
ckpt_path=  # (finetune from a FAT-ST or ASR pretrained model)
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh --tgt_lang ${lang} --must_c ${must_c_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model; all checkpoints are saved under the `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # average the n best models
    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${lang} || exit -1
fi
@@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/steps
@@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/utils
@@ -0,0 +1,33 @@
# PaddleSpeech Server Command Line

([简体中文](./README_cn.md)|English)

The simplest way to use PaddleSpeech Server, covering both the server and the client.

## PaddleSpeech Server
### Help
```bash
paddlespeech_server help
```
### Start the server
First, set the service-related configuration parameters in a file such as `./conf/application.yaml`.
Then start the service:
```bash
paddlespeech_server start --config_file ./conf/application.yaml
```

## PaddleSpeech Client
### Help
```bash
paddlespeech_client help
```
### Access speech recognition services
```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./tests/16_audio.wav
```
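
The ASR endpoint can also be called over plain HTTP. A minimal Python sketch based on the test client added in this commit (the wav path is a placeholder; the audio is sent base64-encoded):

```python
import base64
import json

import requests

with open("16_audio.wav", "rb") as f:  # placeholder path
    audio = base64.b64encode(f.read()).decode("utf-8")

data = {"audio": audio, "audio_format": "wav",
        "sample_rate": 16000, "lang": "zh_cn"}
r = requests.post("http://127.0.0.1:8090/paddlespeech/asr",
                  data=json.dumps(data))
print(r.json())
```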

### Access text to speech services
```bash
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```
@@ -0,0 +1,32 @@
# PaddleSpeech Server Command Line Tools

(Simplified Chinese|[English](./README.md))

It provides the simplest way to call the PaddleSpeech speech services: a single command is enough to start a service or to call one.

## Server Command Line Usage
### Help
```bash
paddlespeech_server help
```
### Start the server
First, set up the service configuration file, e.g. `./conf/application.yaml`, together with the model configuration for each speech task it references, e.g. `./conf/tts/tts.yaml`.
Then start the service:
```bash
paddlespeech_server start --config_file ./conf/application.yaml
```

## Client Command Line Usage
### Help
```bash
paddlespeech_client help
```
### Access the speech recognition service
```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
```

### Access the text-to-speech service
```bash
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```
@@ -0,0 +1,38 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from abc import ABC
from abc import abstractmethod
from typing import List


class BaseExecutor(ABC):
    """
    An abstract executor of paddlespeech server tasks.
    """

    def __init__(self):
        self.parser = argparse.ArgumentParser()

    @abstractmethod
    def execute(self, argv: List[str]) -> bool:
        """
        Command line entry. This method can only be accessed by a command line such as `paddlespeech asr`.

        Args:
            argv (List[str]): Arguments from the command line.

        Returns:
            bool: Result of the command execution. `True` for a success and `False` for a failure.
        """
        pass
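
A hypothetical subclass, to show how the abstract interface above is meant to be filled in (the class name and `--text` flag are illustrative, not part of this commit):

```python
class EchoExecutor(BaseExecutor):
    """Toy executor: echoes the --text argument and reports success."""

    def __init__(self):
        super().__init__()
        self.parser.add_argument("--text", type=str, default="")

    def execute(self, argv: List[str]) -> bool:
        args = self.parser.parse_args(argv)   # argv comes from the CLI entry
        print(args.text)
        return True
```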
Binary file not shown.
@@ -0,0 +1,59 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import json
import time

import requests


def readwav2base64(wav_file):
    """
    Read a wave file and convert it to a base64 string.
    """
    with open(wav_file, 'rb') as f:
        base64_bytes = base64.b64encode(f.read())
        base64_string = base64_bytes.decode('utf-8')
    return base64_string


def main():
    """
    main func
    """
    url = "http://127.0.0.1:8090/paddlespeech/asr"

    # start timestamp
    time_start = time.time()

    test_audio_dir = "./16_audio.wav"
    audio = readwav2base64(test_audio_dir)

    data = {
        "audio": audio,
        "audio_format": "wav",
        "sample_rate": 16000,
        "lang": "zh_cn",
    }

    r = requests.post(url=url, data=json.dumps(data))

    # end timestamp
    time_end = time.time()
    print('time cost', time_end - time_start, 's')

    print(r.json())


if __name__ == "__main__":
    main()