Merge pull request #1012 from zh794390558/datapipe

[asr] independent dataloader
pull/1019/head
Hui Zhang 3 years ago committed by GitHub
commit 6750770e54
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,7 +1,9 @@
# ASR # ASR
* s0 for deepspeech2 * asr0 - deepspeech2 Streaming/Non-Streaming
* s1 for u2/transformer/conformer * asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
## Data ## Data

@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \ --manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \ --spectrum_type="linear" \
--delta_delta=false \ --delta_delta=false \
--stride_ms=10.0 \ --stride_ms=10 \
--window_ms=20.0 \ --window_ms=20 \
--sample_rate=16000 \ --sample_rate=16000 \
--use_dB_normalization=True \ --use_dB_normalization=True \
--num_samples=2000 \ --num_samples=2000 \
@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do for dataset in train dev test; do
{ {
python3 ${MAIN_ROOT}/utils/format_data.py \ python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \ --cmvn_path "data/mean_std.json" \
--unit_type "char" \ --unit_type "char" \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \

@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |

@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt vocab_filepath: data/vocab.txt
unit_type: 'char' unit_type: 'char'
spm_model_prefix: '' spm_model_prefix: ''
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 32 batch_size: 32
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: conformer encoder: conformer

@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt vocab_filepath: data/vocab.txt
unit_type: 'char' unit_type: 'char'
spm_model_prefix: '' spm_model_prefix: ''
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 64 batch_size: 64
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -37,7 +37,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: conformer encoder: conformer

@ -0,0 +1,29 @@
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false

@ -0,0 +1,112 @@
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.5
max_input_len: 20.0 # second
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1e-6
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \ --spectrum_type="fbank" \
--feat_dim=80 \ --feat_dim=80 \
--delta_delta=false \ --delta_delta=false \
--stride_ms=10.0 \ --stride_ms=10 \
--window_ms=25.0 \ --window_ms=25 \
--sample_rate=16000 \ --sample_rate=16000 \
--use_dB_normalization=False \ --use_dB_normalization=False \
--num_samples=-1 \ --num_samples=-1 \
@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do for dataset in train dev test; do
{ {
python3 ${MAIN_ROOT}/utils/format_data.py \ python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \ --cmvn_path "data/mean_std.json" \
--unit_type "char" \ --unit_type "char" \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \

@ -23,8 +23,6 @@ fi
# exit 1 # exit 1
#fi #fi
for type in attention_rescoring; do for type in attention_rescoring; do
echo "decoding ${type}" echo "decoding ${type}"
batch_size=1 batch_size=1

@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt vocab_filepath: data/vocab.txt
unit_type: 'char' unit_type: 'char'
spm_model_prefix: '' spm_model_prefix: ''
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 32 batch_size: 32
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: conformer encoder: conformer

@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt vocab_filepath: data/vocab.txt
unit_type: 'char' unit_type: 'char'
spm_model_prefix: '' spm_model_prefix: ''
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 32 batch_size: 32
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -37,7 +37,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: conformer encoder: conformer

@ -0,0 +1,29 @@
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false

@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \ --spectrum_type="fbank" \
--feat_dim=80 \ --feat_dim=80 \
--delta_delta=false \ --delta_delta=false \
--stride_ms=10.0 \ --stride_ms=10 \
--window_ms=25.0 \ --window_ms=25 \
--sample_rate=8000 \ --sample_rate=8000 \
--use_dB_normalization=False \ --use_dB_normalization=False \
--num_samples=-1 \ --num_samples=-1 \
@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do for dataset in train dev test; do
{ {
python3 ${MAIN_ROOT}/utils/format_data.py \ python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \ --cmvn_path "data/mean_std.json" \
--unit_type "char" \ --unit_type "char" \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \

@ -22,6 +22,7 @@ import argparse
import codecs import codecs
import json import json
import os import os
from pathlib import Path
import soundfile import soundfile
@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_path = os.path.abspath(os.path.join(subfolder, fname)) audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4] audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path) audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate) duration = float(len(audio_data) / samplerate)
@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps( json.dumps(
{ {
'utt': audio_id, 'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path, 'feat': audio_path,
'feat_shape': (duration, ), # second 'feat_shape': (duration, ), # second
'text': text, 'text': text,

@ -22,6 +22,7 @@ import argparse
import codecs import codecs
import json import json
import os import os
from pathlib import Path
import soundfile import soundfile
@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix):
# if no transcription for audio then skipped # if no transcription for audio then skipped
if audio_id not in transcript_dict: if audio_id not in transcript_dict:
continue continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path) audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate) duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id] text = transcript_dict[audio_id]
@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps( json.dumps(
{ {
'utt': audio_id, 'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path, 'feat': audio_path,
'feat_shape': (duration, ), # second 'feat_shape': (duration, ), # second
'text': text 'text': text

@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
print("Creating manifest %s ..." % manifest_path) print("Creating manifest %s ..." % manifest_path)
json_lines = [] json_lines = []
total_sec = 0.0 total_sec = 0.0
total_text = 0.0 total_char = 0.0
total_num = 0 total_num = 0
for subfolder, _, filelist in sorted(os.walk(data_dir)): for subfolder, _, filelist in sorted(os.walk(data_dir)):
@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path):
text_filepath = os.path.join(subfolder, text_filelist[0]) text_filepath = os.path.join(subfolder, text_filelist[0])
for line in io.open(text_filepath, encoding="utf8"): for line in io.open(text_filepath, encoding="utf8"):
segments = line.strip().split() segments = line.strip().split()
nchars = len(segments[1:])
text = ' '.join(segments[1:]).lower() text = ' '.join(segments[1:]).lower()
audio_filepath = os.path.abspath( audio_filepath = os.path.abspath(
os.path.join(subfolder, segments[0] + '.flac')) os.path.join(subfolder, segments[0] + '.flac'))
audio_data, samplerate = soundfile.read(audio_filepath) audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(audio_filepath))[0]
utt2spk = '-'.join(utt.split('-')[:2])
json_lines.append( json_lines.append(
json.dumps({ json.dumps({
'utt': 'utt': utt,
os.path.splitext(os.path.basename(audio_filepath))[0], 'utt2spk': utt2spk,
'feat': 'feat': audio_filepath,
audio_filepath, 'feat_shape': (duration, ), # second
'feat_shape': (duration, ), #second 'text': text,
'text':
text
})) }))
total_sec += duration total_sec += duration
total_text += len(text) total_char += nchars
total_num += 1 total_num += 1
with codecs.open(manifest_path, 'w', 'utf-8') as out_file: with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
print(f"{subset}:", file=f) print(f"{subset}:", file=f)
print(f"{total_num} utts", file=f) print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f) print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f) print(f"{total_char} char", file=f)
print(f"{total_text / total_sec} text/sec", file=f) print(f"{total_char / total_sec} char/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f) print(f"{total_sec / total_num} sec/utt", file=f)

@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path):
audio_filepath = os.path.join(subfolder, segments[0] + '.flac') audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
audio_data, samplerate = soundfile.read(audio_filepath) audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(audio_filepath))[0]
utt2spk = '-'.join(utt.split('-')[:2])
json_lines.append( json_lines.append(
json.dumps({ json.dumps({
'utt': 'utt': utt,
os.path.splitext(os.path.basename(audio_filepath))[0], 'utt2spk': utt2spk,
'feat': 'feat': audio_filepath,
audio_filepath,
'feat_shape': (duration, ), #second 'feat_shape': (duration, ), #second
'text': 'text': text,
text
})) }))
total_sec += duration total_sec += duration

@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix):
continue continue
audio_data, samplerate = soundfile.read(audio_path) audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate) duration = float(len(audio_data) / samplerate)
translation_str = " ".join(translation.split())
trancription_str = " ".join(trancription.split())
json_lines.append( json_lines.append(
json.dumps( json.dumps(
{ {
'utt': utt, 'utt': utt,
'feat': audio_path, 'feat': audio_path,
'feat_shape': (duration, ), # second 'feat_shape': (duration, ), # second
'text': " ".join(translation.split()), 'text': [translation_str, trancription_str],
'text1': " ".join(trancription.split())
}, },
ensure_ascii=False)) ensure_ascii=False))

@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix):
assert os.path.exists(audio_path) and os.path.exists(text_path) assert os.path.exists(audio_path) and os.path.exists(text_path)
audio_id = os.path.basename(audio_path)[:-4] audio_id = os.path.basename(audio_path)[:-4]
spk = audio_id.split('_')[0]
word_text, syllable_text, phone_text = read_trn(text_path) word_text, syllable_text, phone_text = read_trn(text_path)
audio_data, samplerate = soundfile.read(audio_path) audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate) duration = float(len(audio_data) / samplerate)
@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps( json.dumps(
{ {
'utt': audio_id, 'utt': audio_id,
'utt2spk', spk,
'feat': audio_path, 'feat': audio_path,
'feat_shape': (duration, ), # second 'feat_shape': (duration, ), # second
'text': word_text, # character 'text': word_text, # character

@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps( json.dumps(
{ {
'utt': utt_id, 'utt': utt_id,
'utt2spk': spk,
'utt2gender': gender,
'feat': str(audio_path), 'feat': str(audio_path),
'feat_shape': (duration, ), # second 'feat_shape': (duration, ), # second
'text': word_text, # word 'text': word_text, # word
'phone': phone_text, 'phone': phone_text,
'spk': spk,
'gender': gender,
}, },
ensure_ascii=False)) ensure_ascii=False))

@ -24,6 +24,7 @@ import json
import os import os
import soundfile import soundfile
from pathlib import Path
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( parser.add_argument(
@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_data, samplerate = soundfile.read(audio_path) audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate) duration = float(len(audio_data) / samplerate)
text = phn_dict[audio_id] text = phn_dict[audio_id]
gender_spk = str(Path(audio_path).parent.stem)
spk = gender_spk[1:]
gender = gender_spk[0]
utt_id = '_'.join([spk, gender, audio_id])
json_lines.append( json_lines.append(
json.dumps( json.dumps(
{ {
'utt': audio_id, 'utt': audio_id,
'utt2spk': spk,
'utt2gender': gender,
'feat': audio_path, 'feat': audio_path,
'feat_shape': (duration, ), # second 'feat_shape': (duration, ), # second
'text': text 'text': text

@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path):
audio_data, samplerate = soundfile.read(u) audio_data, samplerate = soundfile.read(u)
duration = float(len(audio_data)) / samplerate duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(u))[0]
json_lines.append( json_lines.append(
json.dumps({ json.dumps({
'utt': os.path.splitext(os.path.basename(u))[0], 'utt': utt,
'utt2spk': speaker,
'feat': u, 'feat': u,
'feat_shape': (duration, ), #second 'feat_shape': (duration, ), #second
'text': trans.lower() 'text': trans.lower()

@ -1,8 +1,9 @@
# ASR # ASR
* s0 is for deepspeech2 offline * asr0 - deepspeech2 Streaming/Non-Streaming
* s1 is for transformer/conformer/U2 * asr1 - transformer/conformer Streaming/Non-Streaming
* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi * asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
## Data ## Data
| Data Subset | Duration in Seconds | | Data Subset | Duration in Seconds |

@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \ --spectrum_type="linear" \
--delta_delta=false \ --delta_delta=false \
--sample_rate=16000 \ --sample_rate=16000 \
--stride_ms=10.0 \ --stride_ms=10 \
--window_ms=20.0 \ --window_ms=20 \
--use_dB_normalization=True \ --use_dB_normalization=True \
--num_workers=${num_workers} \ --num_workers=${num_workers} \
--output_path="data/mean_std.json" --output_path="data/mean_std.json"
@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do for set in train dev test dev-clean dev-other test-clean test-other; do
{ {
python3 ${MAIN_ROOT}/utils/format_data.py \ python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \ --cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \ --unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \

@ -21,7 +21,7 @@
## Transformer ## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 |

@ -15,7 +15,7 @@ collator:
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000' spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 16 batch_size: 16
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: conformer encoder: conformer

@ -15,7 +15,7 @@ collator:
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000' spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 64 batch_size: 64
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: transformer encoder: transformer

@ -15,7 +15,7 @@ collator:
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000' spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 16 batch_size: 16
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: conformer encoder: conformer

@ -0,0 +1,25 @@
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false

@ -15,7 +15,7 @@ collator:
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000' spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 32 batch_size: 32
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: transformer encoder: transformer

@ -8,6 +8,11 @@ nbpe=5000
bpemode=unigram bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}" bpeprefix="data/bpe_${bpemode}_${nbpe}"
stride_ms=10
window_ms=25
sample_rate=16000
feat_dim=80
source ${MAIN_ROOT}/utils/parse_options.sh source ${MAIN_ROOT}/utils/parse_options.sh
@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
exit 1 exit 1
fi fi
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${set} data/manifest.${set}.raw mv data/manifest.${sub} data/manifest.${sub}.raw
done done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for set in train-clean-100 train-clean-360 train-other-500; do for sub in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw cat data/manifest.${sub}.raw >> data/manifest.train.raw
done done
for set in dev-clean dev-other; do for sub in dev-clean dev-other; do
cat data/manifest.${set}.raw >> data/manifest.dev.raw cat data/manifest.${sub}.raw >> data/manifest.dev.raw
done done
for set in test-clean test-other; do for sub in test-clean test-other; do
cat data/manifest.${set}.raw >> data/manifest.test.raw cat data/manifest.${sub}.raw >> data/manifest.test.raw
done done
fi fi
@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \ --manifest_path="data/manifest.train.raw" \
--num_samples=-1 \ --num_samples=-1 \
--spectrum_type="fbank" \ --spectrum_type="fbank" \
--feat_dim=80 \ --feat_dim=${feat_dim} \
--delta_delta=false \ --delta_delta=false \
--sample_rate=16000 \ --sample_rate=${sample_rate} \
--stride_ms=10.0 \ --stride_ms=${stride_ms} \
--window_ms=25.0 \ --window_ms=${window_ms} \
--use_dB_normalization=False \ --use_dB_normalization=False \
--num_workers=${num_workers} \ --num_workers=${num_workers} \
--output_path="data/mean_std.json" --output_path="data/mean_std.json"
@ -85,16 +90,15 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size # format manifest with tokenids, vocab size
for set in train dev test dev-clean dev-other test-clean test-other; do for sub in train dev test dev-clean dev-other test-clean test-other; do
{ {
python3 ${MAIN_ROOT}/utils/format_data.py \ python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \ --cmvn_path "data/mean_std.json" \
--unit_type "spm" \ --unit_type "spm" \
--spm_model_prefix ${bpeprefix} \ --spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \ --manifest_path="data/manifest.${sub}.raw" \
--output_path="data/manifest.${set}" --output_path="data/manifest.${sub}"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated." echo "Formt mnaifest failed. Terminated."
@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
}& }&
done done
wait wait
for sub in train dev; do
mv data/manifest.${sub} data/manifest.${sub}.fmt
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
for sub in train dev; do
remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
done
fi fi
echo "LibriSpeech Data preparation done." echo "LibriSpeech Data preparation done."

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save