filter example; cmvn stride and window int; libri/s1 conf

pull/1012/head
Hui Zhang 3 years ago
parent 18d9abc7a0
commit 44743622d4

@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \

@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \

@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=8000 \
--use_dB_normalization=False \
--num_samples=-1 \

@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
print("Creating manifest %s ..." % manifest_path)
json_lines = []
total_sec = 0.0
total_text = 0.0
total_char = 0.0
total_num = 0
for subfolder, _, filelist in sorted(os.walk(data_dir)):
@ -89,7 +89,7 @@ def create_manifest(data_dir, manifest_path):
text_filepath = os.path.join(subfolder, text_filelist[0])
for line in io.open(text_filepath, encoding="utf8"):
segments = line.strip().split()
n_token = len(segments[1:])
nchars = len(segments[1:])
text = ' '.join(segments[1:]).lower()
audio_filepath = os.path.abspath(
@ -110,7 +110,7 @@ def create_manifest(data_dir, manifest_path):
}))
total_sec += duration
total_text += n_token
total_char += nchars
total_num += 1
with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
@ -125,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
print(f"{subset}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_char} char", file=f)
print(f"{total_char / total_sec} char/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)

@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--use_dB_normalization=True \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"

@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer

@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer

@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer

@ -0,0 +1,29 @@
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false

@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer

@ -8,6 +8,11 @@ nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
stride_ms=10
window_ms=25
sample_rate=16000
feat_dim=80
source ${MAIN_ROOT}/utils/parse_options.sh
@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
exit 1
fi
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${set} data/manifest.${set}.raw
for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${sub} data/manifest.${sub}.raw
done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for set in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw
for sub in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${sub}.raw >> data/manifest.train.raw
done
for set in dev-clean dev-other; do
cat data/manifest.${set}.raw >> data/manifest.dev.raw
for sub in dev-clean dev-other; do
cat data/manifest.${sub}.raw >> data/manifest.dev.raw
done
for set in test-clean test-other; do
cat data/manifest.${set}.raw >> data/manifest.test.raw
for sub in test-clean test-other; do
cat data/manifest.${sub}.raw >> data/manifest.test.raw
done
fi
@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--spectrum_type="fbank" \
--feat_dim=80 \
--feat_dim=${feat_dim} \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--sample_rate=${sample_rate} \
--stride_ms=${stride_ms} \
--window_ms=${window_ms} \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
@ -85,15 +90,15 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test dev-clean dev-other test-clean test-other; do
for sub in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
--manifest_path="data/manifest.${sub}.raw" \
--output_path="data/manifest.${sub}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
@ -102,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
}&
done
wait
for sub in train dev; do
mv data/manifest.${sub} data/manifest.${sub}.fmt
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
for sub in train dev; do
remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
done
fi
echo "LibriSpeech Data preparation done."

@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"

@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"

@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--use_dB_normalization=False \
--num_workers=2 \
--output_path="data/mean_std.json"

@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--use_dB_normalization=False \
--num_workers=2 \
--output_path="data/mean_std.json"

@ -33,8 +33,8 @@ add_arg('spectrum_type', str,
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', float, 10.0, "stride length in ms.")
add_arg('window_ms', float, 20.0, "stride length in ms.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "stride length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
@ -61,8 +61,8 @@ def main():
spectrum_type=args.spectrum_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta,
stride_ms=args.stride_ms,
window_ms=args.window_ms,
stride_ms=float(args.stride_ms),
window_ms=float(args.window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=args.sample_rate,

@ -122,7 +122,7 @@ def main():
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"Examples number: {count}")
print(f"{args.manifest_paths} Examples number: {count}")
fout.close()

@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""remove longshort data from manifest"""
import logging
import argparse
import jsonlines
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
# Manifest after formatting.
# Each jsonlines record looks like this:
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
def get_parser():
    """Build the command-line parser for the manifest length filter.

    Returns:
        argparse.ArgumentParser: parser with the integer options
        ``--verbose/-V``, ``--iaxis``, ``--oaxis``, ``--maxframes``,
        ``--minframes``, ``--maxchars``, ``--minchars`` and ``--stride_ms``,
        followed by two positional string arguments: ``rspecifier`` (the
        manifest to read) and ``wspecifier_or_wxfilename`` (the manifest to
        write).
    """
    cli = argparse.ArgumentParser(
        description="remove longshort data from format manifest",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # (flags, default, help) for every integer option, in registration order
    # so the generated help text keeps the same layout.
    int_options = (
        (("--verbose", "-V"), 0, "Verbose option"),
        (("--iaxis", ), 0, "multi inputs index, 0 is the first"),
        (("--oaxis", ), 0, "multi outputs index, 0 is the first"),
        (("--maxframes", ), 2000, "maxframes"),
        (("--minframes", ), 10, "minframes"),
        (("--maxchars", ), 200, "max tokens"),
        (("--minchars", ), 0, "min tokens"),
        (("--stride_ms", ), 10, "stride in ms unit."),
    )
    for flags, default_value, help_text in int_options:
        cli.add_argument(
            *flags, default=default_value, type=int, help=help_text)

    # Positional arguments: source manifest first, destination second.
    positionals = (
        ("rspecifier", "jsonl format manifest. e.g. manifest.jsonl"),
        ("wspecifier_or_wxfilename", "Write specifier. e.g. manifest.jsonl"),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, type=str, help=help_text)

    return cli
def filter_input(args, line):
    """Tell whether a manifest record must be dropped for its input length.

    Args:
        args: Parsed options. Reads ``iaxis``, ``stride_ms``, ``minframes``,
            ``maxframes`` and the optional ``sound`` flag.
        line: One manifest record; ``line['input'][args.iaxis]['shape'][0]``
            is taken as the input length.

    Returns:
        bool: True when the length is below ``minframes`` or above
        ``maxframes`` (the record should be filtered out), else False.
    """
    length = line['input'][args.iaxis]['shape'][0]
    # `sound` is not a command-line option: the caller attaches it to `args`.
    # Fall back to False (length already in frames) when it was never set,
    # instead of raising AttributeError.
    if getattr(args, 'sound', False):
        # second to frame
        nframe = length * 1000 / args.stride_ms
    else:
        nframe = length
    return nframe < args.minframes or nframe > args.maxframes
def filter_output(args, line):
    """Tell whether a manifest record must be dropped for its text length.

    Args:
        args: Parsed options. Reads ``oaxis``, ``minchars`` and ``maxchars``.
        line: One manifest record; the length of
            ``line['output'][args.oaxis]['text']`` is checked.

    Returns:
        bool: True when the text is shorter than ``minchars`` or longer than
        ``maxchars`` (the record should be filtered out), else False.
    """
    # Select the output stream with --oaxis; the input index (--iaxis) refers
    # to line['input'] and may not even be a valid index here.
    nchars = len(line['output'][args.oaxis]['text'])
    return nchars < args.minchars or nchars > args.maxchars
def main():
    """Filter a formatted manifest by input length and text length.

    Reads every record of ``args.rspecifier``, drops those rejected by
    ``filter_input`` or ``filter_output``, and writes the remaining records
    to ``args.wspecifier_or_wxfilename``.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=level, format=logfmt)
    logging.info(get_commandline_args())

    with jsonlines.open(args.rspecifier, 'r') as reader:
        lines = list(reader)
    logging.info(f"Example: {len(lines)}")

    # `sound` tells filter_input whether shape[0] must be converted to frames.
    # It is decided from the first record; an empty manifest has nothing to
    # inspect and simply produces an empty output.
    args.sound = False
    if lines:
        feat = lines[0]['input'][args.iaxis]['feat']
        # Drop a trailing ":offset" so that "xxx.ark:123" yields "ark".
        ext = feat.split('.')[-1].split(':', 1)[0]
        # Anything that is not a Kaldi ark/scp reference is treated as audio.
        args.sound = ext not in ('ark', 'scp')

    kept = 0
    dropped = 0
    with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
        for line in lines:
            if filter_input(args, line) or filter_output(args, line):
                dropped += 1
                continue
            writer.write(line)
            kept += 1
    # Same message as before ("<kept>\<dropped>"), without the invalid
    # "\{" escape sequence.
    logging.info("Example after filter: %d\\%d", kept, dropped)


if __name__ == '__main__':
    main()
Loading…
Cancel
Save