filter example; cmvn stride and window int; libri/s1 conf

5 years ago · 44743622d4
parent 18d9abc7a0
commit 44743622d4
18 changed files with 195 additions and 49 deletions
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --manifest_path="data/manifest.train.raw" \
    --spectrum_type="linear" \
    --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
    --sample_rate=16000 \
    --use_dB_normalization=True \
    --num_samples=2000 \
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/s1/local/data.sh
@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
    --sample_rate=16000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
--- a/examples/callcenter/s1/local/data.sh
+++ b/examples/callcenter/s1/local/data.sh
@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
    --sample_rate=8000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
--- a/examples/dataset/librispeech/librispeech.py
+++ b/examples/dataset/librispeech/librispeech.py
@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    total_sec = 0.0
-    total_text = 0.0
+    total_char = 0.0
    total_num = 0

    for subfolder, _, filelist in sorted(os.walk(data_dir)):
@ -89,7 +89,7 @@ def create_manifest(data_dir, manifest_path):
            text_filepath = os.path.join(subfolder, text_filelist[0])
            for line in io.open(text_filepath, encoding="utf8"):
                segments = line.strip().split()
-                n_token = len(segments[1:])
+                nchars = len(segments[1:])
                text = ' '.join(segments[1:]).lower()

                audio_filepath = os.path.abspath(
@ -110,7 +110,7 @@ def create_manifest(data_dir, manifest_path):
                    }))

                total_sec += duration
-                total_text += n_token
+                total_char += nchars
                total_num += 1

    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
@ -125,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
        print(f"{subset}:", file=f)
        print(f"{total_num} utts", file=f)
        print(f"{total_sec / (60*60)} h", file=f)
-        print(f"{total_text} text", file=f)
-        print(f"{total_text / total_sec} text/sec", file=f)
+        print(f"{total_char} char", file=f)
+        print(f"{total_char / total_sec} char/sec", file=f)
        print(f"{total_sec / total_num} sec/utt", file=f)


--- a/examples/librispeech/s0/local/data.sh
+++ b/examples/librispeech/s0/local/data.sh
@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="linear" \
    --delta_delta=false \
    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
    --use_dB_normalization=True \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_conformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/librispeech/s1/conf/preprocess.yaml
+++ b/examples/librispeech/s1/conf/preprocess.yaml
@ -0,0 +1,29 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 80
+    n_shift: 160
+    win_length: 400
+    dither: true
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+
+
+
+
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@ -8,6 +8,11 @@ nbpe=5000
 bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"

+stride_ms=10
+window_ms=25
+sample_rate=16000
+feat_dim=80
+
 source ${MAIN_ROOT}/utils/parse_options.sh


@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
        exit 1
    fi

-    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-        mv data/manifest.${set} data/manifest.${set}.raw
+    for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
+        mv data/manifest.${sub} data/manifest.${sub}.raw
    done

    rm -rf data/manifest.train.raw data/manifest.dev.raw  data/manifest.test.raw
-    for set in train-clean-100 train-clean-360 train-other-500; do
-        cat data/manifest.${set}.raw >> data/manifest.train.raw
+    for sub in train-clean-100 train-clean-360 train-other-500; do
+        cat data/manifest.${sub}.raw >> data/manifest.train.raw
    done

-    for set in dev-clean dev-other; do
-        cat data/manifest.${set}.raw >> data/manifest.dev.raw
+    for sub in dev-clean dev-other; do
+        cat data/manifest.${sub}.raw >> data/manifest.dev.raw
    done

-    for set in test-clean test-other; do
-        cat data/manifest.${set}.raw >> data/manifest.test.raw
+    for sub in test-clean test-other; do
+        cat data/manifest.${sub}.raw >> data/manifest.test.raw
    done
 fi

@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --manifest_path="data/manifest.train.raw" \
    --num_samples=-1 \
    --spectrum_type="fbank" \
-    --feat_dim=80 \
+    --feat_dim=${feat_dim} \
    --delta_delta=false \
-    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --sample_rate=${sample_rate} \
+    --stride_ms=${stride_ms} \
+    --window_ms=${window_ms} \
    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
@ -85,15 +90,15 @@ fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
-    for set in train dev test dev-clean dev-other test-clean test-other; do
+    for sub in train dev test dev-clean dev-other test-clean test-other; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
        --cmvn_path "data/mean_std.json" \
        --unit_type "spm" \
        --spm_model_prefix ${bpeprefix} \
        --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
-        --output_path="data/manifest.${set}"
+        --manifest_path="data/manifest.${sub}.raw" \
+        --output_path="data/manifest.${sub}"

        if [ $? -ne 0 ]; then
            echo "Formt mnaifest failed. Terminated."
@ -102,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    }&
    done
    wait
+
+    for sub in train dev; do
+        mv data/manifest.${sub} data/manifest.${sub}.fmt
+    done
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    for sub in train dev; do
+        remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
+    done
 fi

 echo "LibriSpeech Data preparation done."
--- a/examples/ted_en_zh/t0/local/data.sh
+++ b/examples/ted_en_zh/t0/local/data.sh
@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --feat_dim=80 \
    --delta_delta=false \
    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
--- a/examples/timit/s1/local/data.sh
+++ b/examples/timit/s1/local/data.sh
@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --feat_dim=80 \
    --delta_delta=false \
    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="linear" \
    --delta_delta=false \
    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
    --use_dB_normalization=False \
    --num_workers=2 \
    --output_path="data/mean_std.json"
--- a/examples/tiny/s1/local/data.sh
+++ b/examples/tiny/s1/local/data.sh
@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --feat_dim=80 \
    --delta_delta=false \
    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
    --use_dB_normalization=False \
    --num_workers=2 \
    --output_path="data/mean_std.json"
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@ -33,8 +33,8 @@ add_arg('spectrum_type',    str,
        choices=['linear', 'mfcc', 'fbank'])
 add_arg('feat_dim',    int, 13, "Audio feature dim.")
 add_arg('delta_delta', bool,  False, "Audio feature with delta delta.")
-add_arg('stride_ms', float, 10.0,  "stride length in ms.")
-add_arg('window_ms', float, 20.0,  "stride length in ms.")
+# Frame shift and analysis-window size, both given in integer milliseconds.
+add_arg('stride_ms', int, 10,  "stride length in ms.")
+add_arg('window_ms', int, 20,  "window length in ms.")
 add_arg('sample_rate',  int, 16000,  "target sample rate.")
 add_arg('use_dB_normalization', bool, True, "do dB normalization.")
 add_arg('target_dB',   int, -20,  "target dB.")
@ -61,8 +61,8 @@ def main():
        spectrum_type=args.spectrum_type,
        feat_dim=args.feat_dim,
        delta_delta=args.delta_delta,
-        stride_ms=args.stride_ms,
-        window_ms=args.window_ms,
+        stride_ms=float(args.stride_ms),
+        window_ms=float(args.window_ms),
        n_fft=None,
        max_freq=None,
        target_sample_rate=args.sample_rate,
--- a/utils/format_data.py
+++ b/utils/format_data.py
@ -122,7 +122,7 @@ def main():
            fout.write(json.dumps(output_json) + '\n')
            count += 1

-    print(f"Examples number: {count}")
+    print(f"{args.manifest_paths} Examples number: {count}")
    fout.close()


--- a/utils/remove_longshortdata.py
+++ b/utils/remove_longshortdata.py
@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""remove longshort data from manifest"""
+import logging
+import argparse
+import jsonlines
+
+from paddlespeech.s2t.utils.cli_utils import get_commandline_args
+
+# manifest after format
+# each jsonlines record looks like this
+# {
+#   "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+#   "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
+#   "utt2spk": "111-2222",
+#   "utt": "111-2222-333"
+# }
+
+
+def get_parser():
+    """Create the argument parser of the manifest length filter.
+
+    Returns:
+        argparse.ArgumentParser: parser with the integer options
+        (--verbose/-V, --iaxis, --oaxis, --maxframes, --minframes,
+        --maxchars, --minchars, --stride_ms) followed by the two positional
+        arguments ``rspecifier`` and ``wspecifier_or_wxfilename``.
+    """
+    parser = argparse.ArgumentParser(
+        description="remove longshort data from format manifest",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
+
+    # (flag, default, help) for every remaining integer option, in the order
+    # they are registered (and therefore shown by --help).
+    int_options = (
+        ("--iaxis", 0, "multi inputs index, 0 is the first"),
+        ("--oaxis", 0, "multi outputs index, 0 is the first"),
+        ("--maxframes", 2000, "maxframes"),
+        ("--minframes", 10, "minframes"),
+        ("--maxchars", 200, "max tokens"),
+        ("--minchars", 0, "min tokens"),
+        ("--stride_ms", 10, "stride in ms unit."),
+    )
+    for flag, default_value, help_text in int_options:
+        parser.add_argument(
+            flag, default=default_value, type=int, help=help_text)
+
+    # (name, help) for the positional input and output manifests.
+    positionals = (
+        ("rspecifier", "jsonl format manifest. e.g. manifest.jsonl"),
+        ("wspecifier_or_wxfilename", "Write specifier. e.g. manifest.jsonl"),
+    )
+    for dest, help_text in positionals:
+        parser.add_argument(dest, type=str, help=help_text)
+    return parser
+
+
+def filter_input(args, line):
+    """Tell whether a manifest record's input length is out of range.
+
+    Args:
+        args: parsed options; reads ``iaxis``, ``minframes``, ``maxframes``,
+            ``stride_ms`` and the optional runtime flag ``sound``.
+        line: one manifest record (dict) holding an ``input`` list whose
+            entries carry a ``shape``.
+
+    Returns:
+        bool: True when the record must be dropped, False when it is kept.
+    """
+    tmp = line['input'][args.iaxis]
+    # ``sound`` is attached to args at runtime rather than declared by the
+    # parser; a missing flag means shape[0] is used as a frame count as-is
+    # instead of raising AttributeError.
+    if getattr(args, 'sound', False):
+        # second to frame
+        nframe = tmp['shape'][0] * 1000 / args.stride_ms
+    else:
+        nframe = tmp['shape'][0]
+
+    return nframe < args.minframes or nframe > args.maxframes
+
+
+def filter_output(args, line):
+    """Tell whether a manifest record's target text length is out of range.
+
+    Args:
+        args: parsed options; reads ``oaxis``, ``minchars`` and ``maxchars``.
+        line: one manifest record (dict) holding an ``output`` list whose
+            entries carry a ``text`` string.
+
+    Returns:
+        bool: True when the record must be dropped, False when it is kept.
+    """
+    # The output list is selected with oaxis; iaxis only addresses inputs.
+    nchars = len(line['output'][args.oaxis]['text'])
+    return nchars < args.minchars or nchars > args.maxchars
+    
+
+def main():
+    """Filter a formatted jsonlines manifest by input and text length.
+
+    Reads every record from ``args.rspecifier``, drops those rejected by
+    ``filter_input`` or ``filter_output`` and writes the remaining records to
+    ``args.wspecifier_or_wxfilename``.
+    """
+    args = get_parser().parse_args()
+
+    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+    level = logging.INFO if args.verbose > 0 else logging.WARN
+    logging.basicConfig(level=level, format=logfmt)
+    logging.info(get_commandline_args())
+
+    with jsonlines.open(args.rspecifier, 'r') as reader:
+        lines = list(reader)
+    logging.info(f"Example: {len(lines)}")
+
+    # Features stored as Kaldi "xxx.ark:123" / "xxx.scp" are not sound files;
+    # for anything else filter_input converts shape[0] from seconds to frames.
+    # An empty manifest has no first record to inspect and keeps the default.
+    args.sound = False
+    if lines:
+        feat = lines[0]['input'][args.iaxis]['feat']
+        # Drop a trailing ":offset" before comparing the file extension.
+        ext = feat.split('.')[-1].split(':')[0]
+        args.sound = ext not in ('ark', 'scp')
+
+    count = 0
+    removed = 0
+    with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
+        for line in lines:
+            if filter_input(args, line) or filter_output(args, line):
+                removed += 1
+                continue
+            writer.write(line)
+            count += 1
+    # The escaped backslash keeps the "<kept>\<removed>" output while avoiding
+    # the invalid "\{" escape sequence.
+    logging.info(f"Example after filter: {count}\\{removed}")
+
+if __name__ == '__main__':
+    main()