From 69055698a2f2af98ac147e02f25f55c51a6a5b2e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 5 Nov 2021 03:14:42 +0000 Subject: [PATCH 01/25] transformer using batch data loader --- examples/aishell/s0/local/data.sh | 1 - examples/aishell/s1/local/data.sh | 1 - examples/callcenter/s1/local/data.sh | 1 - examples/dataset/librispeech/librispeech.py | 19 +- examples/librispeech/s0/local/data.sh | 1 - examples/librispeech/s1/local/data.sh | 1 - examples/other/1xt2x/aishell/local/data.sh | 1 - examples/other/1xt2x/baidu_en8k/local/data.sh | 1 - .../other/1xt2x/librispeech/local/data.sh | 1 - examples/ted_en_zh/t0/local/data.sh | 1 - examples/timit/s1/local/data.sh | 1 - examples/tiny/s0/local/data.sh | 1 - examples/tiny/s1/conf/chunk_confermer.yaml | 2 +- examples/tiny/s1/conf/chunk_transformer.yaml | 2 +- examples/tiny/s1/conf/conformer.yaml | 2 +- examples/tiny/s1/conf/preprocess.yaml | 27 +++ examples/tiny/s1/conf/transformer.yaml | 2 +- examples/tiny/s1/local/data.sh | 1 - paddlespeech/s2t/exps/u2/model.py | 195 ++++++++++-------- paddlespeech/s2t/exps/u2_kaldi/model.py | 5 +- paddlespeech/s2t/io/dataset.py | 38 +--- paddlespeech/s2t/io/reader.py | 19 +- paddlespeech/s2t/io/utility.py | 19 +- paddlespeech/s2t/transform/spectrogram.py | 83 ++++++++ paddlespeech/s2t/transform/transformation.py | 1 + utils/format_data.py | 68 ++++-- utils/format_triplet_data.py | 6 +- 27 files changed, 328 insertions(+), 172 deletions(-) create mode 100644 examples/tiny/s1/conf/preprocess.yaml diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index f4fccbe6..d0a63dca 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh index 2b9f69ae..8124d1bb 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/s1/local/data.sh @@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh index 634bb8d0..65e6e5fc 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/s1/local/data.sh @@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py index e85bbb3a..0d535e13 100644 --- a/examples/dataset/librispeech/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path): text_filepath = os.path.join(subfolder, text_filelist[0]) for line in io.open(text_filepath, encoding="utf8"): segments = line.strip().split() + n_token = len(segments[1:]) text = ' '.join(segments[1:]).lower() audio_filepath = os.path.abspath( os.path.join(subfolder, segments[0] + '.flac')) audio_data, samplerate = soundfile.read(audio_filepath) duration = 
float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) + json_lines.append( json.dumps({ - 'utt': - os.path.splitext(os.path.basename(audio_filepath))[0], - 'feat': - audio_filepath, - 'feat_shape': (duration, ), #second - 'text': - text + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, + 'feat_shape': (duration, ), # second + 'text': text, })) total_sec += duration - total_text += len(text) + total_text += n_token total_num += 1 with codecs.open(manifest_path, 'w', 'utf-8') as out_file: diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index fd2b0c01..78a4ffc4 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index 56fec846..b15ddce5 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -88,7 +88,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh index 0bf35e1f..85574260 100755 --- a/examples/other/1xt2x/aishell/local/data.sh +++ b/examples/other/1xt2x/aishell/local/data.sh @@ -50,7 +50,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh index f0bde77f..8e378ff0 100755 --- a/examples/other/1xt2x/baidu_en8k/local/data.sh +++ b/examples/other/1xt2x/baidu_en8k/local/data.sh @@ -65,7 +65,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh index 6f9bc556..7387472d 100755 --- a/examples/other/1xt2x/librispeech/local/data.sh +++ b/examples/other/1xt2x/librispeech/local/data.sh @@ -63,7 +63,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index b080a5b4..23e5a9c7 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -89,7 +89,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test; do { python3 
${MAIN_ROOT}/utils/format_triplet_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh index ad4ddde3..66be39e2 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/s1/local/data.sh @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh index 711ebee4..bcf9e6d1 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/s0/local/data.sh @@ -63,7 +63,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index c5186669..6bed27f5 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index 29c30b26..7aed1b19 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index 8487da77..2c09b3ae 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/preprocess.yaml b/examples/tiny/s1/conf/preprocess.yaml new file mode 100644 index 00000000..9de0d8c7 --- /dev/null +++ b/examples/tiny/s1/conf/preprocess.yaml @@ -0,0 +1,27 @@ +process: + # extract kaldi fbank from PCM + - type: "fbank_kaldi" + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + # these three processes are a.k.a. 
SpecAugument + - type: "time_warp" + max_time_warp: 5 + inplace: true + mode: "PIL" + - type: "freq_mask" + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: "time_mask" + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index cc9b5c51..87f9c243 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh index b25f993f..3d7f19ab 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/s1/local/data.sh @@ -69,7 +69,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 67441fae..6b23a985 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -27,7 +27,9 @@ from paddle import distributed as dist from paddle.io import DataLoader from yacs.config import CfgNode +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.io.sampler import SortagradBatchSampler from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler @@ -247,92 +249,103 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() - config.defrost() - config.collator.keep_transcription_text = False - # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest - train_dataset = ManifestDataset.from_config(config) - - config.data.manifest = config.data.dev_manifest - dev_dataset = ManifestDataset.from_config(config) - - collate_fn_train = SpeechCollator.from_config(config) - - config.collator.augmentation_config = "" - collate_fn_dev = SpeechCollator.from_config(config) - - if self.parallel: - batch_sampler = SortagradDistributedBatchSampler( - train_dataset, + if self.train: + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.data.train_manifest, + train_mode=True, + sortagrad=False, batch_size=config.collator.batch_size, - num_replicas=None, - rank=None, - shuffle=True, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - else: - batch_sampler = SortagradBatchSampler( - train_dataset, - shuffle=True, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. 
+ augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + + self.valid_loader = BatchDataLoader( + json_file=config.data.dev_manifest, + train_mode=False, + sortagrad=False, batch_size=config.collator.batch_size, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - self.train_loader = DataLoader( - train_dataset, - batch_sampler=batch_sampler, - collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) - self.valid_loader = DataLoader( - dev_dataset, - batch_size=config.collator.batch_size, - shuffle=False, - drop_last=False, - collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) - - # test dataset, return raw text - config.data.manifest = config.data.test_manifest - # filter test examples, will cause less examples, but no mismatch with training - # and can use large batch size , save training time, so filter test egs now. - config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') - - test_dataset = ManifestDataset.from_config(config) - # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" - self.test_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - # return text token id - config.collator.keep_transcription_text = False - self.align_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - logger.info("Setup train/valid/test/align Dataloader!") + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + logger.info("Setup train/valid Dataloader!") + else: + # test dataset, return raw text + self.test_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + + self.align_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. 
+ augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + logger.info("Setup test/align Dataloader!") def setup_model(self): config = self.config model_conf = config.model with UpdateConfig(model_conf): - model_conf.input_dim = self.train_loader.collate_fn.feature_size - model_conf.output_dim = self.train_loader.collate_fn.vocab_size + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size model = U2Model.from_config(model_conf) @@ -341,6 +354,11 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) + self.model = model + logger.info("Setup model!") + + if not self.train: + return train_config = config.training optim_type = train_config.optim @@ -381,10 +399,9 @@ class U2Trainer(Trainer): optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) - self.model = model self.optimizer = optimizer self.lr_scheduler = lr_scheduler - logger.info("Setup model/optimizer/lr_scheduler!") + logger.info("Setup optimizer/lr_scheduler!") class U2Tester(U2Trainer): @@ -419,14 +436,19 @@ class U2Tester(U2Trainer): def __init__(self, config, args): super().__init__(config, args) + self.text_feature = TextFeaturizer( + unit_type=self.config.collator.unit_type, + vocab_filepath=self.config.collator.vocab_filepath, + spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list - def ordid2token(self, texts, texts_len): + def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ trans = [] for text, n in zip(texts, texts_len): n = n.numpy().item() ids = text[:n] - trans.append(''.join([chr(i) for i in ids])) + trans.append(text_feature.defeaturize(ids.numpy().tolist())) return trans def compute_metrics(self, @@ -442,12 +464,11 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - text_feature = self.test_loader.collate_fn.text_feature - target_transcripts = self.ordid2token(texts, texts_len) + target_transcripts = self.id2token(texts, texts_len, self.text_feature) result_transcripts, result_tokenids = self.model.decode( audio, audio_len, - text_feature=text_feature, + text_feature=self.text_feature, decoding_method=cfg.decoding_method, lang_model_path=cfg.lang_model_path, beam_alpha=cfg.alpha, @@ -497,7 +518,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.test_loader.collate_fn.stride_ms + stride_ms = self.config.collator.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -556,8 +577,8 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align( self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, self.args.result_file) + self.config.collator.stride_ms, + self.vocab_list, self.args.result_file) def load_inferspec(self): """infer model and input spec. 
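[Reviewer note, not part of the patch] PATCH 01's central change above replaces the SpeechCollator/ManifestDataset pipeline with the espnet-style BatchDataLoader. A minimal standalone sketch of constructing it, with keyword arguments mirroring the new U2Trainer.setup_dataloader() call; the manifest path, batch size, and worker count here are hypothetical placeholders, and the exact per-batch layout is left to the loader:

```python
from paddlespeech.s2t.io.dataloader import BatchDataLoader

loader = BatchDataLoader(
    json_file="data/manifest.train",       # jsonline manifest from utils/format_data.py
    train_mode=True,                       # augmentation is switched off when False
    sortagrad=False,
    batch_size=16,
    maxlen_in=float('inf'),                # no input-length filtering here
    maxlen_out=float('inf'),               # no output-length filtering here
    minibatches=0,
    mini_batch_size=1,
    batch_count='auto',
    batch_bins=0,
    batch_frames_in=0,
    batch_frames_out=0,
    batch_frames_inout=0,
    preprocess_conf="conf/preprocess.yaml",  # fbank + cmvn + SpecAugment pipeline
    n_iter_processes=2,
    subsampling_factor=1,
    num_encs=1)

# Model I/O dims now come from the loader itself, as setup_model() does above.
print(loader.feat_dim, loader.vocab_size)
for batch in loader:                       # batch layout is defined by the loader
    pass
```

This is also why setup_model() above switches from `train_loader.collate_fn.feature_size` to `train_loader.feat_dim`: the collator no longer owns the feature/vocab metadata.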
diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index e37784aa..357a39b9 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -392,6 +392,7 @@ class U2Tester(U2Trainer): unit_type=self.config.collator.unit_type, vocab_filepath=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ @@ -529,8 +530,8 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align( self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, self.args.result_file) + self.config.collator.stride_ms, + self.vocab_list, self.args.result_file) def load_inferspec(self): """infer model and input spec. diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index 7c96cb43..7007518d 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -207,34 +207,16 @@ class AudioDataset(Dataset): if sort: data = sorted(data, key=lambda x: x["feat_shape"][0]) if raw_wav: - assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark', - '.scp') - data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms)) + path_suffix = data[0]['feat'].split(':')[0].splitext()[-1] + assert path_suffix not in ('.ark', '.scp') + # m second to n frame + data = list( + map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms), + data)) self.input_dim = data[0]['feat_shape'][1] self.output_dim = data[0]['token_shape'][1] - # with open(data_file, 'r') as f: - # for line in f: - # arr = line.strip().split('\t') - # if len(arr) != 7: - # continue - # key = arr[0].split(':')[1] - # tokenid = arr[5].split(':')[1] - # output_dim = int(arr[6].split(':')[1].split(',')[1]) - # if raw_wav: - # wav_path = ':'.join(arr[1].split(':')[1:]) - # duration = int(float(arr[2].split(':')[1]) * 1000 / 10) - # data.append((key, wav_path, duration, tokenid)) - # else: - # feat_ark = ':'.join(arr[1].split(':')[1:]) - # feat_info = arr[2].split(':')[1].split(',') - # feat_dim = int(feat_info[1].strip()) - # num_frames = int(feat_info[0].strip()) - # data.append((key, feat_ark, num_frames, tokenid)) - # self.input_dim = feat_dim - # self.output_dim = output_dim - valid_data = [] for i in range(len(data)): length = data[i]['feat_shape'][0] @@ -242,17 +224,17 @@ class AudioDataset(Dataset): # remove too lang or too short utt for both input and output # to prevent from out of memory if length > max_length or length < min_length: - # logging.warn('ignore utterance {} feature {}'.format( - # data[i][0], length)) pass elif token_length > token_max_length or token_length < token_min_length: pass else: valid_data.append(data[i]) + logger.info(f"raw dataset len: {len(data)}") data = valid_data + num_data = len(data) + logger.info(f"dataset len after filter: {num_data}") self.minibatch = [] - num_data = len(data) # Dynamic batch size if batch_type == 'dynamic': assert (max_frames_in_batch > 0) @@ -277,7 +259,9 @@ class AudioDataset(Dataset): cur = end def __len__(self): + """number of example(batch)""" return len(self.minibatch) def __getitem__(self, idx): + """batch example of idx""" return self.minibatch[idx] diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index e810662d..38ff1396 100644 --- a/paddlespeech/s2t/io/reader.py +++ 
b/paddlespeech/s2t/io/reader.py @@ -18,8 +18,10 @@ import kaldiio import numpy as np import soundfile -from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation +from .utility import feat_type +from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.log import Log +# from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation __all__ = ["LoadInputsAndTargets"] @@ -322,20 +324,7 @@ class LoadInputsAndTargets(): "Not supported: loader_type={}".format(filetype)) def file_type(self, filepath): - suffix = filepath.split(":")[0].split('.')[-1].lower() - if suffix == 'ark': - return 'mat' - elif suffix == 'scp': - return 'scp' - elif suffix == 'npy': - return 'npy' - elif suffix == 'npz': - return 'npz' - elif suffix in ['wav', 'flac']: - # PCM16 - return 'sound' - else: - raise ValueError(f"Not support filetype: {suffix}") + return feat_type(filepath) class SoundHDF5File(): diff --git a/paddlespeech/s2t/io/utility.py b/paddlespeech/s2t/io/utility.py index 392031ba..1a90e3d0 100644 --- a/paddlespeech/s2t/io/utility.py +++ b/paddlespeech/s2t/io/utility.py @@ -17,7 +17,7 @@ import numpy as np from paddlespeech.s2t.utils.log import Log -__all__ = ["pad_list", "pad_sequence"] +__all__ = ["pad_list", "pad_sequence", "feat_type"] logger = Log(__name__).getlog() @@ -85,3 +85,20 @@ def pad_sequence(sequences: List[np.ndarray], out_tensor[:length, i, ...] = tensor return out_tensor + + +def feat_type(filepath): + suffix = filepath.split(":")[0].split('.')[-1].lower() + if suffix == 'ark': + return 'mat' + elif suffix == 'scp': + return 'scp' + elif suffix == 'npy': + return 'npy' + elif suffix == 'npz': + return 'npz' + elif suffix in ['wav', 'flac']: + # PCM16 + return 'sound' + else: + raise ValueError(f"Not support filetype: {suffix}") diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index df3130da..6956b908 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -14,6 +14,7 @@ # Modified from espnet(https://github.com/espnet/espnet) import librosa import numpy as np +from python_speech_features import logfbank def stft(x, @@ -304,3 +305,85 @@ class IStft(): win_length=self.win_length, window=self.window, center=self.center, ) + + +class LogMelSpectrogramKaldi(): + def __init__( + self, + fs=16000, + n_mels=80, + n_fft=512, # fft point + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + window="povey", + fmin=20, + fmax=None, + eps=1e-10, + dither=False): + self.fs = fs + self.n_mels = n_mels + self.n_fft = n_fft + if n_shift > win_length: + raise ValueError("Stride size must not be greater than " + "window size.") + self.n_shift = n_shift / fs # unit: ms + self.win_length = win_length / fs # unit: ms + + self.window = window + self.fmin = fmin + if fmax is None: + fmax_ = fmax if fmax else self.fs / 2 + elif fmax > int(self.fs / 2): + raise ValueError("fmax must not be greater than half of " + "sample rate.") + self.fmax = fmax_ + + self.eps = eps + self.remove_dc_offset = True + self.preemph = 0.97 + self.dither = dither + + def __repr__(self): + return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "n_shift={n_shift}, win_length={win_length}, window={window}, " + "fmin={fmin}, fmax={fmax}, eps={eps}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + 
window=self.window, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, )) + + def __call__(self, x): + """ + + Args: + x (np.ndarray): shape (Ti,) + + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + if x.dtype == np.int16: + x = x / 2**(16 - 1) + return logfbank( + signal=x, + samplerate=self.fs, + winlen=self.win_length, # unit ms + winstep=self.n_shift, # unit ms + nfilt=self.n_mels, + nfft=self.n_fft, + lowfreq=self.fmin, + highfreq=self.fmax, + dither=self.dither, + remove_dc_offset=self.remove_dc_offset, + preemph=self.preemph, + wintype=self.window) diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index 1aee4b36..492d35df 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -45,6 +45,7 @@ import_alias = dict( stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram", wpe="paddlespeech.s2t.transform.wpe:WPE", channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector", + fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi", ) diff --git a/utils/format_data.py b/utils/format_data.py index 6fe36997..49dcbee8 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -20,13 +20,13 @@ import json from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import read_manifest +from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('feat_type', str, "raw", "speech feature type, e.g. 
raw(wav, flac), mat(ark), scp") add_arg('cmvn_path', str, 'examples/librispeech/data/mean_std.json', "Filepath of cmvn.") @@ -62,24 +62,64 @@ def main(): vocab_size = text_feature.vocab_size print(f"Vocab size: {vocab_size}") + # josnline like this + # { + # "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}], + # "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}], + # "utt2spk": "111-2222", + # "utt": "111-2222-333" + # } count = 0 for manifest_path in args.manifest_paths: manifest_jsons = read_manifest(manifest_path) for line_json in manifest_jsons: + output_json = { + "input": [], + "output": [], + 'utt': line_json['utt'], + 'utt2spk': line_json.get('utt2spk', 'global'), + } + + # output line = line_json['text'] - tokens = text_feature.tokenize(line) - tokenids = text_feature.featurize(line) - line_json['token'] = tokens - line_json['token_id'] = tokenids - line_json['token_shape'] = (len(tokenids), vocab_size) - feat_shape = line_json['feat_shape'] - assert isinstance(feat_shape, (list, tuple)), type(feat_shape) - if args.feat_type == 'raw': - feat_shape.append(feat_dim) - line_json['filetype'] = 'sound' - else: # kaldi - raise NotImplementedError('no support kaldi feat now!') - fout.write(json.dumps(line_json) + '\n') + if isinstance(line, str): + # only one target + tokens = text_feature.tokenize(line) + tokenids = text_feature.featurize(line) + output_json['output'].append({ + 'name': 'traget1', + 'shape': (len(tokenids), vocab_size), + 'text': line, + 'token': ' '.join(tokens), + 'tokenid': ' '.join(map(str, tokenids)), + }) + else: + # isinstance(line, list), multi target + raise NotImplementedError("not support multi output now!") + + # input + line = line_json['feat'] + if isinstance(line, str): + # only one input + feat_shape = line_json['feat_shape'] + assert isinstance(feat_shape, (list, tuple)), type(feat_shape) + filetype = feat_type(line) + if filetype == 'sound': + feat_shape.append(feat_dim) + else: # kaldi + raise NotImplementedError('no support kaldi feat now!') + + output_json['input'].append({ + "name": "input1", + "shape": feat_shape, + "feat": line, + "filetype": filetype, + }) + else: + # isinstance(line, list), multi input + raise NotImplementedError("not support multi input now!") + + fout.write(json.dumps(output_json) + '\n') count += 1 print(f"Examples number: {count}") diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py index 79b3d2cb..e0b5ece3 100755 --- a/utils/format_triplet_data.py +++ b/utils/format_triplet_data.py @@ -20,13 +20,13 @@ import json from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import read_manifest +from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('feat_type', str, "raw", "speech feature type, e.g. 
raw(wav, flac), kaldi") add_arg('cmvn_path', str, 'examples/librispeech/data/mean_std.json', "Filepath of cmvn.") @@ -79,9 +79,11 @@ def main(): line_json['token1'] = tokens line_json['token_id1'] = tokenids line_json['token_shape1'] = (len(tokenids), vocab_size) + feat_shape = line_json['feat_shape'] assert isinstance(feat_shape, (list, tuple)), type(feat_shape) - if args.feat_type == 'raw': + filetype = feat_type(line_json['feat']) + if filetype == 'sound': feat_shape.append(feat_dim) else: # kaldi raise NotImplementedError('no support kaldi feat now!') From 69bccb4f0277aa7cf28248ab9f21121d4fe0e2ec Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 5 Nov 2021 04:45:37 +0000 Subject: [PATCH 02/25] fix ctc align --- paddlespeech/s2t/exps/u2/model.py | 2 +- paddlespeech/s2t/exps/u2_kaldi/model.py | 2 +- paddlespeech/s2t/exps/u2_st/model.py | 6 +++--- paddlespeech/s2t/utils/ctc_utils.py | 15 +++++++-------- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 6b23a985..2f0e752f 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -575,7 +575,7 @@ class U2Tester(U2Trainer): @paddle.no_grad() def align(self): - ctc_utils.ctc_align( + ctc_utils.ctc_align(self.config, self.model, self.align_loader, self.config.decoding.batch_size, self.config.collator.stride_ms, self.vocab_list, self.args.result_file) diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 357a39b9..6c4365b8 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -528,7 +528,7 @@ class U2Tester(U2Trainer): @paddle.no_grad() def align(self): - ctc_utils.ctc_align( + ctc_utils.ctc_align(self.config, self.model, self.align_loader, self.config.decoding.batch_size, self.config.collator.stride_ms, self.vocab_list, self.args.result_file) diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index f458216e..9141b361 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -543,10 +543,10 @@ class U2STTester(U2STTrainer): @paddle.no_grad() def align(self): - ctc_utils.ctc_align( + ctc_utils.ctc_align(self.config, self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, self.args.result_file) + self.config.collator.stride_ms, + self.vocab_list, self.args.result_file) def load_inferspec(self): """infer model and input spec. diff --git a/paddlespeech/s2t/utils/ctc_utils.py b/paddlespeech/s2t/utils/ctc_utils.py index e005e5d2..f5822e5d 100644 --- a/paddlespeech/s2t/utils/ctc_utils.py +++ b/paddlespeech/s2t/utils/ctc_utils.py @@ -13,7 +13,7 @@ # limitations under the License. # Modified from wenet(https://github.com/wenet-e2e/wenet) from typing import List - +from pathlib import Path import numpy as np import paddle @@ -139,26 +139,27 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, return output_alignment -def ctc_align(model, dataloader, batch_size, stride_ms, token_dict, +def ctc_align(config, model, dataloader, batch_size, stride_ms, token_dict, result_file): """ctc alignment. Args: + config (cfgNode): config model (nn.Layer): U2 Model. dataloader (io.DataLoader): dataloader. batch_size (int): decoding batchsize. stride_ms (int): audio feature stride in ms unit. token_dict (List[str]): vocab list, e.g. ['blank', 'unk', 'a', 'b', '']. 
- result_file (str): alignment output file, e.g. xxx.align. + result_file (str): alignment output file, e.g. /path/to/xxx.align. """ if batch_size > 1: logger.fatal('alignment mode must be running with batch_size == 1') sys.exit(1) - assert result_file and result_file.endswith('.align') model.eval() - + # conv subsampling rate + subsample = utility.get_subsample(config) logger.info(f"Align Total Examples: {len(dataloader.dataset)}") with open(result_file, 'w') as fout: @@ -187,13 +188,11 @@ def ctc_align(model, dataloader, batch_size, stride_ms, token_dict, logger.info(f"align tokens: {key[0]}, {align_segs}") # IntervalTier, List["start end token\n"] - subsample = utility.get_subsample(self.config) - tierformat = text_grid.align_to_tierformat(align_segs, subsample, token_dict) # write tier - align_output_path = Path(self.args.result_file).parent / "align" + align_output_path = Path(result_file).parent / "align" align_output_path.mkdir(parents=True, exist_ok=True) tier_path = align_output_path / (key[0] + ".tier") with tier_path.open('w') as f: From 9cdd2643b11b6350cf27d0ada12614aec369b575 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 5 Nov 2021 06:07:26 +0000 Subject: [PATCH 03/25] fix bug for batch dataloader using --- paddlespeech/s2t/exps/u2/model.py | 2 +- paddlespeech/s2t/models/u2/u2.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 2f0e752f..8dad5074 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -591,7 +591,7 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.model.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.collate_fn.feature_size + feat_dim = self.test_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index fd998271..916a6a05 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -934,8 +934,8 @@ class U2Model(U2DecodeModel): DeepSpeech2Model: The model built from pretrained result. """ with UpdateConfig(config): - config.input_dim = dataloader.collate_fn.feature_size - config.output_dim = dataloader.collate_fn.vocab_size + config.input_dim = dataloader.feat_dim + config.output_dim = dataloader.vocab_size model = cls.from_config(config) From 6a7e0265cdd7b2e15480e5f9a25045a12ac1cb62 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 5 Nov 2021 06:07:45 +0000 Subject: [PATCH 04/25] add josn global cmvn --- examples/tiny/s1/conf/preprocess.yaml | 12 ++++--- examples/tiny/s1/conf/transformer.yaml | 4 +-- paddlespeech/s2t/transform/cmvn.py | 37 ++++++++++++++++++-- paddlespeech/s2t/transform/transformation.py | 1 + 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/examples/tiny/s1/conf/preprocess.yaml b/examples/tiny/s1/conf/preprocess.yaml index 9de0d8c7..dd4cfd27 100644 --- a/examples/tiny/s1/conf/preprocess.yaml +++ b/examples/tiny/s1/conf/preprocess.yaml @@ -1,22 +1,24 @@ process: # extract kaldi fbank from PCM - - type: "fbank_kaldi" + - type: fbank_kaldi fs: 16000 n_mels: 80 n_shift: 160 win_length: 400 dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json # these three processes are a.k.a. 
SpecAugument - - type: "time_warp" + - type: time_warp max_time_warp: 5 inplace: true - mode: "PIL" - - type: "freq_mask" + mode: PIL + - type: freq_mask F: 30 n_mask: 2 inplace: true replace_with_zero: false - - type: "time_mask" + - type: time_mask T: 40 n_mask: 2 inplace: true diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 87f9c243..1378e848 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -11,7 +11,7 @@ data: max_output_input_ratio: 10.0 collator: - mean_std_filepath: "" + mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/paddlespeech/s2t/transform/cmvn.py b/paddlespeech/s2t/transform/cmvn.py index 4d2d2324..dc9ea87e 100644 --- a/paddlespeech/s2t/transform/cmvn.py +++ b/paddlespeech/s2t/transform/cmvn.py @@ -13,12 +13,11 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) import io - +import json import h5py import kaldiio import numpy as np - class CMVN(): "Apply Global/Spk CMVN/iverserCMVN." @@ -157,3 +156,37 @@ class UtteranceCMVN(): x = np.divide(x, std) return x + + + +class GlobalCMVN(): + "Apply Global CMVN" + + def __init__(self, cmvn_path, norm_means=True, norm_vars=True, std_floor=1.0e-20): + self.cmvn_path = cmvn_path + self.norm_means = norm_means + self.norm_vars = norm_vars + self.std_floor = std_floor + + with open(cmvn_path) as f: + cmvn_stats = json.load(f) + self.count = cmvn_stats['frame_num'] + self.mean = np.array(cmvn_stats['mean_stat']) / self.count + self.square_sums = np.array(cmvn_stats['var_stat']) + self.var = self.square_sums / self.count - self.mean**2 + self.std = np.maximum(np.sqrt(self.var), self.std_floor) + + def __repr__(self): + return f"""{self.__class__.__name__}( + cmvn_path={self.cmvn_path}, + norm_means={self.norm_means}, + norm_vars={self.norm_vars},)""" + + def __call__(self, x, uttid=None): + # x: [Time, Dim] + if self.norm_means: + x = np.subtract(x, self.mean) + + if self.norm_vars: + x = np.divide(x, self.std) + return x \ No newline at end of file diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index 492d35df..bfe6c53d 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -46,6 +46,7 @@ import_alias = dict( wpe="paddlespeech.s2t.transform.wpe:WPE", channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector", fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi", + cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN" ) From 18d9abc7a031a4c2a6fe21f9540e6a0eddd27749 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 5 Nov 2021 07:55:29 +0000 Subject: [PATCH 05/25] add sox speed pertrub --- paddlespeech/s2t/transform/perturb.py | 106 +++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 153d494b..ee4c7ce0 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -16,7 +16,7 @@ import librosa import numpy import scipy import soundfile - +import soxbindings as sox from paddlespeech.s2t.io.reader import SoundHDF5File @@ -82,7 +82,6 @@ 
class SpeedPerturbation(): def __call__(self, x, uttid=None, train=True): if not train: return x - x = x.astype(numpy.float32) if self.accept_uttid: ratio = self.utt2ratio[uttid] @@ -108,6 +107,109 @@ class SpeedPerturbation(): return y +class SpeedPerturbationSox(): + """SpeedPerturbationSox + + The speed perturbation in kaldi uses sox-speed instead of sox-tempo, + and sox-speed just to resample the input, + i.e pitch and tempo are changed both. + + To speed up or slow down the sound of a file, + use speed to modify the pitch and the duration of the file. + This raises the speed and reduces the time. + The default factor is 1.0 which makes no change to the audio. + 2.0 doubles speed, thus time length is cut by a half and pitch is one interval higher. + + "Why use speed option instead of tempo -s in SoX for speed perturbation" + https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8 + + tempo option: + sox -t wav input.wav -t wav output.tempo0.9.wav tempo -s 0.9 + + speed option: + sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9 + + If we use speed option like above, the pitch of audio also will be changed, + but the tempo option does not change the pitch. + """ + + def __init__( + self, + lower=0.9, + upper=1.1, + utt2ratio=None, + keep_length=True, + sr=16000, + seed=None, ): + self.sr = sr + self.keep_length = keep_length + self.state = numpy.random.RandomState(seed) + + if utt2ratio is not None: + self.utt2ratio = {} + # Use the scheduled ratio for each utterances + self.utt2ratio_file = utt2ratio + self.lower = None + self.upper = None + self.accept_uttid = True + + with open(utt2ratio, "r") as f: + for line in f: + utt, ratio = line.rstrip().split(None, 1) + ratio = float(ratio) + self.utt2ratio[utt] = ratio + else: + self.utt2ratio = None + # The ratio is given on runtime randomly + self.lower = lower + self.upper = upper + + def __repr__(self): + if self.utt2ratio is None: + return f"""{self.__class__.__name__}( + lower={self.lower}, + upper={self.upper}, + keep_length={self.keep_length}, + sample_rate={self.sr})""" + else: + return f"""{self.__class__.__name__}( + utt2ratio={self.utt2ratio_file}, + sample_rate={self.sr})""" + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + + x = x.astype(numpy.float32) + if self.accept_uttid: + ratio = self.utt2ratio[uttid] + else: + ratio = self.state.uniform(self.lower, self.upper) + + tfm = sox.Transformer() + tfm.set_globals(multithread=False) + tfm.speed(ratio) + y = tfm.build_array(input_array=x, sample_rate_in=self.sr) + + if self.keep_length: + diff = abs(len(x) - len(y)) + if len(y) > len(x): + # Truncate noise + y = y[diff // 2:-((diff + 1) // 2)] + elif len(y) < len(x): + # Assume the time-axis is the first: (Time, Channel) + pad_width = [(diff // 2, (diff + 1) // 2)] + [ + (0, 0) for _ in range(y.ndim - 1) + ] + y = numpy.pad( + y, pad_width=pad_width, constant_values=0, mode="constant") + + if y.ndim == 2 and x.ndim == 1: + # (T, C) -> (T) + y = y.sequence(1) + return y + + class BandpassPerturbation(): """BandpassPerturbation From 44743622d4b894ba5fe7440d35007c87b3258db1 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 5 Nov 2021 09:50:19 +0000 Subject: [PATCH 06/25] filter example; cmvn stride and window int; libri/s1 conf --- examples/aishell/s0/local/data.sh | 4 +- examples/aishell/s1/local/data.sh | 4 +- examples/callcenter/s1/local/data.sh | 4 +- examples/dataset/librispeech/librispeech.py | 10 +- examples/librispeech/s0/local/data.sh | 4 +- 
.../librispeech/s1/conf/chunk_conformer.yaml | 4 +- .../s1/conf/chunk_transformer.yaml | 4 +- examples/librispeech/s1/conf/conformer.yaml | 4 +- examples/librispeech/s1/conf/preprocess.yaml | 29 +++++ examples/librispeech/s1/conf/transformer.yaml | 4 +- examples/librispeech/s1/local/data.sh | 45 +++++--- examples/ted_en_zh/t0/local/data.sh | 4 +- examples/timit/s1/local/data.sh | 4 +- examples/tiny/s0/local/data.sh | 4 +- examples/tiny/s1/local/data.sh | 4 +- utils/compute_mean_std.py | 8 +- utils/format_data.py | 2 +- utils/remove_longshortdata.py | 102 ++++++++++++++++++ 18 files changed, 195 insertions(+), 49 deletions(-) create mode 100644 examples/librispeech/s1/conf/preprocess.yaml create mode 100755 utils/remove_longshortdata.py diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index d0a63dca..23f04f2a 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --spectrum_type="linear" \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --sample_rate=16000 \ --use_dB_normalization=True \ --num_samples=2000 \ diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh index 8124d1bb..76e28075 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/s1/local/data.sh @@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=16000 \ --use_dB_normalization=False \ --num_samples=-1 \ diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh index 65e6e5fc..c40c752a 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/s1/local/data.sh @@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=8000 \ --use_dB_normalization=False \ --num_samples=-1 \ diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py index 0d535e13..69f0db59 100644 --- a/examples/dataset/librispeech/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path): print("Creating manifest %s ..." 
% manifest_path) json_lines = [] total_sec = 0.0 - total_text = 0.0 + total_char = 0.0 total_num = 0 for subfolder, _, filelist in sorted(os.walk(data_dir)): @@ -89,7 +89,7 @@ def create_manifest(data_dir, manifest_path): text_filepath = os.path.join(subfolder, text_filelist[0]) for line in io.open(text_filepath, encoding="utf8"): segments = line.strip().split() - n_token = len(segments[1:]) + nchars = len(segments[1:]) text = ' '.join(segments[1:]).lower() audio_filepath = os.path.abspath( @@ -110,7 +110,7 @@ def create_manifest(data_dir, manifest_path): })) total_sec += duration - total_text += n_token + total_char += nchars total_num += 1 with codecs.open(manifest_path, 'w', 'utf-8') as out_file: @@ -125,8 +125,8 @@ def create_manifest(data_dir, manifest_path): print(f"{subset}:", file=f) print(f"{total_num} utts", file=f) print(f"{total_sec / (60*60)} h", file=f) - print(f"{total_text} text", file=f) - print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_char} char", file=f) + print(f"{total_char / total_sec} char/sec", file=f) print(f"{total_sec / total_num} sec/utt", file=f) diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index 78a4ffc4..0f276cec 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=True \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml index 4d0e6ceb..2bfb0fb6 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/s1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index c7b53f95..fe533777 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 3bc942dc..c844baaa 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 
raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/s1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/librispeech/s1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index 3cc17004..5a158f3e 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index b15ddce5..35f4e635 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -8,6 +8,11 @@ nbpe=5000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" +stride_ms=10 +window_ms=25 +sample_rate=16000 +feat_dim=80 + source ${MAIN_ROOT}/utils/parse_options.sh @@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - mv data/manifest.${set} data/manifest.${set}.raw + for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do + mv data/manifest.${sub} data/manifest.${sub}.raw done rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw - for set in train-clean-100 train-clean-360 train-other-500; do - cat data/manifest.${set}.raw >> data/manifest.train.raw + for sub in train-clean-100 train-clean-360 train-other-500; do + cat data/manifest.${sub}.raw >> data/manifest.train.raw done - for set in dev-clean dev-other; do - cat data/manifest.${set}.raw >> data/manifest.dev.raw + for sub in dev-clean dev-other; do + cat data/manifest.${sub}.raw >> data/manifest.dev.raw done - for set in test-clean test-other; do - cat data/manifest.${set}.raw >> data/manifest.test.raw + for sub in test-clean test-other; do + cat data/manifest.${sub}.raw >> data/manifest.test.raw done fi @@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ --spectrum_type="fbank" \ - --feat_dim=80 \ + --feat_dim=${feat_dim} \ --delta_delta=false \ - --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --sample_rate=${sample_rate} \ + 
--stride_ms=${stride_ms} \ + --window_ms=${window_ms} \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -85,15 +90,15 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size - for set in train dev test dev-clean dev-other test-clean test-other; do + for sub in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ --vocab_path="data/vocab.txt" \ - --manifest_path="data/manifest.${set}.raw" \ - --output_path="data/manifest.${set}" + --manifest_path="data/manifest.${sub}.raw" \ + --output_path="data/manifest.${sub}" if [ $? -ne 0 ]; then echo "Formt mnaifest failed. Terminated." @@ -102,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then }& done wait + + for sub in train dev; do + mv data/manifest.${sub} data/manifest.${sub}.fmt + done +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + for sub in train dev; do + remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub} + done fi echo "LibriSpeech Data preparation done." diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index 23e5a9c7..ce58f539 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh index 66be39e2..e588e48d 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/s1/local/data.sh @@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh index bcf9e6d1..f1fb8cb1 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/s0/local/data.sh @@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh index 3d7f19ab..87539d5e 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/s1/local/data.sh @@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index 296d272a..e47554dc 100755 --- a/utils/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -33,8 +33,8 @@ add_arg('spectrum_type', str, choices=['linear', 'mfcc', 'fbank']) add_arg('feat_dim', int, 13, "Audio feature dim.") 
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
-add_arg('stride_ms', float, 10.0, "stride length in ms.")
-add_arg('window_ms', float, 20.0, "stride length in ms.")
+add_arg('stride_ms', int, 10, "stride length in ms.")
+add_arg('window_ms', int, 20, "window length in ms.")
 add_arg('sample_rate', int, 16000, "target sample rate.")
 add_arg('use_dB_normalization', bool, True, "do dB normalization.")
 add_arg('target_dB', int, -20, "target dB.")
@@ -61,8 +61,8 @@ def main():
         spectrum_type=args.spectrum_type,
         feat_dim=args.feat_dim,
         delta_delta=args.delta_delta,
-        stride_ms=args.stride_ms,
-        window_ms=args.window_ms,
+        stride_ms=float(args.stride_ms),
+        window_ms=float(args.window_ms),
         n_fft=None,
         max_freq=None,
         target_sample_rate=args.sample_rate,
diff --git a/utils/format_data.py b/utils/format_data.py
index 49dcbee8..f9b5e6aa 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -122,7 +122,7 @@ def main():
             fout.write(json.dumps(output_json) + '\n')
             count += 1
 
-    print(f"Examples number: {count}")
+    print(f"{args.manifest_paths} Examples number: {count}")
     fout.close()
 
 
diff --git a/utils/remove_longshortdata.py b/utils/remove_longshortdata.py
new file mode 100755
index 00000000..dcc05b23
--- /dev/null
+++ b/utils/remove_longshortdata.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""remove long/short data from manifest"""
+import logging
+import argparse
+import jsonlines
+
+from paddlespeech.s2t.utils.cli_utils import get_commandline_args
+
+# manifest after format
+# jsonline like this
+# {
+#   "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+#   "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
+#   "utt2spk": "111-2222",
+#   "utt": "111-2222-333"
+# }
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="remove long/short data from format manifest",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--iaxis", default=0, type=int, help="multi inputs index, 0 is the first")
+    parser.add_argument(
+        "--oaxis", default=0, type=int, help="multi outputs index, 0 is the first")
+    parser.add_argument(
+        "--maxframes", default=2000, type=int, help="maxframes")
+    parser.add_argument(
+        "--minframes", default=10, type=int, help="minframes")
+    parser.add_argument(
+        "--maxchars", default=200, type=int, help="max tokens")
+    parser.add_argument(
+        "--minchars", default=0, type=int, help="min tokens")
+    parser.add_argument(
+        "--stride_ms", default=10, type=int, help="stride in ms unit.")
+    parser.add_argument(
+        "rspecifier",
+        type=str,
+        help="jsonl format manifest. e.g. manifest.jsonl")
+    parser.add_argument(
+        "wspecifier_or_wxfilename",
+        type=str,
+        help="Write specifier. e.g. manifest.jsonl")
+    return parser
+
+
+def filter_input(args, line):
+    tmp = line['input'][args.iaxis]
+    if args.sound:
+        # second to frame
+        nframe = tmp['shape'][0] * 1000 / args.stride_ms
+    else:
+        nframe = tmp['shape'][0]
+
+    if nframe < args.minframes or nframe > args.maxframes:
+        return True
+    else:
+        return False
+
+
+def filter_output(args, line):
+    nchars = len(line['output'][args.oaxis]['text'])
+    if nchars < args.minchars or nchars > args.maxchars:
+        return True
+    else:
+        return False
+
+
+def main():
+    args = get_parser().parse_args()
+
+    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+    if args.verbose > 0:
+        logging.basicConfig(level=logging.INFO, format=logfmt)
+    else:
+        logging.basicConfig(level=logging.WARN, format=logfmt)
+    logging.info(get_commandline_args())
+
+    with jsonlines.open(args.rspecifier, 'r') as reader:
+        lines = list(reader)
+        logging.info(f"Example: {len(lines)}")
+        feat = lines[0]['input'][args.iaxis]['feat']
+    args.sound = False
+    if feat.split('.')[-1] not in ('ark', 'scp'):
+        args.sound = True
+
+    count = 0
+    filter = 0
+    with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
+        for line in lines:
+            if filter_input(args, line) or filter_output(args, line):
+                filter += 1
+                continue
+            writer.write(line)
+            count += 1
+    logging.info(f"Example after filter: {count}/{filter}")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file

From 7b3a901b0826bcacc1920930f301077817b100a8 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 5 Nov 2021 10:31:35 +0000
Subject: [PATCH 07/25] more conf with preprocess.yaml

---
 examples/aishell/s1/conf/chunk_conformer.yaml |  4 +--
 examples/aishell/s1/conf/conformer.yaml       |  4 +--
 examples/aishell/s1/conf/preprocess.yaml      | 29 +++++++++++++++++++
 .../callcenter/s1/conf/chunk_conformer.yaml   |  4 +--
 examples/callcenter/s1/conf/conformer.yaml    |  4 +--
 examples/callcenter/s1/conf/preprocess.yaml   | 29 +++++++++++++++++++
 examples/timit/s1/conf/preprocess.yaml        | 29 +++++++++++++++++++
 examples/timit/s1/conf/transformer.yaml       |  4 +--
 8 files changed, 97 insertions(+), 10 deletions(-)
 create mode 100644 examples/aishell/s1/conf/preprocess.yaml
 create mode 100644 examples/callcenter/s1/conf/preprocess.yaml
 create mode 100644 examples/timit/s1/conf/preprocess.yaml

diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml
index 8682538b..336a6c46 100644
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
@@ -15,7 +15,7 @@ collator:
   vocab_filepath: data/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
   spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
 
 # network architecture
model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/aishell/s1/conf/preprocess.yaml b/examples/aishell/s1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/aishell/s1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/s1/conf/chunk_conformer.yaml index a853658a..b18b46fe 100644 --- a/examples/callcenter/s1/conf/chunk_conformer.yaml +++ b/examples/callcenter/s1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/s1/conf/conformer.yaml index bd4f4578..47c438a6 100644 --- a/examples/callcenter/s1/conf/conformer.yaml +++ b/examples/callcenter/s1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/callcenter/s1/conf/preprocess.yaml b/examples/callcenter/s1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/callcenter/s1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/timit/s1/conf/preprocess.yaml b/examples/timit/s1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/timit/s1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml index d3ced898..1d18468b 100644 --- a/examples/timit/s1/conf/transformer.yaml +++ b/examples/timit/s1/conf/transformer.yaml @@ -14,7 +14,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: "word" mean_std_filepath: "" - augmentation_config: "" + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer From fb853167d353f3b0e74e56dc1fbaa214fbbcb4fa Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 5 Nov 2021 10:35:37 +0000 Subject: [PATCH 08/25] format code --- paddlespeech/s2t/exps/u2/model.py | 8 ++--- paddlespeech/s2t/exps/u2_kaldi/model.py | 8 ++--- paddlespeech/s2t/exps/u2_st/model.py | 8 ++--- paddlespeech/s2t/transform/cmvn.py | 11 ++++-- paddlespeech/s2t/transform/perturb.py | 2 ++ paddlespeech/s2t/transform/transformation.py | 3 +- paddlespeech/s2t/utils/ctc_utils.py | 3 +- utils/remove_longshortdata.py | 38 +++++++++++--------- 8 files changed, 46 insertions(+), 35 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 8dad5074..7eed9391 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -575,10 +575,10 @@ class U2Tester(U2Trainer): @paddle.no_grad() def align(self): - ctc_utils.ctc_align(self.config, - self.model, self.align_loader, self.config.decoding.batch_size, - self.config.collator.stride_ms, - self.vocab_list, self.args.result_file) + ctc_utils.ctc_align(self.config, self.model, self.align_loader, + self.config.decoding.batch_size, + self.config.collator.stride_ms, self.vocab_list, + self.args.result_file) def load_inferspec(self): """infer model and input spec. diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 6c4365b8..d82034c8 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -528,10 +528,10 @@ class U2Tester(U2Trainer): @paddle.no_grad() def align(self): - ctc_utils.ctc_align(self.config, - self.model, self.align_loader, self.config.decoding.batch_size, - self.config.collator.stride_ms, - self.vocab_list, self.args.result_file) + ctc_utils.ctc_align(self.config, self.model, self.align_loader, + self.config.decoding.batch_size, + self.config.collator.stride_ms, self.vocab_list, + self.args.result_file) def load_inferspec(self): """infer model and input spec. 
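The align() refactors above only re-wrap the same ctc_utils.ctc_align call, but the stride_ms argument they forward is the interesting part: CTC emits one token per encoder frame, and frame indices only become timestamps through the frame shift. A simplified stand-in for that idea, not the actual ctc_utils code; the subsampling factor of 4 assumes the conv2d input layer:

def remove_blank_and_repeats(frame_tokens, blank=0):
    # collapse a per-frame CTC path into (token_id, first_frame) pairs
    out, prev = [], blank
    for t, token in enumerate(frame_tokens):
        if token != blank and token != prev:
            out.append((token, t))
        prev = token
    return out


def frames_to_times(frame_indices, stride_ms=10.0, subsampling=4):
    # encoder subsampling (conv2d -> 1/4) stretches the effective frame shift
    return [i * subsampling * stride_ms / 1000.0 for i in frame_indices]


path = [0, 0, 5, 5, 0, 7, 7, 7, 0]
print(remove_blank_and_repeats(path))  # [(5, 2), (7, 5)]
print(frames_to_times([2, 5]))         # [0.08, 0.2]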
diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 9141b361..91390afe 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -543,10 +543,10 @@ class U2STTester(U2STTrainer): @paddle.no_grad() def align(self): - ctc_utils.ctc_align(self.config, - self.model, self.align_loader, self.config.decoding.batch_size, - self.config.collator.stride_ms, - self.vocab_list, self.args.result_file) + ctc_utils.ctc_align(self.config, self.model, self.align_loader, + self.config.decoding.batch_size, + self.config.collator.stride_ms, self.vocab_list, + self.args.result_file) def load_inferspec(self): """infer model and input spec. diff --git a/paddlespeech/s2t/transform/cmvn.py b/paddlespeech/s2t/transform/cmvn.py index dc9ea87e..aa1e6b44 100644 --- a/paddlespeech/s2t/transform/cmvn.py +++ b/paddlespeech/s2t/transform/cmvn.py @@ -14,10 +14,12 @@ # Modified from espnet(https://github.com/espnet/espnet) import io import json + import h5py import kaldiio import numpy as np + class CMVN(): "Apply Global/Spk CMVN/iverserCMVN." @@ -158,11 +160,14 @@ class UtteranceCMVN(): return x - class GlobalCMVN(): "Apply Global CMVN" - def __init__(self, cmvn_path, norm_means=True, norm_vars=True, std_floor=1.0e-20): + def __init__(self, + cmvn_path, + norm_means=True, + norm_vars=True, + std_floor=1.0e-20): self.cmvn_path = cmvn_path self.norm_means = norm_means self.norm_vars = norm_vars @@ -189,4 +194,4 @@ class GlobalCMVN(): if self.norm_vars: x = np.divide(x, self.std) - return x \ No newline at end of file + return x diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index ee4c7ce0..873adb0b 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -17,6 +17,7 @@ import numpy import scipy import soundfile import soxbindings as sox + from paddlespeech.s2t.io.reader import SoundHDF5File @@ -171,6 +172,7 @@ class SpeedPerturbationSox(): upper={self.upper}, keep_length={self.keep_length}, sample_rate={self.sr})""" + else: return f"""{self.__class__.__name__}( utt2ratio={self.utt2ratio_file}, diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index bfe6c53d..381b0cdc 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -46,8 +46,7 @@ import_alias = dict( wpe="paddlespeech.s2t.transform.wpe:WPE", channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector", fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi", - cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN" -) + cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN") class Transformation(): diff --git a/paddlespeech/s2t/utils/ctc_utils.py b/paddlespeech/s2t/utils/ctc_utils.py index f5822e5d..886b7203 100644 --- a/paddlespeech/s2t/utils/ctc_utils.py +++ b/paddlespeech/s2t/utils/ctc_utils.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
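For reference, the GlobalCMVN transform added above boils down to a few lines of numpy once the stats are in memory. This hedged sketch assumes the mean_stat/var_stat/frame_num layout that compute_mean_std.py writes into data/mean_std.json; it is an illustration, not the class itself:

import json

import numpy as np


def load_json_cmvn(cmvn_path, std_floor=1.0e-20):
    with open(cmvn_path) as fin:
        stats = json.load(fin)
    mean = np.array(stats['mean_stat']) / stats['frame_num']
    var = np.array(stats['var_stat']) / stats['frame_num'] - mean * mean
    std = np.maximum(np.sqrt(var), std_floor)
    return mean, std


def apply_global_cmvn(x, mean, std, norm_means=True, norm_vars=True):
    # x: (time, feat_dim) fbank features
    if norm_means:
        x = x - mean
    if norm_vars:
        x = x / std
    return x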
# Modified from wenet(https://github.com/wenet-e2e/wenet)
-from typing import List
 from pathlib import Path
+from typing import List
+
 import numpy as np
 import paddle
 
diff --git a/utils/remove_longshortdata.py b/utils/remove_longshortdata.py
index dcc05b23..131b4a58 100755
--- a/utils/remove_longshortdata.py
+++ b/utils/remove_longshortdata.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 """remove long/short data from manifest"""
-import logging
 import argparse
+import logging
+
 import jsonlines
 
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
@@ -23,17 +24,19 @@ def get_parser():
     parser.add_argument(
         "--verbose", "-V", default=0, type=int, help="Verbose option")
     parser.add_argument(
-        "--iaxis", default=0, type=int, help="multi inputs index, 0 is the first")
-    parser.add_argument(
-        "--oaxis", default=0, type=int, help="multi outputs index, 0 is the first")
-    parser.add_argument(
-        "--maxframes", default=2000, type=int, help="maxframes")
-    parser.add_argument(
-        "--minframes", default=10, type=int, help="minframes")
+        "--iaxis",
+        default=0,
+        type=int,
+        help="multi inputs index, 0 is the first")
     parser.add_argument(
-        "--maxchars", default=200, type=int, help="max tokens")
-    parser.add_argument(
-        "--minchars", default=0, type=int, help="min tokens")
+        "--oaxis",
+        default=0,
+        type=int,
+        help="multi outputs index, 0 is the first")
+    parser.add_argument("--maxframes", default=2000, type=int, help="maxframes")
+    parser.add_argument("--minframes", default=10, type=int, help="minframes")
+    parser.add_argument("--maxchars", default=200, type=int, help="max tokens")
+    parser.add_argument("--minchars", default=0, type=int, help="min tokens")
     parser.add_argument(
         "--stride_ms", default=10, type=int, help="stride in ms unit.")
     parser.add_argument(
@@ -54,7 +57,7 @@ def filter_input(args, line):
         nframe = tmp['shape'][0] * 1000 / args.stride_ms
     else:
         nframe = tmp['shape'][0]
-    
+
     if nframe < args.minframes or nframe > args.maxframes:
         return True
     else:
@@ -67,7 +70,7 @@ def filter_output(args, line):
         return True
     else:
         return False
-    
+
 
 def main():
     args = get_parser().parse_args()
@@ -78,15 +81,15 @@ def main():
     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
     if args.verbose > 0:
         logging.basicConfig(level=logging.INFO, format=logfmt)
     else:
         logging.basicConfig(level=logging.WARN, format=logfmt)
     logging.info(get_commandline_args())
-    
+
     with jsonlines.open(args.rspecifier, 'r') as reader:
         lines = list(reader)
         logging.info(f"Example: {len(lines)}")
         feat = lines[0]['input'][args.iaxis]['feat']
-    args.sound = False 
+    args.sound = False
     if feat.split('.')[-1] not in ('ark', 'scp'):
         args.sound = True
-    
+
     count = 0
     filter = 0
     with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
@@ -98,5 +101,6 @@ def main():
             count += 1
     logging.info(f"Example after filter: {count}/{filter}")
 
+
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()

From d62092ac289ec8adbbff5ca2126a459449367e1f Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 8 Nov 2021 06:02:41 +0000
Subject: [PATCH 09/25] fix specaug param

---
 examples/librispeech/s1/conf/preprocess.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/s1/conf/preprocess.yaml
index dd4cfd27..bcbc7ad2 100644
--- a/examples/librispeech/s1/conf/preprocess.yaml
+++ b/examples/librispeech/s1/conf/preprocess.yaml
@@ -10,19 +10,19 @@ process:
     cmvn_path: data/mean_std.json
   # these three processes are a.k.a.
SpecAugument - type: time_warp - max_time_warp: 5 + max_time_warp: 0 inplace: true mode: PIL - type: freq_mask - F: 30 + F: 10 n_mask: 2 inplace: true - replace_with_zero: false + replace_with_zero: true - type: time_mask - T: 40 + T: 50 n_mask: 2 inplace: true - replace_with_zero: false + replace_with_zero: true From 8b0e344c6983a28057bfd60e32cc6ef9af91c584 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 8 Nov 2021 06:53:22 +0000 Subject: [PATCH 10/25] fix logfbank using PCM16 --- examples/librispeech/s1/conf/preprocess.yaml | 4 -- paddlespeech/s2t/frontend/audio.py | 30 ++---------- paddlespeech/s2t/frontend/utility.py | 51 +++++++++++++++++++- paddlespeech/s2t/transform/spectrogram.py | 27 +++++++++-- 4 files changed, 76 insertions(+), 36 deletions(-) diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/s1/conf/preprocess.yaml index bcbc7ad2..97ebf41d 100644 --- a/examples/librispeech/s1/conf/preprocess.yaml +++ b/examples/librispeech/s1/conf/preprocess.yaml @@ -23,7 +23,3 @@ process: n_mask: 2 inplace: true replace_with_zero: true - - - - diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index 13dc3a44..4171f85b 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -25,6 +25,8 @@ import soxbindings as sox from scipy import signal from .utility import subfile_from_tar +from .utility import convert_samples_to_float32 +from .utility import convert_samples_from_float32 class AudioSegment(): @@ -689,15 +691,7 @@ class AudioSegment(): Audio sample type is usually integer or float-point. Integers will be scaled to [-1, 1] in float32. """ - float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: - bits = np.iinfo(samples.dtype).bits - float32_samples *= (1. / 2**(bits - 1)) - elif samples.dtype in np.sctypes['float']: - pass - else: - raise TypeError("Unsupported sample type: %s." % samples.dtype) - return float32_samples + return convert_samples_to_float32(samples) def _convert_samples_from_float32(self, samples, dtype): """Convert sample type from float32 to dtype. @@ -708,20 +702,4 @@ class AudioSegment(): This is for writing a audio file. """ - dtype = np.dtype(dtype) - output_samples = samples.copy() - if dtype in np.sctypes['int']: - bits = np.iinfo(dtype).bits - output_samples *= (2**(bits - 1) / 1.) - min_val = np.iinfo(dtype).min - max_val = np.iinfo(dtype).max - output_samples[output_samples > max_val] = max_val - output_samples[output_samples < min_val] = min_val - elif samples.dtype in np.sctypes['float']: - min_val = np.finfo(dtype).min - max_val = np.finfo(dtype).max - output_samples[output_samples > max_val] = max_val - output_samples[output_samples < min_val] = min_val - else: - raise TypeError("Unsupported sample type: %s." 
% samples.dtype) - return output_samples.astype(dtype) + return convert_samples_from_float32(samples, dtype) diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 089890d2..58e5b1b0 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -30,7 +30,8 @@ logger = Log(__name__).getlog() __all__ = [ "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", - "EOS", "UNK", "BLANK", "MASKCTC", "SPACE" + "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32", + "convert_samples_from_float32" ] IGNORE_ID = -1 @@ -342,3 +343,51 @@ def load_cmvn(cmvn_file: str, filetype: str): else: raise ValueError(f"cmvn file type no support: {filetype}") return cmvn[0], cmvn[1] + + +def convert_samples_to_float32(samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + + PCM16 -> PCM32 + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2**(bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + +def convert_samples_from_float32(samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + PCM32 -> PCM16 + """ + dtype = np.dtype(dtype) + output_samples = samples.copy() + if dtype in np.sctypes['int']: + bits = np.iinfo(dtype).bits + output_samples *= (2**(bits - 1) / 1.) + min_val = np.iinfo(dtype).min + max_val = np.iinfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + elif samples.dtype in np.sctypes['float']: + min_val = np.finfo(dtype).min + max_val = np.finfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + else: + raise TypeError("Unsupported sample type: %s." 
% samples.dtype) + return output_samples.astype(dtype) + diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 6956b908..9e576d0d 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -307,6 +307,9 @@ class IStft(): center=self.center, ) +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + class LogMelSpectrogramKaldi(): def __init__( self, @@ -346,7 +349,7 @@ class LogMelSpectrogramKaldi(): def __repr__(self): return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " "n_shift={n_shift}, win_length={win_length}, window={window}, " - "fmin={fmin}, fmax={fmax}, eps={eps}))".format( + "fmin={fmin}, fmax={fmax}, eps={eps}, preemph={preemph}, window={window}, dither={dither}))".format( name=self.__class__.__name__, fs=self.fs, n_mels=self.n_mels, @@ -356,7 +359,10 @@ class LogMelSpectrogramKaldi(): window=self.window, fmin=self.fmin, fmax=self.fmax, - eps=self.eps, )) + eps=self.eps, + preemph=self.preemph, + window=self.window, + dither=self.dither)) def __call__(self, x): """ @@ -372,9 +378,16 @@ class LogMelSpectrogramKaldi(): """ if x.ndim != 1: raise ValueError("Not support x: [Time, Channel]") - if x.dtype == np.int16: - x = x / 2**(16 - 1) - return logfbank( + + logger.info(f"in {x}") + if x.dtype in np.sctypes['float']: + # PCM32 -> PCM16 + bits = np.iinfo(np.int16).bits + x = x * 2**(bits - 1) + logger.info(f"b {x}") + + # logfbank need PCM16 input + y = logfbank( signal=x, samplerate=self.fs, winlen=self.win_length, # unit ms @@ -387,3 +400,7 @@ class LogMelSpectrogramKaldi(): remove_dc_offset=self.remove_dc_offset, preemph=self.preemph, wintype=self.window) + logger.info(f"a {y}") + + + return y From 9a71c091c575a204a73128fc31034a7f0d9587a7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 8 Nov 2021 07:09:07 +0000 Subject: [PATCH 11/25] remove debug info and format code --- examples/librispeech/s1/conf/preprocess.yaml | 10 ++--- paddlespeech/s2t/frontend/audio.py | 4 +- paddlespeech/s2t/frontend/utility.py | 1 - paddlespeech/s2t/transform/spec_augment.py | 3 ++ paddlespeech/s2t/transform/spectrogram.py | 40 ++++++++------------ 5 files changed, 26 insertions(+), 32 deletions(-) diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/s1/conf/preprocess.yaml index 97ebf41d..021ca4c5 100644 --- a/examples/librispeech/s1/conf/preprocess.yaml +++ b/examples/librispeech/s1/conf/preprocess.yaml @@ -10,16 +10,16 @@ process: cmvn_path: data/mean_std.json # these three processes are a.k.a. 
SpecAugument - type: time_warp - max_time_warp: 0 + max_time_warp: 5 inplace: true mode: PIL - type: freq_mask - F: 10 + F: 30 n_mask: 2 inplace: true - replace_with_zero: true + replace_with_zero: false - type: time_mask - T: 50 + T: 40 n_mask: 2 inplace: true - replace_with_zero: true + replace_with_zero: false diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index 4171f85b..65dccad3 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -24,9 +24,9 @@ import soundfile import soxbindings as sox from scipy import signal -from .utility import subfile_from_tar -from .utility import convert_samples_to_float32 from .utility import convert_samples_from_float32 +from .utility import convert_samples_to_float32 +from .utility import subfile_from_tar class AudioSegment(): diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 58e5b1b0..703f2127 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -390,4 +390,3 @@ def convert_samples_from_float32(samples, dtype): else: raise TypeError("Unsupported sample type: %s." % samples.dtype) return output_samples.astype(dtype) - diff --git a/paddlespeech/s2t/transform/spec_augment.py b/paddlespeech/s2t/transform/spec_augment.py index 83e4e2e7..5ce95085 100644 --- a/paddlespeech/s2t/transform/spec_augment.py +++ b/paddlespeech/s2t/transform/spec_augment.py @@ -34,6 +34,9 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): :returns numpy.ndarray: time warped spectrogram (time, freq) """ window = max_time_warp + if window == 0: + return x + if mode == "PIL": t = x.shape[0] if t - window <= window: diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 9e576d0d..da91ef92 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -307,9 +307,6 @@ class IStft(): center=self.center, ) -from paddlespeech.s2t.utils.log import Log -logger = Log(__name__).getlog() - class LogMelSpectrogramKaldi(): def __init__( self, @@ -347,22 +344,22 @@ class LogMelSpectrogramKaldi(): self.dither = dither def __repr__(self): - return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " - "n_shift={n_shift}, win_length={win_length}, window={window}, " - "fmin={fmin}, fmax={fmax}, eps={eps}, preemph={preemph}, window={window}, dither={dither}))".format( - name=self.__class__.__name__, - fs=self.fs, - n_mels=self.n_mels, - n_fft=self.n_fft, - n_shift=self.n_shift, - win_length=self.win_length, - window=self.window, - fmin=self.fmin, - fmax=self.fmax, - eps=self.eps, - preemph=self.preemph, - window=self.window, - dither=self.dither)) + return ( + "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, " + "fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + preemph=self.preemph, + win_length=self.win_length, + window=self.window, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, + dither=self.dither, )) def __call__(self, x): """ @@ -379,12 +376,10 @@ class LogMelSpectrogramKaldi(): if x.ndim != 1: raise ValueError("Not support x: [Time, Channel]") - logger.info(f"in {x}") if x.dtype in np.sctypes['float']: # PCM32 -> PCM16 bits = np.iinfo(np.int16).bits x = x * 2**(bits - 1) - logger.info(f"b {x}") # logfbank need PCM16 
input y = logfbank( @@ -400,7 +395,4 @@ class LogMelSpectrogramKaldi(): remove_dc_offset=self.remove_dc_offset, preemph=self.preemph, wintype=self.window) - logger.info(f"a {y}") - - return y From 3046a22719f722ab1bd9f073c1f9edf5e414af80 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 16 Nov 2021 07:25:34 +0000 Subject: [PATCH 12/25] aishell support utt2spk --- examples/aishell/s1/conf/transformer.yaml | 112 ++++++++++++++++++++++ examples/dataset/aishell/aishell.py | 4 + 2 files changed, 116 insertions(+) create mode 100644 examples/aishell/s1/conf/transformer.yaml diff --git a/examples/aishell/s1/conf/transformer.yaml b/examples/aishell/s1/conf/transformer.yaml new file mode 100644 index 00000000..7803097a --- /dev/null +++ b/examples/aishell/s1/conf/transformer.yaml @@ -0,0 +1,112 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.5 + max_input_len: 20.0 # second + min_output_len: 0.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/preprocess.yaml + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + +# network architecture +model: + cmvn_file: + cmvn_file_type: "json" + # encoder related + encoder: transformer + encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + ctc_dropoutrate: 0.0 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + + +training: + n_epoch: 240 + accum_grad: 2 + global_grad_clip: 5.0 + optim: adam + optim_conf: + lr: 0.002 + weight_decay: 1e-6 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + + +decoding: + batch_size: 128 + error_rate_type: cer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. 
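Looping back to the PCM16 fix two patches up: the convention it settles on is that float audio lives in [-1, 1) while python_speech_features' logfbank was tuned for int16-range magnitudes, hence the rescale by 2**15 before calling it. A small sketch of both directions; the function names are illustrative:

import numpy as np


def pcm16_to_float32(samples):
    # int16 [-32768, 32767] -> float32 [-1, 1), as convert_samples_to_float32 does
    return samples.astype('float32') / 2**15


def float32_to_pcm16_range(samples):
    # restore int16 magnitudes (dtype stays float32), as done before logfbank
    return samples * 2**15


x = np.array([-32768, 0, 16384], dtype=np.int16)
f = pcm16_to_float32(x)  # [-1.0, 0.0, 0.5]
assert np.allclose(float32_to_pcm16_range(f), x.astype('float32'))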
+ num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. + + diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py index 66e06901..95ed0408 100644 --- a/examples/dataset/aishell/aishell.py +++ b/examples/dataset/aishell/aishell.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix): # if no transcription for audio then skipped if audio_id not in transcript_dict: continue + + utt2spk = Path(audio_path).parent.name audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) text = transcript_dict[audio_id] @@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': str(utt2spk), 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text From 3bd87bc37913c14e9cb86bd3735579d640f4dcb6 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 17 Nov 2021 09:44:16 +0000 Subject: [PATCH 13/25] add wenet lincense --- paddlespeech/s2t/modules/attention.py | 1 + paddlespeech/s2t/modules/cmvn.py | 1 + paddlespeech/s2t/modules/conformer_convolution.py | 1 + paddlespeech/s2t/modules/decoder.py | 1 + paddlespeech/s2t/modules/decoder_layer.py | 1 + paddlespeech/s2t/modules/embedding.py | 1 + paddlespeech/s2t/modules/encoder.py | 1 + paddlespeech/s2t/modules/encoder_layer.py | 1 + paddlespeech/s2t/modules/loss.py | 1 + paddlespeech/s2t/modules/mask.py | 1 + paddlespeech/s2t/modules/positionwise_feed_forward.py | 1 + paddlespeech/s2t/modules/subsampling.py | 1 + 12 files changed, 12 insertions(+) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 80eaf975..3d5f8cd1 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py index 6e97f824..67f71b66 100644 --- a/paddlespeech/s2t/modules/cmvn.py +++ b/paddlespeech/s2t/modules/cmvn.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index 7601a5cc..7ec92554 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index b0ab869a..6b4d9591 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 4d516068..520b18de 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 9207658f..5d4e9175 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 0cde5b9f..5c8ba081 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 29d5a2d8..d39c0695 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py index 5750f5a0..c7d9bd45 100644 --- a/paddlespeech/s2t/modules/loss.py +++ b/paddlespeech/s2t/modules/loss.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index 6576cb92..d6b63761 100644 --- a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py index 347264e9..e2619cd4 100644 --- a/paddlespeech/s2t/modules/positionwise_feed_forward.py +++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
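License headers aside, the utt2spk fields introduced by the dataset patches above follow two simple conventions: LibriSpeech utterance ids embed speaker and chapter, while AISHELL keeps each speaker's wavs in a directory named after the speaker. A hedged illustration; the ids and paths are made up:

from pathlib import Path


def librispeech_utt2spk(utt: str) -> str:
    # "1272-128104-0000" -> "1272-128104" (speaker-chapter)
    return '-'.join(utt.split('-')[:2])


def aishell_utt2spk(audio_path: str) -> str:
    # ".../wav/train/S0002/BAC009S0002W0122.wav" -> "S0002"
    return Path(audio_path).parent.name


assert librispeech_utt2spk("1272-128104-0000") == "1272-128104"
assert aishell_utt2spk("wav/train/S0002/BAC009S0002W0122.wav") == "S0002"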
diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 759bd540..99a8300f 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From b9790d03f2564c9d6a5361d34a4e81c26d1db4bc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 18 Nov 2021 02:51:20 +0000 Subject: [PATCH 14/25] add wenetspeech egs --- examples/aishell/s1/run.sh | 2 +- examples/wenetspeech/asr1/conf/conformer.yaml | 113 +++++++++++++++ .../wenetspeech/asr1/conf/preprocess.yaml | 29 ++++ examples/wenetspeech/asr1/local/data.sh | 129 +++++++++++++++++ .../wenetspeech/asr1/local/extract_meta.py | 102 +++++++++++++ .../wenetspeech/asr1/local/process_opus.py | 89 ++++++++++++ examples/wenetspeech/asr1/local/test.sh | 1 + .../asr1/local/wenetspeech_data_prep.sh | 135 ++++++++++++++++++ examples/wenetspeech/asr1/path.sh | 15 ++ examples/wenetspeech/asr1/run.sh | 55 +++++++ 10 files changed, 669 insertions(+), 1 deletion(-) create mode 100644 examples/wenetspeech/asr1/conf/conformer.yaml create mode 100644 examples/wenetspeech/asr1/conf/preprocess.yaml create mode 100644 examples/wenetspeech/asr1/local/data.sh create mode 100644 examples/wenetspeech/asr1/local/extract_meta.py create mode 100644 examples/wenetspeech/asr1/local/process_opus.py create mode 100644 examples/wenetspeech/asr1/local/test.sh create mode 100644 examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh create mode 100644 examples/wenetspeech/asr1/path.sh create mode 100644 examples/wenetspeech/asr1/run.sh diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh index 126c8e4e..94c2c4df 100644 --- a/examples/aishell/s1/run.sh +++ b/examples/aishell/s1/run.sh @@ -53,5 +53,5 @@ fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml new file mode 100644 index 00000000..0340dc85 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -0,0 +1,113 @@ +# network architecture +model: + # encoder related + encoder: conformer + encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + ctc_dropoutrate: 0.0 + ctc_grad_norm_type: null + 
lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.1 # second + max_input_len: 12.0 # second + min_output_len: 1.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/preprocess.yaml + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + +training: + n_epoch: 240 + accum_grad: 16 + global_grad_clip: 5.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + optim: adam + optim_conf: + lr: 0.001 + weight_decay: 1e-6 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 + + +decoding: + batch_size: 128 + error_rate_type: cer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh new file mode 100644 index 00000000..67b3d5a5 --- /dev/null +++ b/examples/wenetspeech/asr1/local/data.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang) +# NPU, ASLP Group (Author: Qijie Shao) + +stage=-1 +stop_stage=100 + +# Use your own data path. You need to download the WenetSpeech dataset by yourself. +wenetspeech_data_dir=./wenetspeech +# Make sure you have 1.2T for ${shards_dir} +shards_dir=./wenetspeech_shards + +#wenetspeech training set +set=L +train_set=train_`echo $set | tr 'A-Z' 'a-z'` +dev_set=dev +test_sets="test_net test_meeting" + +cmvn=true +cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn + + +. 
${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+set -u
+set -o pipefail
+
+
+mkdir -p data
+TARGET_DIR=${MAIN_ROOT}/examples/dataset
+mkdir -p ${TARGET_DIR}
+
+if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
+    # download data
+    echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
+    exit 0;
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "Data preparation"
+    local/wenetspeech_data_prep.sh \
+        --train-subset $set \
+        $wenetspeech_data_dir \
+        data || exit 1;
+fi
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # generate manifests
+    python3 ${TARGET_DIR}/aishell/aishell.py \
+    --manifest_prefix="data/manifest" \
+    --target_dir="${TARGET_DIR}/aishell"
+
+    if [ $? -ne 0 ]; then
+        echo "Prepare Aishell failed. Terminated."
+        exit 1
+    fi
+
+    for dataset in train dev test; do
+        mv data/manifest.${dataset} data/manifest.${dataset}.raw
+    done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # compute mean and stddev for normalizer
+    if $cmvn; then
+        full_size=`cat data/${train_set}/wav.scp | wc -l`
+        sampling_size=$((full_size / cmvn_sampling_divisor))
+        shuf -n $sampling_size data/$train_set/wav.scp \
+            > data/$train_set/wav.scp.sampled
+        num_workers=$(nproc)
+
+        python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+        --manifest_path="data/manifest.train.raw" \
+        --spectrum_type="fbank" \
+        --feat_dim=80 \
+        --delta_delta=false \
+        --stride_ms=10 \
+        --window_ms=25 \
+        --sample_rate=16000 \
+        --use_dB_normalization=False \
+        --num_samples=-1 \
+        --num_workers=${num_workers} \
+        --output_path="data/mean_std.json"
+
+        if [ $? -ne 0 ]; then
+            echo "Compute mean and stddev failed. Terminated."
+            exit 1
+        fi
+    fi
+fi
+
+dict=data/dict/lang_char.txt
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type="char" \
+    --count_threshold=0 \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths "data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    for dataset in train dev test; do
+    {
+        python3 ${MAIN_ROOT}/utils/format_data.py \
+        --cmvn_path "data/mean_std.json" \
+        --unit_type "char" \
+        --vocab_path="data/vocab.txt" \
+        --manifest_path="data/manifest.${dataset}.raw" \
+        --output_path="data/manifest.${dataset}"
+
+        if [ $? -ne 0 ]; then
+            echo "Format manifest failed. Terminated."
+            exit 1
+        fi
+    } &
+    done
+    wait
+fi
+
+echo "WenetSpeech data preparation done."
+exit 0
diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py
new file mode 100644
index 00000000..4de0b7d4
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@@ -0,0 +1,102 @@
+# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
+# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+import argparse
+import json
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="""
+    This script is used to process raw json dataset of WenetSpeech,
+    where the long wav is split into segments and
+    data of wenet format is generated.
+    """)
+    parser.add_argument('input_json', help="""Input json file of WenetSpeech""")
+    parser.add_argument('output_dir', help="""Output dir for prepared data""")
+
+    args = parser.parse_args()
+    return args
+
+
+def meta_analysis(input_json, output_dir):
+    input_dir = os.path.dirname(input_json)
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    try:
+        with open(input_json, 'r') as injson:
+            json_data = json.load(injson)
+    except Exception:
+        sys.exit(f'Failed to load input json file: {input_json}')
+    else:
+        if json_data['audios'] is not None:
+            with open(f'{output_dir}/text', 'w') as utt2text, \
+                open(f'{output_dir}/segments', 'w') as segments, \
+                open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
+                open(f'{output_dir}/wav.scp', 'w') as wavscp, \
+                open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
+                open(f'{output_dir}/reco2dur', 'w') as reco2dur:
+                for long_audio in json_data['audios']:
+                    try:
+                        long_audio_path = os.path.realpath(
+                            os.path.join(input_dir, long_audio['path']))
+                        aid = long_audio['aid']
+                        segments_lists = long_audio['segments']
+                        duration = long_audio['duration']
+                        assert (os.path.exists(long_audio_path))
+                    except AssertionError:
+                        print(f'''Warning: {aid} something is wrong,
+                                maybe AssertionError, skipped''')
+                        continue
+                    except Exception:
+                        print(f'''Warning: {aid} something is wrong, maybe the
+                                error path: {long_audio_path}, skipped''')
+                        continue
+                    else:
+                        wavscp.write(f'{aid}\t{long_audio_path}\n')
+                        reco2dur.write(f'{aid}\t{duration}\n')
+                        for segment_file in segments_lists:
+                            try:
+                                sid = segment_file['sid']
+                                start_time = segment_file['begin_time']
+                                end_time = segment_file['end_time']
+                                dur = end_time - start_time
+                                text = segment_file['text']
+                                segment_subsets = segment_file["subsets"]
+                            except Exception:
+                                print(f'''Warning: {segment_file} something
+                                        is wrong, skipped''')
+                                continue
+                            else:
+                                utt2text.write(f'{sid}\t{text}\n')
+                                segments.write(
+                                    f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
+                                )
+                                utt2dur.write(f'{sid}\t{dur}\n')
+                                segment_sub_names = " ".join(segment_subsets)
+                                utt2subsets.write(
+                                    f'{sid}\t{segment_sub_names}\n')
+
+def main():
+    args = get_args()
+
+    meta_analysis(args.input_json, args.output_dir)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/examples/wenetspeech/asr1/local/process_opus.py b/examples/wenetspeech/asr1/local/process_opus.py
new file mode 100644
index 00000000..603e0082
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/process_opus.py
@@ -0,0 +1,89 @@
+# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
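For orientation, this is roughly the WenetSpeech.json shape that extract_meta.py walks; the field names are taken from the code above, but the values here are placeholders:

meta = {
    "audios": [{
        "aid": "AUDIO0001",                    # long-recording id
        "path": "audio/train/AUDIO0001.opus",  # relative to the json file
        "duration": 120.5,                     # seconds
        "segments": [{
            "sid": "AUDIO0001_S00001",
            "begin_time": 0.5,
            "end_time": 3.2,
            "text": "transcript of the segment",
            "subsets": ["L", "M"],             # training subsets it belongs to
        }],
    }]
}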
+ +# process_opus.py: segmentation and downsampling of opus audio + +# usage: python3 process_opus.py wav.scp segments output_wav.scp + +from pydub import AudioSegment +import sys +import os + + +def read_file(wav_scp, segments): + wav_scp_dict = {} + with open(wav_scp, 'r', encoding='UTF-8') as fin: + for line_str in fin: + wav_id, path = line_str.strip().split() + wav_scp_dict[wav_id] = path + + utt_list = [] + seg_path_list = [] + start_time_list = [] + end_time_list = [] + with open(segments, 'r', encoding='UTF-8') as fin: + for line_str in fin: + arr = line_str.strip().split() + assert len(arr) == 4 + utt_list.append(arr[0]) + seg_path_list.append(wav_scp_dict[arr[1]]) + start_time_list.append(float(arr[2])) + end_time_list.append(float(arr[3])) + return utt_list, seg_path_list, start_time_list, end_time_list + + +# TODO(Qijie): Fix the process logic +def output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list): + num_utts = len(utt_list) + step = int(num_utts * 0.01) + with open(output_wav_scp, 'w', encoding='UTF-8') as fout: + previous_wav_path = "" + for i in range(num_utts): + utt_id = utt_list[i] + current_wav_path = seg_path_list[i] + output_dir = (os.path.dirname(current_wav_path)) \ + .replace("audio", 'audio_seg') + seg_wav_path = os.path.join(output_dir, utt_id + '.wav') + + # if not os.path.exists(output_dir): + # os.makedirs(output_dir) + + if current_wav_path != previous_wav_path: + source_wav = AudioSegment.from_file(current_wav_path) + previous_wav_path = current_wav_path + + start = int(start_time_list[i] * 1000) + end = int(end_time_list[i] * 1000) + target_audio = source_wav[start:end].set_frame_rate(16000) + target_audio.export(seg_wav_path, format="wav") + + fout.write("{} {}\n".format(utt_id, seg_wav_path)) + if i % step == 0: + print("seg wav finished: {}%".format(int(i / step))) + + +def main(): + wav_scp = sys.argv[1] + segments = sys.argv[2] + output_wav_scp = sys.argv[3] + + utt_list, seg_path_list, start_time_list, end_time_list \ + = read_file(wav_scp, segments) + output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh new file mode 100644 index 00000000..e7c64346 --- /dev/null +++ b/examples/wenetspeech/asr1/local/test.sh @@ -0,0 +1 @@ +decode_modes="attention_rescoring ctc_greedy_search" \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh new file mode 100644 index 00000000..85853053 --- /dev/null +++ b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash + +# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) +# Seasalt AI, Inc (Author: Guoguo Chen) +# Mobvoi Inc(Author: Di Wu, Binbin Zhang) +# NPU, ASLP Group (Author: Qijie Shao) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
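Before the data-prep script below: the heart of process_opus.py above is three pydub calls: load, slice in milliseconds, then resample and export. A minimal sketch assuming pydub's AudioSegment API; the paths are placeholders:

from pydub import AudioSegment


def cut_segment(opus_path, start_s, end_s, out_wav, target_sr=16000):
    audio = AudioSegment.from_file(opus_path)
    piece = audio[int(start_s * 1000):int(end_s * 1000)].set_frame_rate(target_sr)
    piece.export(out_wav, format="wav")


cut_segment("audio/AUDIO0001.opus", 0.5, 3.2, "audio_seg/AUDIO0001_S00001.wav")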
+ +set -e +set -o pipefail + +stage=1 +prefix= +train_subset=L + +. ./tools/parse_options.sh || exit 1; + +filter_by_id () { + idlist=$1 + input=$2 + output=$3 + field=1 + if [ $# -eq 4 ]; then + field=$4 + fi + cat $input | perl -se ' + open(F, "<$idlist") || die "Could not open id-list file $idlist"; + while(<F>) { + @A = split; + @A>=1 || die "Invalid id-list file line $_"; + $seen{$A[0]} = 1; + } + while(<>) { + @A = split; + @A > 0 || die "Invalid file line $_"; + @A >= $field || die "Invalid file line $_"; + if ($seen{$A[$field-1]}) { + print $_; + } + }' -- -idlist="$idlist" -field="$field" > $output ||\ + (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1; +} + +subset_data_dir () { + utt_list=$1 + src_dir=$2 + dest_dir=$3 + mkdir -p $dest_dir || exit 1; + # wav.scp text segments utt2dur + filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\ + (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1; + filter_by_id $utt_list $src_dir/text $dest_dir/text ||\ + (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1; + filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\ + (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1; + awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco + filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\ + (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1; + rm -f $dest_dir/reco +} + +if [ $# -ne 2 ]; then + echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>" + echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/" + echo "" + echo "This script takes the WenetSpeech source directory, and prepares the" + echo "WeNet format data directory." + echo " --prefix <prefix> # Prefix for output data directory." + echo " --stage <stage> # Processing stage." + echo " --train-subset <L|M|S|W> # Train subset to be created." + exit 1 +fi + +wenetspeech_dir=$1 +data_dir=$2 + +declare -A subsets +subsets=( + [L]="train_l" + [M]="train_m" + [S]="train_s" + [W]="train_w" + [DEV]="dev" + [TEST_NET]="test_net" + [TEST_MEETING]="test_meeting") + +prefix=${prefix:+${prefix}_} + +corpus_dir=$data_dir/${prefix}corpus/ +if [ $stage -le 1 ]; then + echo "$0: Extract meta into $corpus_dir" + # Sanity check. + [ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\ + echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1; + [ ! -d $wenetspeech_dir/audio ] &&\ + echo "$0: Please download $wenetspeech_dir/audio!" && exit 1; + + [ ! -d $corpus_dir ] && mkdir -p $corpus_dir + + # Files to be created: + # wav.scp text segments utt2dur + python3 local/extract_meta.py \ + $wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: Split data to train, dev, test_net, and test_meeting" + [ ! -f $corpus_dir/utt2subsets ] &&\ + echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1; + for label in $train_subset DEV TEST_NET TEST_MEETING; do + if [ ! ${subsets[$label]+set} ]; then + echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1; + fi + subset=${subsets[$label]} + [ ! 
-d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset + cat $corpus_dir/utt2subsets | \ + awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \ + > $corpus_dir/${prefix}${subset}_utt_list|| exit 1; + subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \ + $corpus_dir $data_dir/${prefix}$subset || exit 1; + done +fi + +echo "$0: Done" \ No newline at end of file diff --git a/examples/wenetspeech/asr1/path.sh b/examples/wenetspeech/asr1/path.sh new file mode 100644 index 00000000..666b29bc --- /dev/null +++ b/examples/wenetspeech/asr1/path.sh @@ -0,0 +1,15 @@ +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +# model exp +MODEL=u2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh new file mode 100644 index 00000000..8c4a12cb --- /dev/null +++ b/examples/wenetspeech/asr1/run.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +. path.sh || exit 1; +set -e + +gpus=0,1,2,3,4,5,6,7 +stage=0 +stop_stage=100 +conf_path=conf/conformer.yaml + +average_checkpoint=true +avg_num=10 + +. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +avg_ckpt=avg_${avg_num} +ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') +echo "checkpoint name ${ckpt}" + +audio_file="data/tmp.wav" + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/data.sh || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # avg n best model + avg.sh best exp/${ckpt}/checkpoints ${avg_num} +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # test ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # ctc alignment of test data + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi + +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 +fi From a7858551b735594e8c418de5c4807b47cdcfa5cf Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 19 Nov 2021 09:49:38 +0000 Subject: [PATCH 15/25] add utt2spk for all dataset --- .../dataset/aidatatang_200zh/aidatatang_200zh.py | 3 +++ .../dataset/mini_librispeech/mini_librispeech.py | 13 +++++++------ examples/dataset/thchs30/thchs30.py | 3 +++ examples/dataset/timit/timit.py | 4 ++-- .../dataset/timit/timit_kaldi_standard_split.py | 8 ++++++++ examples/dataset/voxforge/voxforge.py | 5 ++++- 6 files changed, 27 insertions(+), 9 deletions(-) diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py index e32f619e..85f478c2 100644 --- 
a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py +++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix): audio_path = os.path.abspath(os.path.join(subfolder, fname)) audio_id = os.path.basename(fname)[:-4] + utt2spk = Path(audio_path).parent.name audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) @@ -87,6 +89,7 @@ json.dumps( { 'utt': audio_id, + 'utt2spk': str(utt2spk), 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text, diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/examples/dataset/mini_librispeech/mini_librispeech.py index 65fee81a..730c73a8 100644 --- a/examples/dataset/mini_librispeech/mini_librispeech.py +++ b/examples/dataset/mini_librispeech/mini_librispeech.py @@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path): audio_filepath = os.path.join(subfolder, segments[0] + '.flac') audio_data, samplerate = soundfile.read(audio_filepath) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) json_lines.append( json.dumps({ - 'utt': - os.path.splitext(os.path.basename(audio_filepath))[0], - 'feat': - audio_filepath, + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, 'feat_shape': (duration, ), #second - 'text': - text + 'text': text, })) total_sec += duration diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py index 77a264cb..2ec4ddab 100644 --- a/examples/dataset/thchs30/thchs30.py +++ b/examples/dataset/thchs30/thchs30.py @@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix): assert os.path.exists(audio_path) and os.path.exists(text_path) audio_id = os.path.basename(audio_path)[:-4] + spk = audio_id.split('_')[0] + word_text, syllable_text, phone_text = read_trn(text_path) audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) @@ -122,6 +124,7 @@ json.dumps( { 'utt': audio_id, + 'utt2spk': spk, 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': word_text, # charactor diff --git a/examples/dataset/timit/timit.py b/examples/dataset/timit/timit.py index 311d445c..c4a9f066 100644 --- a/examples/dataset/timit/timit.py +++ b/examples/dataset/timit/timit.py @@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': utt_id, + 'utt2spk': spk, + 'utt2gender': gender, 'feat': str(audio_path), 'feat_shape': (duration, ), # second 'text': word_text, # word 'phone': phone_text, - 'spk': spk, - 'gender': gender, }, ensure_ascii=False)) diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/examples/dataset/timit/timit_kaldi_standard_split.py index 2b494c06..26aa76c7 100644 --- a/examples/dataset/timit/timit_kaldi_standard_split.py +++ b/examples/dataset/timit/timit_kaldi_standard_split.py @@ -24,6 +24,7 @@ import json import os import soundfile +from pathlib import Path parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( @@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix): audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) text = phn_dict[audio_id] + + 
gender_spk = str(Path(audio_path).parent.stem) + spk = gender_spk[1:] + gender = gender_spk[0] + utt_id = '_'.join([spk, gender, audio_id]) json_lines.append( json.dumps( { 'utt': audio_id, + 'utt2spk': spk, + 'utt2gender': gender, 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text diff --git a/examples/dataset/voxforge/voxforge.py b/examples/dataset/voxforge/voxforge.py index 36282bd6..373791bf 100644 --- a/examples/dataset/voxforge/voxforge.py +++ b/examples/dataset/voxforge/voxforge.py @@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path): audio_data, samplerate = soundfile.read(u) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(u))[0] json_lines.append( json.dumps({ - 'utt': os.path.splitext(os.path.basename(u))[0], + 'utt': utt, + 'utt2spk': speaker, 'feat': u, 'feat_shape': (duration, ), #second 'text': trans.lower() From 0defc658e109d8bd208961cfb868786fb64270ed Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Nov 2021 02:49:05 +0000 Subject: [PATCH 16/25] update aishell/librispeech transformer result; wenetspeech pretrain conformer result --- examples/aishell/s1/README.md | 10 +++ examples/aishell/s1/conf/transformer.yaml | 2 +- examples/aishell/s1/local/test_hub.sh | 2 - examples/librispeech/s1/README.md | 8 +-- examples/wenetspeech/README.md | 54 ++++++++++++++ examples/wenetspeech/asr1/.gitignore | 3 + examples/wenetspeech/asr1/README.md | 24 +++++++ examples/wenetspeech/asr1/local/data.sh | 0 examples/wenetspeech/asr1/local/test.sh | 70 ++++++++++++++++++- .../asr1/local/wenetspeech_data_prep.sh | 0 .../frontend/featurizer/text_featurizer.py | 4 +- paddlespeech/s2t/models/u2/u2.py | 2 +- 12 files changed, 169 insertions(+), 10 deletions(-) create mode 100644 examples/wenetspeech/README.md create mode 100644 examples/wenetspeech/asr1/.gitignore create mode 100644 examples/wenetspeech/asr1/README.md mode change 100644 => 100755 examples/wenetspeech/asr1/local/data.sh mode change 100644 => 100755 examples/wenetspeech/asr1/local/test.sh mode change 100644 => 100755 examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md index 0096c73e..8c53f95f 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/s1/README.md @@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding. 
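The utt2spk fields added across the dataset scripts in this patch all follow one manifest convention; a toy manifest line for a LibriSpeech-style id is sketched below, before the result tables. The path, duration, and transcript are placeholders, and `libri_utt2spk` is a hypothetical helper mirroring the `'-'.join(utt.split('-')[:2])` rule used above:

```python
# Toy manifest line showing the utt2spk field added in this patch; the path,
# duration and transcript are placeholders.
import json


def libri_utt2spk(utt):
    """speaker-chapter prefix of a LibriSpeech-style utterance id."""
    return '-'.join(utt.split('-')[:2])


entry = {
    'utt': '1088-134315-0000',
    'utt2spk': libri_utt2spk('1088-134315-0000'),  # -> '1088-134315'
    'feat': '/path/to/1088-134315-0000.flac',
    'feat_shape': (12.3, ),  # duration in seconds
    'text': 'example transcript',
}
print(json.dumps(entry))
```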
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 | + + +## Transformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 | \ No newline at end of file diff --git a/examples/aishell/s1/conf/transformer.yaml b/examples/aishell/s1/conf/transformer.yaml index 7803097a..c021f66b 100644 --- a/examples/aishell/s1/conf/transformer.yaml +++ b/examples/aishell/s1/conf/transformer.yaml @@ -73,7 +73,7 @@ model: training: - n_epoch: 240 + n_epoch: 120 accum_grad: 2 global_grad_clip: 5.0 optim: adam diff --git a/examples/aishell/s1/local/test_hub.sh b/examples/aishell/s1/local/test_hub.sh index 99b141c8..6e78ec78 100755 --- a/examples/aishell/s1/local/test_hub.sh +++ b/examples/aishell/s1/local/test_hub.sh @@ -23,8 +23,6 @@ fi # exit 1 #fi - - for type in attention_rescoring; do echo "decoding ${type}" batch_size=1 diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md index b7ec93eb..20255db8 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/s1/README.md @@ -21,7 +21,7 @@ ## Transformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 | \ No newline at end of file diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md new file mode 100644 index 00000000..fbb322d6 --- /dev/null +++ b/examples/wenetspeech/README.md @@ -0,0 +1,54 @@ +# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech) + +A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition + +## Description + +### Creation + +All the data are collected from YouTube and Podcast. 
Optical character recognition (OCR) and automatic speech recognition (ASR) techniques are adopted to label each YouTube and Podcast recording, respectively. To improve the quality of the corpus, we use a novel end-to-end label error detection method to further validate and filter the data. + +### Categories + +In summary, WenetSpeech groups all data into 3 categories, as the following table shows: + +| Set | Hours | Confidence | Usage | +|------------|-------|-------------|---------------------------------------| +| High Label | 10005 | >=0.95 | Supervised Training | +| Weak Label | 2478 | [0.6, 0.95] | Semi-supervised or noise training | +| Unlabel | 9952 | / | Unsupervised training or Pre-training | +| In Total | 22435 | / | All above | + +### High Label Data + +We classify the high label into 10 groups according to its domain, speaking style, and scenarios. + +| Domain | Youtube | Podcast | Total | +|-------------|---------|---------|--------| +| audiobook | 0 | 250.9 | 250.9 | +| commentary | 112.6 | 135.7 | 248.3 | +| documentary | 386.7 | 90.5 | 477.2 | +| drama | 4338.2 | 0 | 4338.2 | +| interview | 324.2 | 614 | 938.2 | +| news | 0 | 868 | 868 | +| reading | 0 | 1110.2 | 1110.2 | +| talk | 204 | 90.7 | 294.7 | +| variety | 603.3 | 224.5 | 827.8 | +| others | 144 | 507.5 | 651.5 | +| Total | 6113 | 3892 | 10005 | + +As shown in the following table, we provide 3 training subsets, namely `S`, `M` and `L` for building ASR systems on different data scales. + +| Training Subsets | Confidence | Hours | +|------------------|-------------|-------| +| L | [0.95, 1.0] | 10005 | +| M | 1.0 | 1000 | +| S | 1.0 | 100 | + +### Evaluation Sets + +| Evaluation Sets | Hours | Source | Description | +|-----------------|-------|--------------|-----------------------------------------------------------------------------------------| +| DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training | +| TEST\_NET | 23 | Internet | Match test | +| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset | \ No newline at end of file diff --git a/examples/wenetspeech/asr1/.gitignore b/examples/wenetspeech/asr1/.gitignore new file mode 100644 index 00000000..02a22922 --- /dev/null +++ b/examples/wenetspeech/asr1/.gitignore @@ -0,0 +1,3 @@ +data +exp +*.profile diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md new file mode 100644 index 00000000..5aff041f --- /dev/null +++ b/examples/wenetspeech/asr1/README.md @@ -0,0 +1,24 @@ +# WenetSpeech + + +## Conformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | | + + + +## Conformer Pretrain Model + +Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 | +| conformer | 32.52 M | 
conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh old mode 100644 new mode 100755 index e7c64346..47bd2f63 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -1 +1,69 @@ -decode_modes="attention_rescoring ctc_greedy_search" \ No newline at end of file +#!/bin/bash + +if [ $# != 2 ];then + echo "usage: ${0} config_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 + +chunk_mode=false +if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then + chunk_mode=true +fi + +# download language model +#bash local/download_lm_ch.sh +#if [ $? -ne 0 ]; then +# exit 1 +#fi + + +for type in attention ctc_greedy_search; do + echo "decoding ${type}" + if [ ${chunk_mode} == true ];then + # stream decoding only support batchsize=1 + batch_size=1 + else + batch_size=64 + fi + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done + +for type in ctc_prefix_beam_search attention_rescoring; do + echo "decoding ${type}" + batch_size=1 + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" 
+ exit 1 + fi +done + +exit 0 diff --git a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py index 7f3bd9e1..21f512e9 100644 --- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -92,7 +92,9 @@ class TextFeaturizer(): tokens = self.tokenize(text) ids = [] for token in tokens: - token = token if token in self.vocab_dict else self.unk + if token not in self.vocab_dict: + logger.debug(f"Text Token: {token} -> {self.unk}") + token = self.unk ids.append(self.vocab_dict[token]) return ids diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 916a6a05..4f833372 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -860,7 +860,7 @@ class U2Model(U2DecodeModel): int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ # cmvn - if configs['cmvn_file'] is not None: + if 'cmvn_file' in configs and configs['cmvn_file']: mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_file_type']) global_cmvn = GlobalCMVN( From e79e00a6b27504ca4962f56a6e9b71b56a12e9e7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Nov 2021 09:29:23 +0000 Subject: [PATCH 17/25] pack model --- examples/wenetspeech/asr1/README.md | 30 ++--- examples/wenetspeech/asr1/RESULTS.md | 24 ++++ examples/wenetspeech/asr1/utils | 1 + utils/pack_model.sh | 169 +++++++++++++++++++++++++++ utils/show_results.sh | 74 ++++++++++++ 5 files changed, 278 insertions(+), 20 deletions(-) create mode 100644 examples/wenetspeech/asr1/RESULTS.md create mode 120000 examples/wenetspeech/asr1/utils create mode 100755 utils/pack_model.sh create mode 100755 utils/show_results.sh diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md index 5aff041f..c08b94e2 100644 --- a/examples/wenetspeech/asr1/README.md +++ b/examples/wenetspeech/asr1/README.md @@ -1,24 +1,14 @@ -# WenetSpeech +## Pack Model +pack model to tar.gz, e.g. 
-## Conformer +```bash +./utils/pack_model.sh --preprocess_conf conf/preprocess.yaml --dict data/vocab.txt conf/conformer.yaml '' data/mean_std.json exp/conformer/checkpoints/wenetspeec +h.pdparams -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | | -| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | | -| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | | -| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | | +``` - - -## Conformer Pretrain Model - -Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz - -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 | -| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | -| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 | -| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | \ No newline at end of file +show model.tar.gz +``` +tar tf model.tar.gz +``` diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md new file mode 100644 index 00000000..5aff041f --- /dev/null +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -0,0 +1,24 @@ +# WenetSpeech + + +## Conformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | | + + + +## Conformer Pretrain Model + +Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | \ No newline at end of file diff --git a/examples/wenetspeech/asr1/utils b/examples/wenetspeech/asr1/utils new file mode 120000 index 00000000..973afe67 --- /dev/null +++ b/examples/wenetspeech/asr1/utils @@ -0,0 +1 @@ +../../../utils \ No newline at end of file diff --git a/utils/pack_model.sh b/utils/pack_model.sh new file mode 100755 index 00000000..5bd40c84 --- /dev/null +++ b/utils/pack_model.sh @@ -0,0 +1,169 @@ +#!/usr/bin/env bash + +# Copyright 2019 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +[ -f ./path.sh ] && . 
./path.sh + +results="" +# e.g., "exp/tr_it_pytorch_train/decode_dt_it_decode/result.wrd.txt +# exp/tr_it_pytorch_train/decode_et_it_decode/result.wrd.txt" +lm="" +dict="" +etc="" +outfile="model" +preprocess_conf="" + +help_message=$(cat <<EOF +Usage: $0 --lm <lm> --dict <dict> <tr_conf> <dec_conf> <cmvn> <e2e>, for example: +<lm>: exp/train_rnnlm/rnnlm.model.best +<dict>: data/lang_char +<tr_conf>: conf/train.yaml +<dec_conf>: conf/decode.yaml +<cmvn>: data/tr_it/cmvn.ark +<e2e>: exp/tr_it_pytorch_train/results/model.last10.avg.best +EOF +) + +. utils/parse_options.sh + +echo $PWD +echo $dict + +if [ $# != 4 ]; then + echo "${help_message}" + exit 1 +fi + +tr_conf=$1 +dec_conf=$2 +cmvn=$3 +e2e=$4 + +echo " - Model files (archived to ${outfile}.tar.gz by \`\$ pack_model.sh\`)" +echo " - model link: (put the model link manually.)" + +# configs +if [ -e ${tr_conf} ]; then + tar cfh ${outfile}.tar ${tr_conf} + echo -n " - training config file: \`" + echo ${tr_conf} | sed -e "s/$/\`/" +else + echo "missing ${tr_conf}" + exit 1 +fi +if [ -e ${dec_conf} ]; then + tar rfh ${outfile}.tar ${dec_conf} + echo -n " - decoding config file: \`" + echo ${dec_conf} | sed -e "s/$/\`/" +else + echo "missing ${dec_conf}" + exit 1 +fi +# NOTE(kan-bayashi): preprocess conf is optional +if [ -n "${preprocess_conf}" ]; then + tar rfh ${outfile}.tar ${preprocess_conf} + echo -n " - preprocess config file: \`" + echo ${preprocess_conf} | sed -e "s/$/\`/" +fi + +# cmvn +if [ -e ${cmvn} ]; then + tar rfh ${outfile}.tar ${cmvn} + echo -n " - cmvn file: \`" + echo ${cmvn} | sed -e "s/$/\`/" +else + echo "missing ${cmvn}" + exit 1 +fi + +# e2e +if [ -e ${e2e} ]; then + tar rfh ${outfile}.tar ${e2e} + echo -n " - e2e file: \`" + echo ${e2e} | sed -e "s/$/\`/" + + e2e_conf=$(dirname ${e2e})/model.json + if [ ! -e ${e2e_conf} ]; then + echo missing ${e2e_conf} + #exit 1 + else + echo -n " - e2e JSON file: \`" + echo ${e2e_conf} | sed -e "s/$/\`/" + tar rfh ${outfile}.tar ${e2e_conf} + fi +else + echo "missing ${e2e}" + exit 1 +fi + +# lm +if [ -n "${lm}" ]; then + if [ -e ${lm} ]; then + tar rfh ${outfile}.tar ${lm} + echo -n " - lm file: \`" + echo ${lm} | sed -e "s/$/\`/" + + lm_conf=$(dirname ${lm})/model.json + if [ ! -e ${lm_conf} ]; then + echo missing ${lm_conf} + exit 1 + else + echo -n " - lm JSON file: \`" + echo ${lm_conf} | sed -e "s/$/\`/" + tar rfh ${outfile}.tar ${lm_conf} + fi + else + echo "missing ${lm}" + exit 1 + fi +fi + +echo ${dict} +echo test -e ${dict} +# dict +if [ -n "${dict}" ]; then + if [ -e ${dict} ]; then + tar rfh ${outfile}.tar ${dict} + echo -n " - dict file: \`" + echo ${dict} | sed -e "s/$/\`/" + else + echo "missing ${dict}" + exit 1 + fi +fi + +# etc +for x in ${etc}; do + if [ -e ${x} ]; then + tar rfh ${outfile}.tar ${x} + echo -n " - etc file: \`" + echo ${x} | sed -e "s/$/\`/" + else + echo "missing ${x}" + exit 1 + fi +done + +# finally compress the tar file +gzip -f ${outfile}.tar + +# results +if [ -n "${results}" ]; then + echo " - Results (paste them by yourself or obtained by \`\$ pack_model.sh --results <results>\`)" + echo "\`\`\`" +fi +for x in ${results}; do + if [ -e ${x} ]; then + echo "${x}" + grep -e Avg -e SPKR -m 2 ${x} + else + echo "missing ${x}" + exit 1 + fi +done +if [ -n "${results}" ]; then + echo "\`\`\`" +fi + +exit 0 diff --git a/utils/show_results.sh b/utils/show_results.sh new file mode 100755 index 00000000..42f80ee6 --- /dev/null +++ b/utils/show_results.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +mindepth=0 +maxdepth=1 + +. 
utils/parse_options.sh + +if [ $# -gt 1 ]; then + echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2 + echo "" + echo "Show the system environments and the evaluation results in Markdown format." + echo 'The default of <exp> is "exp/".' + exit 1 +fi + +[ -f ./path.sh ] && . ./path.sh +set -euo pipefail +if [ $# -eq 1 ]; then + exp=$1 +else + exp=exp +fi + + +cat << EOF + +# RESULTS +## Environments +- date: \`$(LC_ALL=C date)\` +EOF + +python3 << EOF +import sys, paddle +pyversion = sys.version.replace('\n', ' ') + +print(f"""- python version: \`{pyversion}\` +- paddle version: \`paddle {paddle.__version__}\`""") +EOF + +cat << EOF +- Git hash: \`$(git rev-parse HEAD)\` + - Commit date: \`$(git log -1 --format='%cd')\` + +EOF + +while IFS= read -r expdir; do + if ls ${expdir}/decode_*/result.txt &> /dev/null; then + # 1. Show the result table + cat << EOF +## $(basename ${expdir}) +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +EOF + grep -e Avg ${expdir}/decode_*/result.txt \ + | sed -e "s#${expdir}/\([^/]*\)/result.txt:#|\1#g" \ + | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|' + echo + + # 2. Show the result table for WER + if ls ${expdir}/decode_*/result.wrd.txt &> /dev/null; then + cat << EOF +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +EOF + grep -e Avg ${expdir}/decode_*/result.wrd.txt \ + | sed -e "s#${expdir}/\([^/]*\)/result.wrd.txt:#|\1#g" \ + | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|' + echo + fi + fi +done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d) From f89f99fe4a829a49b19565c6bd79e489bc4afb2b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Nov 2021 09:37:57 +0000 Subject: [PATCH 18/25] update pack model tool --- utils/pack_model.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/utils/pack_model.sh b/utils/pack_model.sh index 5bd40c84..8acd59a6 100755 --- a/utils/pack_model.sh +++ b/utils/pack_model.sh @@ -27,9 +27,6 @@ EOF . 
utils/parse_options.sh -echo $PWD -echo $dict - if [ $# != 4 ]; then echo "${help_message}" exit 1 @@ -119,8 +116,6 @@ if [ -n "${lm}" ]; then fi fi -echo ${dict} -echo test -e ${dict} # dict if [ -n "${dict}" ]; then if [ -e ${dict} ]; then From 02c7ef319898e33650aad90759c98712d3777cad Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Nov 2021 10:30:54 +0000 Subject: [PATCH 19/25] format data support multi output --- examples/ted_en_zh/t0/run.sh | 3 ++- utils/format_data.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/t0/run.sh index e9f4a058..654d4dce 100755 --- a/examples/ted_en_zh/t0/run.sh +++ b/examples/ted_en_zh/t0/run.sh @@ -2,6 +2,7 @@ set -e source path.sh +gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/transformer_joint_noam.yaml @@ -21,7 +22,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then diff --git a/utils/format_data.py b/utils/format_data.py index f9b5e6aa..2fa1924a 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -87,15 +87,24 @@ def main(): tokens = text_feature.tokenize(line) tokenids = text_feature.featurize(line) output_json['output'].append({ - 'name': 'traget1', + 'name': 'target1', 'shape': (len(tokenids), vocab_size), 'text': line, 'token': ' '.join(tokens), 'tokenid': ' '.join(map(str, tokenids)), }) else: - # isinstance(line, list), multi target - raise NotImplementedError("not support multi output now!") + # isinstance(line, list), multi target in one vocab + for i, item in enumerate(line, 1): + tokens = text_feature.tokenize(item) + tokenids = text_feature.featurize(item) + output_json['output'].append({ + 'name': f'target{i}', + 'shape': (len(tokenids), vocab_size), + 'text': item, + 'token': ' '.join(tokens), + 'tokenid': ' '.join(map(str, tokenids)), + }) # input line = line_json['feat'] From b944418d6ffb0fe492185cca2577e9d00d946ce7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Nov 2021 11:11:27 +0000 Subject: [PATCH 20/25] new format data support ds2/st --- examples/dataset/ted_en_zh/ted_en_zh.py | 7 +++++-- examples/ted_en_zh/t0/local/data.sh | 6 +++--- paddlespeech/s2t/io/collator.py | 11 ++++++----- paddlespeech/s2t/io/dataset.py | 2 +- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py index 14bef01d..a8cbb837 100644 --- a/examples/dataset/ted_en_zh/ted_en_zh.py +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix): continue audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) + + + translation_str = " ".join(translation.split()) + trancription_str = " ".join(trancription.split()) json_lines.append( json.dumps( { 'utt': utt, 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': " ".join(translation.split()), - 'text1': " ".join(trancription.split()) + 'text': [translation_str, trancription_str], }, ensure_ascii=False)) diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index ce58f539..d3acbd44 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -9,7 +9,7 @@ stop_stage=100 nbpe=8000 
bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" -data_dir=./TED_EnZh +data_dir=./TED-En-Zh source ${MAIN_ROOT}/utils/parse_options.sh @@ -21,7 +21,7 @@ mkdir -p data if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ! -e ${data_dir} ]; then - echo "Error: Dataset is not avaiable. Please download and unzip the dataset" + echo "Error: ${data_dir} Dataset is not available. Please download and unzip the dataset" echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0" echo "The tree of the directory should be:" echo "." @@ -88,7 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size for set in train dev test; do { - python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ + python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index a500f10c..35b86871 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -237,8 +237,8 @@ class SpeechCollatorBase(): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - text = item['text'] + audio = item['input'][0]['feat'] + text = item['output'][0]['text'] audio, text = self.process_utterance(audio, text) audios.append(audio) # [T, D] @@ -381,9 +381,10 @@ class TripletSpeechCollator(SpeechCollator): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - translation = item['text'] - transcription = item['text1'] + audio = item['input'][0]['feat'] + translation = item['output'][0]['text'] + transcription = item['output'][1]['text'] + audio, translation, transcription = self.process_utterance( audio, translation, transcription) diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index 7007518d..c5df2d6b 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -122,7 +122,7 @@ class ManifestDataset(Dataset): min_output_len=min_output_len, max_output_input_ratio=max_output_input_ratio, min_output_input_ratio=min_output_input_ratio) - self._manifest.sort(key=lambda x: x["feat_shape"][0]) + self._manifest.sort(key=lambda x: x["input"][0]["shape"][0]) def __len__(self): return len(self._manifest) From b57b865989c996bab3e74ea6abad1d28c386a1fd Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Nov 2021 11:13:27 +0000 Subject: [PATCH 21/25] rename egs --- examples/callcenter/{s1 => asr1}/.gitignore | 0 examples/callcenter/{s1 => asr1}/README.md | 0 examples/callcenter/{s1 => asr1}/conf/augmentation.json | 0 examples/callcenter/{s1 => asr1}/conf/chunk_conformer.yaml | 0 examples/callcenter/{s1 => asr1}/conf/conformer.yaml | 0 examples/callcenter/{s1 => asr1}/conf/preprocess.yaml | 0 examples/callcenter/{s1 => asr1}/local/align.sh | 0 examples/callcenter/{s1 => asr1}/local/data.sh | 0 examples/callcenter/{s1 => asr1}/local/download_lm_ch.sh | 0 examples/callcenter/{s1 => asr1}/local/export.sh | 0 examples/callcenter/{s1 => asr1}/local/test.sh | 0 examples/callcenter/{s1 => asr1}/local/train.sh | 0 examples/callcenter/{s1 => asr1}/path.sh | 0 examples/callcenter/{s1 => asr1}/run.sh | 0 examples/ted_en_zh/t0/.gitignore | 2 +- examples/thchs30/README.md | 2 +- examples/thchs30/{a0 => align0}/README.md | 0 examples/thchs30/{a0 => align0}/data/dict/syllable.lexicon | 0 examples/thchs30/{a0 => align0}/local/data.sh | 0 examples/thchs30/{a0 => align0}/local/gen_word2phone.py | 0 
examples/thchs30/{a0 => align0}/local/reorganize_thchs30.py | 0 examples/thchs30/{a0 => align0}/path.sh | 0 examples/thchs30/{a0 => align0}/run.sh | 0 examples/timit/README.md | 2 +- examples/timit/{s1 => asr1}/.gitignore | 0 examples/timit/{s1 => asr1}/README.md | 0 examples/timit/{s1 => asr1}/conf/augmentation.json | 0 examples/timit/{s1 => asr1}/conf/dev_spk.list | 0 examples/timit/{s1 => asr1}/conf/phones.60-48-39.map | 0 examples/timit/{s1 => asr1}/conf/preprocess.yaml | 0 examples/timit/{s1 => asr1}/conf/test_spk.list | 0 examples/timit/{s1 => asr1}/conf/transformer.yaml | 0 examples/timit/{s1 => asr1}/local/align.sh | 0 examples/timit/{s1 => asr1}/local/data.sh | 0 examples/timit/{s1 => asr1}/local/export.sh | 0 examples/timit/{s1 => asr1}/local/test.sh | 0 examples/timit/{s1 => asr1}/local/timit_data_prep.sh | 0 examples/timit/{s1 => asr1}/local/timit_norm_trans.pl | 0 examples/timit/{s1 => asr1}/local/train.sh | 0 examples/timit/{s1 => asr1}/path.sh | 0 examples/timit/{s1 => asr1}/run.sh | 0 examples/wenetspeech/README.md | 2 ++ 42 files changed, 5 insertions(+), 3 deletions(-) rename examples/callcenter/{s1 => asr1}/.gitignore (100%) rename examples/callcenter/{s1 => asr1}/README.md (100%) rename examples/callcenter/{s1 => asr1}/conf/augmentation.json (100%) rename examples/callcenter/{s1 => asr1}/conf/chunk_conformer.yaml (100%) rename examples/callcenter/{s1 => asr1}/conf/conformer.yaml (100%) rename examples/callcenter/{s1 => asr1}/conf/preprocess.yaml (100%) rename examples/callcenter/{s1 => asr1}/local/align.sh (100%) rename examples/callcenter/{s1 => asr1}/local/data.sh (100%) rename examples/callcenter/{s1 => asr1}/local/download_lm_ch.sh (100%) rename examples/callcenter/{s1 => asr1}/local/export.sh (100%) rename examples/callcenter/{s1 => asr1}/local/test.sh (100%) rename examples/callcenter/{s1 => asr1}/local/train.sh (100%) rename examples/callcenter/{s1 => asr1}/path.sh (100%) rename examples/callcenter/{s1 => asr1}/run.sh (100%) rename examples/thchs30/{a0 => align0}/README.md (100%) rename examples/thchs30/{a0 => align0}/data/dict/syllable.lexicon (100%) rename examples/thchs30/{a0 => align0}/local/data.sh (100%) rename examples/thchs30/{a0 => align0}/local/gen_word2phone.py (100%) rename examples/thchs30/{a0 => align0}/local/reorganize_thchs30.py (100%) rename examples/thchs30/{a0 => align0}/path.sh (100%) rename examples/thchs30/{a0 => align0}/run.sh (100%) rename examples/timit/{s1 => asr1}/.gitignore (100%) rename examples/timit/{s1 => asr1}/README.md (100%) rename examples/timit/{s1 => asr1}/conf/augmentation.json (100%) rename examples/timit/{s1 => asr1}/conf/dev_spk.list (100%) rename examples/timit/{s1 => asr1}/conf/phones.60-48-39.map (100%) rename examples/timit/{s1 => asr1}/conf/preprocess.yaml (100%) rename examples/timit/{s1 => asr1}/conf/test_spk.list (100%) rename examples/timit/{s1 => asr1}/conf/transformer.yaml (100%) rename examples/timit/{s1 => asr1}/local/align.sh (100%) rename examples/timit/{s1 => asr1}/local/data.sh (100%) rename examples/timit/{s1 => asr1}/local/export.sh (100%) rename examples/timit/{s1 => asr1}/local/test.sh (100%) rename examples/timit/{s1 => asr1}/local/timit_data_prep.sh (100%) rename examples/timit/{s1 => asr1}/local/timit_norm_trans.pl (100%) rename examples/timit/{s1 => asr1}/local/train.sh (100%) rename examples/timit/{s1 => asr1}/path.sh (100%) rename examples/timit/{s1 => asr1}/run.sh (100%) diff --git a/examples/callcenter/s1/.gitignore b/examples/callcenter/asr1/.gitignore similarity index 100% rename from 
examples/callcenter/s1/.gitignore rename to examples/callcenter/asr1/.gitignore diff --git a/examples/callcenter/s1/README.md b/examples/callcenter/asr1/README.md similarity index 100% rename from examples/callcenter/s1/README.md rename to examples/callcenter/asr1/README.md diff --git a/examples/callcenter/s1/conf/augmentation.json b/examples/callcenter/asr1/conf/augmentation.json similarity index 100% rename from examples/callcenter/s1/conf/augmentation.json rename to examples/callcenter/asr1/conf/augmentation.json diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml similarity index 100% rename from examples/callcenter/s1/conf/chunk_conformer.yaml rename to examples/callcenter/asr1/conf/chunk_conformer.yaml diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml similarity index 100% rename from examples/callcenter/s1/conf/conformer.yaml rename to examples/callcenter/asr1/conf/conformer.yaml diff --git a/examples/callcenter/s1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml similarity index 100% rename from examples/callcenter/s1/conf/preprocess.yaml rename to examples/callcenter/asr1/conf/preprocess.yaml diff --git a/examples/callcenter/s1/local/align.sh b/examples/callcenter/asr1/local/align.sh similarity index 100% rename from examples/callcenter/s1/local/align.sh rename to examples/callcenter/asr1/local/align.sh diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/asr1/local/data.sh similarity index 100% rename from examples/callcenter/s1/local/data.sh rename to examples/callcenter/asr1/local/data.sh diff --git a/examples/callcenter/s1/local/download_lm_ch.sh b/examples/callcenter/asr1/local/download_lm_ch.sh similarity index 100% rename from examples/callcenter/s1/local/download_lm_ch.sh rename to examples/callcenter/asr1/local/download_lm_ch.sh diff --git a/examples/callcenter/s1/local/export.sh b/examples/callcenter/asr1/local/export.sh similarity index 100% rename from examples/callcenter/s1/local/export.sh rename to examples/callcenter/asr1/local/export.sh diff --git a/examples/callcenter/s1/local/test.sh b/examples/callcenter/asr1/local/test.sh similarity index 100% rename from examples/callcenter/s1/local/test.sh rename to examples/callcenter/asr1/local/test.sh diff --git a/examples/callcenter/s1/local/train.sh b/examples/callcenter/asr1/local/train.sh similarity index 100% rename from examples/callcenter/s1/local/train.sh rename to examples/callcenter/asr1/local/train.sh diff --git a/examples/callcenter/s1/path.sh b/examples/callcenter/asr1/path.sh similarity index 100% rename from examples/callcenter/s1/path.sh rename to examples/callcenter/asr1/path.sh diff --git a/examples/callcenter/s1/run.sh b/examples/callcenter/asr1/run.sh similarity index 100% rename from examples/callcenter/s1/run.sh rename to examples/callcenter/asr1/run.sh diff --git a/examples/ted_en_zh/t0/.gitignore b/examples/ted_en_zh/t0/.gitignore index 123e5174..469c6171 100644 --- a/examples/ted_en_zh/t0/.gitignore +++ b/examples/ted_en_zh/t0/.gitignore @@ -1,3 +1,3 @@ -TED_EnZh +TED-En-Zh data exp diff --git a/examples/thchs30/README.md b/examples/thchs30/README.md index 7b3cc3d9..9a0026a0 100644 --- a/examples/thchs30/README.md +++ b/examples/thchs30/README.md @@ -1,3 +1,3 @@ # thchs30 -* a0 for mfa alignment +* align0 - mfa alignment diff --git a/examples/thchs30/a0/README.md b/examples/thchs30/align0/README.md similarity index 100% rename from 
examples/thchs30/a0/README.md rename to examples/thchs30/align0/README.md diff --git a/examples/thchs30/a0/data/dict/syllable.lexicon b/examples/thchs30/align0/data/dict/syllable.lexicon similarity index 100% rename from examples/thchs30/a0/data/dict/syllable.lexicon rename to examples/thchs30/align0/data/dict/syllable.lexicon diff --git a/examples/thchs30/a0/local/data.sh b/examples/thchs30/align0/local/data.sh similarity index 100% rename from examples/thchs30/a0/local/data.sh rename to examples/thchs30/align0/local/data.sh diff --git a/examples/thchs30/a0/local/gen_word2phone.py b/examples/thchs30/align0/local/gen_word2phone.py similarity index 100% rename from examples/thchs30/a0/local/gen_word2phone.py rename to examples/thchs30/align0/local/gen_word2phone.py diff --git a/examples/thchs30/a0/local/reorganize_thchs30.py b/examples/thchs30/align0/local/reorganize_thchs30.py similarity index 100% rename from examples/thchs30/a0/local/reorganize_thchs30.py rename to examples/thchs30/align0/local/reorganize_thchs30.py diff --git a/examples/thchs30/a0/path.sh b/examples/thchs30/align0/path.sh similarity index 100% rename from examples/thchs30/a0/path.sh rename to examples/thchs30/align0/path.sh diff --git a/examples/thchs30/a0/run.sh b/examples/thchs30/align0/run.sh similarity index 100% rename from examples/thchs30/a0/run.sh rename to examples/thchs30/align0/run.sh diff --git a/examples/timit/README.md b/examples/timit/README.md index b7c8b754..87f1858f 100644 --- a/examples/timit/README.md +++ b/examples/timit/README.md @@ -1,3 +1,3 @@ # TIMIT -* s1 u2 model with phone unit +* asr1 - u2 model with phone unit diff --git a/examples/timit/s1/.gitignore b/examples/timit/asr1/.gitignore similarity index 100% rename from examples/timit/s1/.gitignore rename to examples/timit/asr1/.gitignore diff --git a/examples/timit/s1/README.md b/examples/timit/asr1/README.md similarity index 100% rename from examples/timit/s1/README.md rename to examples/timit/asr1/README.md diff --git a/examples/timit/s1/conf/augmentation.json b/examples/timit/asr1/conf/augmentation.json similarity index 100% rename from examples/timit/s1/conf/augmentation.json rename to examples/timit/asr1/conf/augmentation.json diff --git a/examples/timit/s1/conf/dev_spk.list b/examples/timit/asr1/conf/dev_spk.list similarity index 100% rename from examples/timit/s1/conf/dev_spk.list rename to examples/timit/asr1/conf/dev_spk.list diff --git a/examples/timit/s1/conf/phones.60-48-39.map b/examples/timit/asr1/conf/phones.60-48-39.map similarity index 100% rename from examples/timit/s1/conf/phones.60-48-39.map rename to examples/timit/asr1/conf/phones.60-48-39.map diff --git a/examples/timit/s1/conf/preprocess.yaml b/examples/timit/asr1/conf/preprocess.yaml similarity index 100% rename from examples/timit/s1/conf/preprocess.yaml rename to examples/timit/asr1/conf/preprocess.yaml diff --git a/examples/timit/s1/conf/test_spk.list b/examples/timit/asr1/conf/test_spk.list similarity index 100% rename from examples/timit/s1/conf/test_spk.list rename to examples/timit/asr1/conf/test_spk.list diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml similarity index 100% rename from examples/timit/s1/conf/transformer.yaml rename to examples/timit/asr1/conf/transformer.yaml diff --git a/examples/timit/s1/local/align.sh b/examples/timit/asr1/local/align.sh similarity index 100% rename from examples/timit/s1/local/align.sh rename to examples/timit/asr1/local/align.sh diff --git a/examples/timit/s1/local/data.sh 
b/examples/timit/asr1/local/data.sh similarity index 100% rename from examples/timit/s1/local/data.sh rename to examples/timit/asr1/local/data.sh diff --git a/examples/timit/s1/local/export.sh b/examples/timit/asr1/local/export.sh similarity index 100% rename from examples/timit/s1/local/export.sh rename to examples/timit/asr1/local/export.sh diff --git a/examples/timit/s1/local/test.sh b/examples/timit/asr1/local/test.sh similarity index 100% rename from examples/timit/s1/local/test.sh rename to examples/timit/asr1/local/test.sh diff --git a/examples/timit/s1/local/timit_data_prep.sh b/examples/timit/asr1/local/timit_data_prep.sh similarity index 100% rename from examples/timit/s1/local/timit_data_prep.sh rename to examples/timit/asr1/local/timit_data_prep.sh diff --git a/examples/timit/s1/local/timit_norm_trans.pl b/examples/timit/asr1/local/timit_norm_trans.pl similarity index 100% rename from examples/timit/s1/local/timit_norm_trans.pl rename to examples/timit/asr1/local/timit_norm_trans.pl diff --git a/examples/timit/s1/local/train.sh b/examples/timit/asr1/local/train.sh similarity index 100% rename from examples/timit/s1/local/train.sh rename to examples/timit/asr1/local/train.sh diff --git a/examples/timit/s1/path.sh b/examples/timit/asr1/path.sh similarity index 100% rename from examples/timit/s1/path.sh rename to examples/timit/asr1/path.sh diff --git a/examples/timit/s1/run.sh b/examples/timit/asr1/run.sh similarity index 100% rename from examples/timit/s1/run.sh rename to examples/timit/asr1/run.sh diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md index fbb322d6..3d93677f 100644 --- a/examples/wenetspeech/README.md +++ b/examples/wenetspeech/README.md @@ -1,5 +1,7 @@ # [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech) +* asr1 - u2 asr model + A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition ## Description From 2f4f74407131e3bbf09cd03efeb621347ecd0981 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Nov 2021 11:30:54 +0000 Subject: [PATCH 22/25] rename asr egs --- examples/aishell/README.md | 6 ++++-- examples/aishell/{s0 => asr0}/.gitignore | 0 examples/aishell/{s0 => asr0}/README.md | 0 examples/aishell/{s0 => asr0}/conf/augmentation.json | 0 examples/aishell/{s0 => asr0}/conf/deepspeech2.yaml | 0 examples/aishell/{s0 => asr0}/conf/deepspeech2_online.yaml | 0 examples/aishell/{s0 => asr0}/local/data.sh | 0 examples/aishell/{s0 => asr0}/local/download_lm_ch.sh | 0 examples/aishell/{s0 => asr0}/local/export.sh | 0 examples/aishell/{s0 => asr0}/local/test.sh | 0 examples/aishell/{s0 => asr0}/local/test_export.sh | 0 examples/aishell/{s0 => asr0}/local/test_hub.sh | 0 examples/aishell/{s0 => asr0}/local/train.sh | 0 examples/aishell/{s0 => asr0}/path.sh | 0 examples/aishell/{s0 => asr0}/run.sh | 0 examples/aishell/{s1 => asr1}/.gitignore | 0 examples/aishell/{s1 => asr1}/README.md | 0 examples/aishell/{s1 => asr1}/conf/augmentation.json | 0 examples/aishell/{s1 => asr1}/conf/chunk_conformer.yaml | 0 examples/aishell/{s1 => asr1}/conf/conformer.yaml | 0 examples/aishell/{s1 => asr1}/conf/preprocess.yaml | 0 examples/aishell/{s1 => asr1}/conf/transformer.yaml | 0 examples/aishell/{s1 => asr1}/local/aishell_train_lms.sh | 0 examples/aishell/{s1 => asr1}/local/align.sh | 0 examples/aishell/{s1 => asr1}/local/data.sh | 0 examples/aishell/{s1 => asr1}/local/export.sh | 0 examples/aishell/{s1 => asr1}/local/test.sh | 0 examples/aishell/{s1 => asr1}/local/test_hub.sh | 0 examples/aishell/{s1 => asr1}/local/tlg.sh | 0 
examples/aishell/{s1 => asr1}/local/train.sh | 0 examples/aishell/{s1 => asr1}/path.sh | 0 examples/aishell/{s1 => asr1}/run.sh | 0 examples/aishell/{s1 => asr1}/utils | 0 examples/librispeech/README.md | 7 ++++--- examples/librispeech/{s0 => asr0}/README.md | 0 examples/librispeech/{s0 => asr0}/conf/augmentation.json | 0 examples/librispeech/{s0 => asr0}/conf/deepspeech2.yaml | 0 .../librispeech/{s0 => asr0}/conf/deepspeech2_online.yaml | 0 examples/librispeech/{s0 => asr0}/local/data.sh | 0 examples/librispeech/{s0 => asr0}/local/download_lm_en.sh | 0 examples/librispeech/{s0 => asr0}/local/export.sh | 0 examples/librispeech/{s0 => asr0}/local/test.sh | 0 examples/librispeech/{s0 => asr0}/local/test_hub.sh | 0 examples/librispeech/{s0 => asr0}/local/train.sh | 0 examples/librispeech/{s0 => asr0}/path.sh | 0 examples/librispeech/{s0 => asr0}/run.sh | 0 examples/librispeech/{s1 => asr1}/.gitignore | 0 examples/librispeech/{s1 => asr1}/README.md | 0 examples/librispeech/{s1 => asr1}/cmd.sh | 0 examples/librispeech/{s1 => asr1}/conf/augmentation.json | 0 .../librispeech/{s1 => asr1}/conf/chunk_conformer.yaml | 0 .../librispeech/{s1 => asr1}/conf/chunk_transformer.yaml | 0 examples/librispeech/{s1 => asr1}/conf/conformer.yaml | 0 examples/librispeech/{s1 => asr1}/conf/preprocess.yaml | 0 examples/librispeech/{s1 => asr1}/conf/transformer.yaml | 0 examples/librispeech/{s1 => asr1}/local/align.sh | 0 examples/librispeech/{s1 => asr1}/local/data.sh | 0 examples/librispeech/{s1 => asr1}/local/download_lm_en.sh | 0 examples/librispeech/{s1 => asr1}/local/export.sh | 0 examples/librispeech/{s1 => asr1}/local/test.sh | 0 examples/librispeech/{s1 => asr1}/local/test_hub.sh | 0 examples/librispeech/{s1 => asr1}/local/train.sh | 0 examples/librispeech/{s1 => asr1}/path.sh | 0 examples/librispeech/{s1 => asr1}/run.sh | 0 examples/librispeech/{s1 => asr1}/utils | 0 examples/librispeech/{s2 => asr2}/.gitignore | 0 examples/librispeech/{s2 => asr2}/README.md | 0 examples/librispeech/{s2 => asr2}/cmd.sh | 0 examples/librispeech/{s2 => asr2}/conf/augmentation.json | 0 examples/librispeech/{s2 => asr2}/conf/decode/decode.yaml | 0 .../librispeech/{s2 => asr2}/conf/decode/decode_att.yaml | 0 .../librispeech/{s2 => asr2}/conf/decode/decode_ctc.yaml | 0 .../librispeech/{s2 => asr2}/conf/decode/decode_wo_lm.yaml | 0 examples/librispeech/{s2 => asr2}/conf/fbank.conf | 0 examples/librispeech/{s2 => asr2}/conf/lm/transformer.yaml | 0 examples/librispeech/{s2 => asr2}/conf/pitch.conf | 0 examples/librispeech/{s2 => asr2}/conf/transformer.yaml | 0 examples/librispeech/{s2 => asr2}/local/align.sh | 0 examples/librispeech/{s2 => asr2}/local/cacu_perplexity.sh | 0 examples/librispeech/{s2 => asr2}/local/data.sh | 0 examples/librispeech/{s2 => asr2}/local/data_prep.sh | 0 examples/librispeech/{s2 => asr2}/local/download_lm_en.sh | 0 .../{s2 => asr2}/local/espnet_json_to_manifest.py | 0 examples/librispeech/{s2 => asr2}/local/export.sh | 0 examples/librispeech/{s2 => asr2}/local/recog.sh | 0 examples/librispeech/{s2 => asr2}/local/test.sh | 0 examples/librispeech/{s2 => asr2}/local/train.sh | 0 examples/librispeech/{s2 => asr2}/path.sh | 0 examples/librispeech/{s2 => asr2}/run.sh | 0 examples/librispeech/{s2 => asr2}/steps | 0 examples/librispeech/{s2 => asr2}/utils | 0 examples/ted_en_zh/README.md | 2 +- examples/ted_en_zh/{t0 => st0}/.gitignore | 0 examples/ted_en_zh/{t0 => st0}/README.md | 0 examples/ted_en_zh/{t0 => st0}/conf/transformer.yaml | 0 .../ted_en_zh/{t0 => st0}/conf/transformer_joint_noam.yaml | 0 
examples/ted_en_zh/{t0 => st0}/local/data.sh | 0 examples/ted_en_zh/{t0 => st0}/local/test.sh | 0 examples/ted_en_zh/{t0 => st0}/local/train.sh | 0 examples/ted_en_zh/{t0 => st0}/path.sh | 0 examples/ted_en_zh/{t0 => st0}/run.sh | 0 examples/timit/README.md | 6 +++++- examples/tiny/README.md | 5 +++-- examples/tiny/{s0 => asr0}/.gitignore | 0 examples/tiny/{s0 => asr0}/README.md | 0 examples/tiny/{s0 => asr0}/conf/augmentation.json | 0 examples/tiny/{s0 => asr0}/conf/deepspeech2.yaml | 0 examples/tiny/{s0 => asr0}/conf/deepspeech2_online.yaml | 0 examples/tiny/{s0 => asr0}/local/data.sh | 0 examples/tiny/{s0 => asr0}/local/download_lm_en.sh | 0 examples/tiny/{s0 => asr0}/local/export.sh | 0 examples/tiny/{s0 => asr0}/local/test.sh | 0 examples/tiny/{s0 => asr0}/local/train.sh | 0 examples/tiny/{s0 => asr0}/path.sh | 0 examples/tiny/{s0 => asr0}/run.sh | 0 examples/tiny/{s1 => asr1}/.gitignore | 0 examples/tiny/{s1 => asr1}/conf/augmentation.json | 0 examples/tiny/{s1 => asr1}/conf/chunk_confermer.yaml | 0 examples/tiny/{s1 => asr1}/conf/chunk_transformer.yaml | 0 examples/tiny/{s1 => asr1}/conf/conformer.yaml | 0 examples/tiny/{s1 => asr1}/conf/preprocess.yaml | 0 examples/tiny/{s1 => asr1}/conf/transformer.yaml | 0 examples/tiny/{s1 => asr1}/local/align.sh | 0 examples/tiny/{s1 => asr1}/local/data.sh | 0 examples/tiny/{s1 => asr1}/local/export.sh | 0 examples/tiny/{s1 => asr1}/local/test.sh | 0 examples/tiny/{s1 => asr1}/local/train.sh | 0 examples/tiny/{s1 => asr1}/path.sh | 0 examples/tiny/{s1 => asr1}/run.sh | 0 examples/wenetspeech/README.md | 6 ++++-- 130 files changed, 21 insertions(+), 11 deletions(-) rename examples/aishell/{s0 => asr0}/.gitignore (100%) rename examples/aishell/{s0 => asr0}/README.md (100%) rename examples/aishell/{s0 => asr0}/conf/augmentation.json (100%) rename examples/aishell/{s0 => asr0}/conf/deepspeech2.yaml (100%) rename examples/aishell/{s0 => asr0}/conf/deepspeech2_online.yaml (100%) rename examples/aishell/{s0 => asr0}/local/data.sh (100%) rename examples/aishell/{s0 => asr0}/local/download_lm_ch.sh (100%) rename examples/aishell/{s0 => asr0}/local/export.sh (100%) rename examples/aishell/{s0 => asr0}/local/test.sh (100%) rename examples/aishell/{s0 => asr0}/local/test_export.sh (100%) rename examples/aishell/{s0 => asr0}/local/test_hub.sh (100%) rename examples/aishell/{s0 => asr0}/local/train.sh (100%) rename examples/aishell/{s0 => asr0}/path.sh (100%) rename examples/aishell/{s0 => asr0}/run.sh (100%) rename examples/aishell/{s1 => asr1}/.gitignore (100%) rename examples/aishell/{s1 => asr1}/README.md (100%) rename examples/aishell/{s1 => asr1}/conf/augmentation.json (100%) rename examples/aishell/{s1 => asr1}/conf/chunk_conformer.yaml (100%) rename examples/aishell/{s1 => asr1}/conf/conformer.yaml (100%) rename examples/aishell/{s1 => asr1}/conf/preprocess.yaml (100%) rename examples/aishell/{s1 => asr1}/conf/transformer.yaml (100%) rename examples/aishell/{s1 => asr1}/local/aishell_train_lms.sh (100%) rename examples/aishell/{s1 => asr1}/local/align.sh (100%) rename examples/aishell/{s1 => asr1}/local/data.sh (100%) rename examples/aishell/{s1 => asr1}/local/export.sh (100%) rename examples/aishell/{s1 => asr1}/local/test.sh (100%) rename examples/aishell/{s1 => asr1}/local/test_hub.sh (100%) rename examples/aishell/{s1 => asr1}/local/tlg.sh (100%) rename examples/aishell/{s1 => asr1}/local/train.sh (100%) rename examples/aishell/{s1 => asr1}/path.sh (100%) rename examples/aishell/{s1 => asr1}/run.sh (100%) rename examples/aishell/{s1 => 
asr1}/utils (100%) rename examples/librispeech/{s0 => asr0}/README.md (100%) rename examples/librispeech/{s0 => asr0}/conf/augmentation.json (100%) rename examples/librispeech/{s0 => asr0}/conf/deepspeech2.yaml (100%) rename examples/librispeech/{s0 => asr0}/conf/deepspeech2_online.yaml (100%) rename examples/librispeech/{s0 => asr0}/local/data.sh (100%) rename examples/librispeech/{s0 => asr0}/local/download_lm_en.sh (100%) rename examples/librispeech/{s0 => asr0}/local/export.sh (100%) rename examples/librispeech/{s0 => asr0}/local/test.sh (100%) rename examples/librispeech/{s0 => asr0}/local/test_hub.sh (100%) rename examples/librispeech/{s0 => asr0}/local/train.sh (100%) rename examples/librispeech/{s0 => asr0}/path.sh (100%) rename examples/librispeech/{s0 => asr0}/run.sh (100%) rename examples/librispeech/{s1 => asr1}/.gitignore (100%) rename examples/librispeech/{s1 => asr1}/README.md (100%) rename examples/librispeech/{s1 => asr1}/cmd.sh (100%) rename examples/librispeech/{s1 => asr1}/conf/augmentation.json (100%) rename examples/librispeech/{s1 => asr1}/conf/chunk_conformer.yaml (100%) rename examples/librispeech/{s1 => asr1}/conf/chunk_transformer.yaml (100%) rename examples/librispeech/{s1 => asr1}/conf/conformer.yaml (100%) rename examples/librispeech/{s1 => asr1}/conf/preprocess.yaml (100%) rename examples/librispeech/{s1 => asr1}/conf/transformer.yaml (100%) rename examples/librispeech/{s1 => asr1}/local/align.sh (100%) rename examples/librispeech/{s1 => asr1}/local/data.sh (100%) rename examples/librispeech/{s1 => asr1}/local/download_lm_en.sh (100%) rename examples/librispeech/{s1 => asr1}/local/export.sh (100%) rename examples/librispeech/{s1 => asr1}/local/test.sh (100%) rename examples/librispeech/{s1 => asr1}/local/test_hub.sh (100%) rename examples/librispeech/{s1 => asr1}/local/train.sh (100%) rename examples/librispeech/{s1 => asr1}/path.sh (100%) rename examples/librispeech/{s1 => asr1}/run.sh (100%) rename examples/librispeech/{s1 => asr1}/utils (100%) rename examples/librispeech/{s2 => asr2}/.gitignore (100%) rename examples/librispeech/{s2 => asr2}/README.md (100%) rename examples/librispeech/{s2 => asr2}/cmd.sh (100%) rename examples/librispeech/{s2 => asr2}/conf/augmentation.json (100%) rename examples/librispeech/{s2 => asr2}/conf/decode/decode.yaml (100%) rename examples/librispeech/{s2 => asr2}/conf/decode/decode_att.yaml (100%) rename examples/librispeech/{s2 => asr2}/conf/decode/decode_ctc.yaml (100%) rename examples/librispeech/{s2 => asr2}/conf/decode/decode_wo_lm.yaml (100%) rename examples/librispeech/{s2 => asr2}/conf/fbank.conf (100%) rename examples/librispeech/{s2 => asr2}/conf/lm/transformer.yaml (100%) rename examples/librispeech/{s2 => asr2}/conf/pitch.conf (100%) rename examples/librispeech/{s2 => asr2}/conf/transformer.yaml (100%) rename examples/librispeech/{s2 => asr2}/local/align.sh (100%) rename examples/librispeech/{s2 => asr2}/local/cacu_perplexity.sh (100%) rename examples/librispeech/{s2 => asr2}/local/data.sh (100%) rename examples/librispeech/{s2 => asr2}/local/data_prep.sh (100%) rename examples/librispeech/{s2 => asr2}/local/download_lm_en.sh (100%) rename examples/librispeech/{s2 => asr2}/local/espnet_json_to_manifest.py (100%) rename examples/librispeech/{s2 => asr2}/local/export.sh (100%) rename examples/librispeech/{s2 => asr2}/local/recog.sh (100%) rename examples/librispeech/{s2 => asr2}/local/test.sh (100%) rename examples/librispeech/{s2 => asr2}/local/train.sh (100%) rename examples/librispeech/{s2 => asr2}/path.sh (100%) 
rename examples/librispeech/{s2 => asr2}/run.sh (100%) rename examples/librispeech/{s2 => asr2}/steps (100%) rename examples/librispeech/{s2 => asr2}/utils (100%) rename examples/ted_en_zh/{t0 => st0}/.gitignore (100%) rename examples/ted_en_zh/{t0 => st0}/README.md (100%) rename examples/ted_en_zh/{t0 => st0}/conf/transformer.yaml (100%) rename examples/ted_en_zh/{t0 => st0}/conf/transformer_joint_noam.yaml (100%) rename examples/ted_en_zh/{t0 => st0}/local/data.sh (100%) rename examples/ted_en_zh/{t0 => st0}/local/test.sh (100%) rename examples/ted_en_zh/{t0 => st0}/local/train.sh (100%) rename examples/ted_en_zh/{t0 => st0}/path.sh (100%) rename examples/ted_en_zh/{t0 => st0}/run.sh (100%) rename examples/tiny/{s0 => asr0}/.gitignore (100%) rename examples/tiny/{s0 => asr0}/README.md (100%) rename examples/tiny/{s0 => asr0}/conf/augmentation.json (100%) rename examples/tiny/{s0 => asr0}/conf/deepspeech2.yaml (100%) rename examples/tiny/{s0 => asr0}/conf/deepspeech2_online.yaml (100%) rename examples/tiny/{s0 => asr0}/local/data.sh (100%) rename examples/tiny/{s0 => asr0}/local/download_lm_en.sh (100%) rename examples/tiny/{s0 => asr0}/local/export.sh (100%) rename examples/tiny/{s0 => asr0}/local/test.sh (100%) rename examples/tiny/{s0 => asr0}/local/train.sh (100%) rename examples/tiny/{s0 => asr0}/path.sh (100%) rename examples/tiny/{s0 => asr0}/run.sh (100%) rename examples/tiny/{s1 => asr1}/.gitignore (100%) rename examples/tiny/{s1 => asr1}/conf/augmentation.json (100%) rename examples/tiny/{s1 => asr1}/conf/chunk_confermer.yaml (100%) rename examples/tiny/{s1 => asr1}/conf/chunk_transformer.yaml (100%) rename examples/tiny/{s1 => asr1}/conf/conformer.yaml (100%) rename examples/tiny/{s1 => asr1}/conf/preprocess.yaml (100%) rename examples/tiny/{s1 => asr1}/conf/transformer.yaml (100%) rename examples/tiny/{s1 => asr1}/local/align.sh (100%) rename examples/tiny/{s1 => asr1}/local/data.sh (100%) rename examples/tiny/{s1 => asr1}/local/export.sh (100%) rename examples/tiny/{s1 => asr1}/local/test.sh (100%) rename examples/tiny/{s1 => asr1}/local/train.sh (100%) rename examples/tiny/{s1 => asr1}/path.sh (100%) rename examples/tiny/{s1 => asr1}/run.sh (100%) diff --git a/examples/aishell/README.md b/examples/aishell/README.md index 82ef91da..a9bba074 100644 --- a/examples/aishell/README.md +++ b/examples/aishell/README.md @@ -1,7 +1,9 @@ # ASR -* s0 for deepspeech2 -* s1 for u2/transformer/conformer +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + ## Data diff --git a/examples/aishell/s0/.gitignore b/examples/aishell/asr0/.gitignore similarity index 100% rename from examples/aishell/s0/.gitignore rename to examples/aishell/asr0/.gitignore diff --git a/examples/aishell/s0/README.md b/examples/aishell/asr0/README.md similarity index 100% rename from examples/aishell/s0/README.md rename to examples/aishell/asr0/README.md diff --git a/examples/aishell/s0/conf/augmentation.json b/examples/aishell/asr0/conf/augmentation.json similarity index 100% rename from examples/aishell/s0/conf/augmentation.json rename to examples/aishell/asr0/conf/augmentation.json diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/aishell/s0/conf/deepspeech2.yaml rename to examples/aishell/asr0/conf/deepspeech2.yaml diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml 
b/examples/aishell/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/aishell/s0/conf/deepspeech2_online.yaml rename to examples/aishell/asr0/conf/deepspeech2_online.yaml diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/asr0/local/data.sh similarity index 100% rename from examples/aishell/s0/local/data.sh rename to examples/aishell/asr0/local/data.sh diff --git a/examples/aishell/s0/local/download_lm_ch.sh b/examples/aishell/asr0/local/download_lm_ch.sh similarity index 100% rename from examples/aishell/s0/local/download_lm_ch.sh rename to examples/aishell/asr0/local/download_lm_ch.sh diff --git a/examples/aishell/s0/local/export.sh b/examples/aishell/asr0/local/export.sh similarity index 100% rename from examples/aishell/s0/local/export.sh rename to examples/aishell/asr0/local/export.sh diff --git a/examples/aishell/s0/local/test.sh b/examples/aishell/asr0/local/test.sh similarity index 100% rename from examples/aishell/s0/local/test.sh rename to examples/aishell/asr0/local/test.sh diff --git a/examples/aishell/s0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh similarity index 100% rename from examples/aishell/s0/local/test_export.sh rename to examples/aishell/asr0/local/test_export.sh diff --git a/examples/aishell/s0/local/test_hub.sh b/examples/aishell/asr0/local/test_hub.sh similarity index 100% rename from examples/aishell/s0/local/test_hub.sh rename to examples/aishell/asr0/local/test_hub.sh diff --git a/examples/aishell/s0/local/train.sh b/examples/aishell/asr0/local/train.sh similarity index 100% rename from examples/aishell/s0/local/train.sh rename to examples/aishell/asr0/local/train.sh diff --git a/examples/aishell/s0/path.sh b/examples/aishell/asr0/path.sh similarity index 100% rename from examples/aishell/s0/path.sh rename to examples/aishell/asr0/path.sh diff --git a/examples/aishell/s0/run.sh b/examples/aishell/asr0/run.sh similarity index 100% rename from examples/aishell/s0/run.sh rename to examples/aishell/asr0/run.sh diff --git a/examples/aishell/s1/.gitignore b/examples/aishell/asr1/.gitignore similarity index 100% rename from examples/aishell/s1/.gitignore rename to examples/aishell/asr1/.gitignore diff --git a/examples/aishell/s1/README.md b/examples/aishell/asr1/README.md similarity index 100% rename from examples/aishell/s1/README.md rename to examples/aishell/asr1/README.md diff --git a/examples/aishell/s1/conf/augmentation.json b/examples/aishell/asr1/conf/augmentation.json similarity index 100% rename from examples/aishell/s1/conf/augmentation.json rename to examples/aishell/asr1/conf/augmentation.json diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml similarity index 100% rename from examples/aishell/s1/conf/chunk_conformer.yaml rename to examples/aishell/asr1/conf/chunk_conformer.yaml diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml similarity index 100% rename from examples/aishell/s1/conf/conformer.yaml rename to examples/aishell/asr1/conf/conformer.yaml diff --git a/examples/aishell/s1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml similarity index 100% rename from examples/aishell/s1/conf/preprocess.yaml rename to examples/aishell/asr1/conf/preprocess.yaml diff --git a/examples/aishell/s1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml similarity index 100% rename from examples/aishell/s1/conf/transformer.yaml rename to 
examples/aishell/asr1/conf/transformer.yaml diff --git a/examples/aishell/s1/local/aishell_train_lms.sh b/examples/aishell/asr1/local/aishell_train_lms.sh similarity index 100% rename from examples/aishell/s1/local/aishell_train_lms.sh rename to examples/aishell/asr1/local/aishell_train_lms.sh diff --git a/examples/aishell/s1/local/align.sh b/examples/aishell/asr1/local/align.sh similarity index 100% rename from examples/aishell/s1/local/align.sh rename to examples/aishell/asr1/local/align.sh diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/asr1/local/data.sh similarity index 100% rename from examples/aishell/s1/local/data.sh rename to examples/aishell/asr1/local/data.sh diff --git a/examples/aishell/s1/local/export.sh b/examples/aishell/asr1/local/export.sh similarity index 100% rename from examples/aishell/s1/local/export.sh rename to examples/aishell/asr1/local/export.sh diff --git a/examples/aishell/s1/local/test.sh b/examples/aishell/asr1/local/test.sh similarity index 100% rename from examples/aishell/s1/local/test.sh rename to examples/aishell/asr1/local/test.sh diff --git a/examples/aishell/s1/local/test_hub.sh b/examples/aishell/asr1/local/test_hub.sh similarity index 100% rename from examples/aishell/s1/local/test_hub.sh rename to examples/aishell/asr1/local/test_hub.sh diff --git a/examples/aishell/s1/local/tlg.sh b/examples/aishell/asr1/local/tlg.sh similarity index 100% rename from examples/aishell/s1/local/tlg.sh rename to examples/aishell/asr1/local/tlg.sh diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/asr1/local/train.sh similarity index 100% rename from examples/aishell/s1/local/train.sh rename to examples/aishell/asr1/local/train.sh diff --git a/examples/aishell/s1/path.sh b/examples/aishell/asr1/path.sh similarity index 100% rename from examples/aishell/s1/path.sh rename to examples/aishell/asr1/path.sh diff --git a/examples/aishell/s1/run.sh b/examples/aishell/asr1/run.sh similarity index 100% rename from examples/aishell/s1/run.sh rename to examples/aishell/asr1/run.sh diff --git a/examples/aishell/s1/utils b/examples/aishell/asr1/utils similarity index 100% rename from examples/aishell/s1/utils rename to examples/aishell/asr1/utils diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md index 5943cf1d..74441fd0 100644 --- a/examples/librispeech/README.md +++ b/examples/librispeech/README.md @@ -1,8 +1,9 @@ # ASR -* s0 is for deepspeech2 offline -* s1 is for transformer/conformer/U2 -* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + ## Data | Data Subset | Duration in Seconds | diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/asr0/README.md similarity index 100% rename from examples/librispeech/s0/README.md rename to examples/librispeech/asr0/README.md diff --git a/examples/librispeech/s0/conf/augmentation.json b/examples/librispeech/asr0/conf/augmentation.json similarity index 100% rename from examples/librispeech/s0/conf/augmentation.json rename to examples/librispeech/asr0/conf/augmentation.json diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/librispeech/s0/conf/deepspeech2.yaml rename to examples/librispeech/asr0/conf/deepspeech2.yaml diff --git 
a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/librispeech/s0/conf/deepspeech2_online.yaml rename to examples/librispeech/asr0/conf/deepspeech2_online.yaml diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/asr0/local/data.sh similarity index 100% rename from examples/librispeech/s0/local/data.sh rename to examples/librispeech/asr0/local/data.sh diff --git a/examples/librispeech/s0/local/download_lm_en.sh b/examples/librispeech/asr0/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s0/local/download_lm_en.sh rename to examples/librispeech/asr0/local/download_lm_en.sh diff --git a/examples/librispeech/s0/local/export.sh b/examples/librispeech/asr0/local/export.sh similarity index 100% rename from examples/librispeech/s0/local/export.sh rename to examples/librispeech/asr0/local/export.sh diff --git a/examples/librispeech/s0/local/test.sh b/examples/librispeech/asr0/local/test.sh similarity index 100% rename from examples/librispeech/s0/local/test.sh rename to examples/librispeech/asr0/local/test.sh diff --git a/examples/librispeech/s0/local/test_hub.sh b/examples/librispeech/asr0/local/test_hub.sh similarity index 100% rename from examples/librispeech/s0/local/test_hub.sh rename to examples/librispeech/asr0/local/test_hub.sh diff --git a/examples/librispeech/s0/local/train.sh b/examples/librispeech/asr0/local/train.sh similarity index 100% rename from examples/librispeech/s0/local/train.sh rename to examples/librispeech/asr0/local/train.sh diff --git a/examples/librispeech/s0/path.sh b/examples/librispeech/asr0/path.sh similarity index 100% rename from examples/librispeech/s0/path.sh rename to examples/librispeech/asr0/path.sh diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/asr0/run.sh similarity index 100% rename from examples/librispeech/s0/run.sh rename to examples/librispeech/asr0/run.sh diff --git a/examples/librispeech/s1/.gitignore b/examples/librispeech/asr1/.gitignore similarity index 100% rename from examples/librispeech/s1/.gitignore rename to examples/librispeech/asr1/.gitignore diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/asr1/README.md similarity index 100% rename from examples/librispeech/s1/README.md rename to examples/librispeech/asr1/README.md diff --git a/examples/librispeech/s1/cmd.sh b/examples/librispeech/asr1/cmd.sh similarity index 100% rename from examples/librispeech/s1/cmd.sh rename to examples/librispeech/asr1/cmd.sh diff --git a/examples/librispeech/s1/conf/augmentation.json b/examples/librispeech/asr1/conf/augmentation.json similarity index 100% rename from examples/librispeech/s1/conf/augmentation.json rename to examples/librispeech/asr1/conf/augmentation.json diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml similarity index 100% rename from examples/librispeech/s1/conf/chunk_conformer.yaml rename to examples/librispeech/asr1/conf/chunk_conformer.yaml diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml similarity index 100% rename from examples/librispeech/s1/conf/chunk_transformer.yaml rename to examples/librispeech/asr1/conf/chunk_transformer.yaml diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml similarity index 100% rename from 
examples/librispeech/s1/conf/conformer.yaml rename to examples/librispeech/asr1/conf/conformer.yaml diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/asr1/conf/preprocess.yaml similarity index 100% rename from examples/librispeech/s1/conf/preprocess.yaml rename to examples/librispeech/asr1/conf/preprocess.yaml diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml similarity index 100% rename from examples/librispeech/s1/conf/transformer.yaml rename to examples/librispeech/asr1/conf/transformer.yaml diff --git a/examples/librispeech/s1/local/align.sh b/examples/librispeech/asr1/local/align.sh similarity index 100% rename from examples/librispeech/s1/local/align.sh rename to examples/librispeech/asr1/local/align.sh diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/asr1/local/data.sh similarity index 100% rename from examples/librispeech/s1/local/data.sh rename to examples/librispeech/asr1/local/data.sh diff --git a/examples/librispeech/s1/local/download_lm_en.sh b/examples/librispeech/asr1/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s1/local/download_lm_en.sh rename to examples/librispeech/asr1/local/download_lm_en.sh diff --git a/examples/librispeech/s1/local/export.sh b/examples/librispeech/asr1/local/export.sh similarity index 100% rename from examples/librispeech/s1/local/export.sh rename to examples/librispeech/asr1/local/export.sh diff --git a/examples/librispeech/s1/local/test.sh b/examples/librispeech/asr1/local/test.sh similarity index 100% rename from examples/librispeech/s1/local/test.sh rename to examples/librispeech/asr1/local/test.sh diff --git a/examples/librispeech/s1/local/test_hub.sh b/examples/librispeech/asr1/local/test_hub.sh similarity index 100% rename from examples/librispeech/s1/local/test_hub.sh rename to examples/librispeech/asr1/local/test_hub.sh diff --git a/examples/librispeech/s1/local/train.sh b/examples/librispeech/asr1/local/train.sh similarity index 100% rename from examples/librispeech/s1/local/train.sh rename to examples/librispeech/asr1/local/train.sh diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/asr1/path.sh similarity index 100% rename from examples/librispeech/s1/path.sh rename to examples/librispeech/asr1/path.sh diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/asr1/run.sh similarity index 100% rename from examples/librispeech/s1/run.sh rename to examples/librispeech/asr1/run.sh diff --git a/examples/librispeech/s1/utils b/examples/librispeech/asr1/utils similarity index 100% rename from examples/librispeech/s1/utils rename to examples/librispeech/asr1/utils diff --git a/examples/librispeech/s2/.gitignore b/examples/librispeech/asr2/.gitignore similarity index 100% rename from examples/librispeech/s2/.gitignore rename to examples/librispeech/asr2/.gitignore diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/asr2/README.md similarity index 100% rename from examples/librispeech/s2/README.md rename to examples/librispeech/asr2/README.md diff --git a/examples/librispeech/s2/cmd.sh b/examples/librispeech/asr2/cmd.sh similarity index 100% rename from examples/librispeech/s2/cmd.sh rename to examples/librispeech/asr2/cmd.sh diff --git a/examples/librispeech/s2/conf/augmentation.json b/examples/librispeech/asr2/conf/augmentation.json similarity index 100% rename from examples/librispeech/s2/conf/augmentation.json rename to 
examples/librispeech/asr2/conf/augmentation.json diff --git a/examples/librispeech/s2/conf/decode/decode.yaml b/examples/librispeech/asr2/conf/decode/decode.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode.yaml rename to examples/librispeech/asr2/conf/decode/decode.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_att.yaml b/examples/librispeech/asr2/conf/decode/decode_att.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_att.yaml rename to examples/librispeech/asr2/conf/decode/decode_att.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_ctc.yaml b/examples/librispeech/asr2/conf/decode/decode_ctc.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_ctc.yaml rename to examples/librispeech/asr2/conf/decode/decode_ctc.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml b/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_wo_lm.yaml rename to examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml diff --git a/examples/librispeech/s2/conf/fbank.conf b/examples/librispeech/asr2/conf/fbank.conf similarity index 100% rename from examples/librispeech/s2/conf/fbank.conf rename to examples/librispeech/asr2/conf/fbank.conf diff --git a/examples/librispeech/s2/conf/lm/transformer.yaml b/examples/librispeech/asr2/conf/lm/transformer.yaml similarity index 100% rename from examples/librispeech/s2/conf/lm/transformer.yaml rename to examples/librispeech/asr2/conf/lm/transformer.yaml diff --git a/examples/librispeech/s2/conf/pitch.conf b/examples/librispeech/asr2/conf/pitch.conf similarity index 100% rename from examples/librispeech/s2/conf/pitch.conf rename to examples/librispeech/asr2/conf/pitch.conf diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml similarity index 100% rename from examples/librispeech/s2/conf/transformer.yaml rename to examples/librispeech/asr2/conf/transformer.yaml diff --git a/examples/librispeech/s2/local/align.sh b/examples/librispeech/asr2/local/align.sh similarity index 100% rename from examples/librispeech/s2/local/align.sh rename to examples/librispeech/asr2/local/align.sh diff --git a/examples/librispeech/s2/local/cacu_perplexity.sh b/examples/librispeech/asr2/local/cacu_perplexity.sh similarity index 100% rename from examples/librispeech/s2/local/cacu_perplexity.sh rename to examples/librispeech/asr2/local/cacu_perplexity.sh diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/asr2/local/data.sh similarity index 100% rename from examples/librispeech/s2/local/data.sh rename to examples/librispeech/asr2/local/data.sh diff --git a/examples/librispeech/s2/local/data_prep.sh b/examples/librispeech/asr2/local/data_prep.sh similarity index 100% rename from examples/librispeech/s2/local/data_prep.sh rename to examples/librispeech/asr2/local/data_prep.sh diff --git a/examples/librispeech/s2/local/download_lm_en.sh b/examples/librispeech/asr2/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s2/local/download_lm_en.sh rename to examples/librispeech/asr2/local/download_lm_en.sh diff --git a/examples/librispeech/s2/local/espnet_json_to_manifest.py b/examples/librispeech/asr2/local/espnet_json_to_manifest.py similarity index 100% rename from examples/librispeech/s2/local/espnet_json_to_manifest.py rename to examples/librispeech/asr2/local/espnet_json_to_manifest.py 
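Every stub in this run follows the same pattern: a whole recipe directory moves from a stage-style name (`s0`, `s1`, `s2`, `t0`) to a task-prefixed one (`asr0`, `asr1`, `asr2`, `st0`) with every file left byte-identical. A rename of that shape is easy to script; the patch records only the result, so the helper below is a hypothetical sketch (not part of the patch) of how the move could have been driven with `git mv`:

```bash
#!/usr/bin/env bash
# Hypothetical bulk-rename helper: map the old stage-style recipe names
# onto task-prefixed ones across examples/. Requires bash 4+ (assoc arrays).
set -e

declare -A name_map=( [s0]=asr0 [s1]=asr1 [s2]=asr2 [t0]=st0 )

for corpus in examples/*/; do
    for old in "${!name_map[@]}"; do
        src="${corpus}${old}"
        dst="${corpus}${name_map[$old]}"
        # only rename corpora that still use the old layout
        if [ -d "$src" ] && [ ! -e "$dst" ]; then
            git mv "$src" "$dst"
            echo "renamed ${src} -> ${dst}"
        fi
    done
done
```

Because no file content changes, git records each move as a pure rename, which is why the diffstat shows `| 0` changed lines per file and every stub here reports `similarity index 100%`.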
diff --git a/examples/librispeech/s2/local/export.sh b/examples/librispeech/asr2/local/export.sh similarity index 100% rename from examples/librispeech/s2/local/export.sh rename to examples/librispeech/asr2/local/export.sh diff --git a/examples/librispeech/s2/local/recog.sh b/examples/librispeech/asr2/local/recog.sh similarity index 100% rename from examples/librispeech/s2/local/recog.sh rename to examples/librispeech/asr2/local/recog.sh diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/asr2/local/test.sh similarity index 100% rename from examples/librispeech/s2/local/test.sh rename to examples/librispeech/asr2/local/test.sh diff --git a/examples/librispeech/s2/local/train.sh b/examples/librispeech/asr2/local/train.sh similarity index 100% rename from examples/librispeech/s2/local/train.sh rename to examples/librispeech/asr2/local/train.sh diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/asr2/path.sh similarity index 100% rename from examples/librispeech/s2/path.sh rename to examples/librispeech/asr2/path.sh diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/asr2/run.sh similarity index 100% rename from examples/librispeech/s2/run.sh rename to examples/librispeech/asr2/run.sh diff --git a/examples/librispeech/s2/steps b/examples/librispeech/asr2/steps similarity index 100% rename from examples/librispeech/s2/steps rename to examples/librispeech/asr2/steps diff --git a/examples/librispeech/s2/utils b/examples/librispeech/asr2/utils similarity index 100% rename from examples/librispeech/s2/utils rename to examples/librispeech/asr2/utils diff --git a/examples/ted_en_zh/README.md b/examples/ted_en_zh/README.md index 5664b06b..6d6886da 100644 --- a/examples/ted_en_zh/README.md +++ b/examples/ted_en_zh/README.md @@ -1,3 +1,3 @@ # TED En -> Zh -* t0 for u2 speech translation +* st0 - conformer/transformer speech translation diff --git a/examples/ted_en_zh/t0/.gitignore b/examples/ted_en_zh/st0/.gitignore similarity index 100% rename from examples/ted_en_zh/t0/.gitignore rename to examples/ted_en_zh/st0/.gitignore diff --git a/examples/ted_en_zh/t0/README.md b/examples/ted_en_zh/st0/README.md similarity index 100% rename from examples/ted_en_zh/t0/README.md rename to examples/ted_en_zh/st0/README.md diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml similarity index 100% rename from examples/ted_en_zh/t0/conf/transformer.yaml rename to examples/ted_en_zh/st0/conf/transformer.yaml diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml similarity index 100% rename from examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml rename to examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh similarity index 100% rename from examples/ted_en_zh/t0/local/data.sh rename to examples/ted_en_zh/st0/local/data.sh diff --git a/examples/ted_en_zh/t0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh similarity index 100% rename from examples/ted_en_zh/t0/local/test.sh rename to examples/ted_en_zh/st0/local/test.sh diff --git a/examples/ted_en_zh/t0/local/train.sh b/examples/ted_en_zh/st0/local/train.sh similarity index 100% rename from examples/ted_en_zh/t0/local/train.sh rename to examples/ted_en_zh/st0/local/train.sh diff --git a/examples/ted_en_zh/t0/path.sh b/examples/ted_en_zh/st0/path.sh similarity index 100% rename from 
examples/ted_en_zh/t0/path.sh rename to examples/ted_en_zh/st0/path.sh diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/st0/run.sh similarity index 100% rename from examples/ted_en_zh/t0/run.sh rename to examples/ted_en_zh/st0/run.sh diff --git a/examples/timit/README.md b/examples/timit/README.md index 87f1858f..77839874 100644 --- a/examples/timit/README.md +++ b/examples/timit/README.md @@ -1,3 +1,7 @@ # TIMIT -* asr1 - u2 model with phone unit +asr model with phone unit + +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature \ No newline at end of file diff --git a/examples/tiny/README.md b/examples/tiny/README.md index 6766f59a..f36baae6 100644 --- a/examples/tiny/README.md +++ b/examples/tiny/README.md @@ -1,2 +1,3 @@ -* s0 for deepspeech2 -* s1 for U2 +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature diff --git a/examples/tiny/s0/.gitignore b/examples/tiny/asr0/.gitignore similarity index 100% rename from examples/tiny/s0/.gitignore rename to examples/tiny/asr0/.gitignore diff --git a/examples/tiny/s0/README.md b/examples/tiny/asr0/README.md similarity index 100% rename from examples/tiny/s0/README.md rename to examples/tiny/asr0/README.md diff --git a/examples/tiny/s0/conf/augmentation.json b/examples/tiny/asr0/conf/augmentation.json similarity index 100% rename from examples/tiny/s0/conf/augmentation.json rename to examples/tiny/asr0/conf/augmentation.json diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/tiny/s0/conf/deepspeech2.yaml rename to examples/tiny/asr0/conf/deepspeech2.yaml diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/tiny/s0/conf/deepspeech2_online.yaml rename to examples/tiny/asr0/conf/deepspeech2_online.yaml diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/asr0/local/data.sh similarity index 100% rename from examples/tiny/s0/local/data.sh rename to examples/tiny/asr0/local/data.sh diff --git a/examples/tiny/s0/local/download_lm_en.sh b/examples/tiny/asr0/local/download_lm_en.sh similarity index 100% rename from examples/tiny/s0/local/download_lm_en.sh rename to examples/tiny/asr0/local/download_lm_en.sh diff --git a/examples/tiny/s0/local/export.sh b/examples/tiny/asr0/local/export.sh similarity index 100% rename from examples/tiny/s0/local/export.sh rename to examples/tiny/asr0/local/export.sh diff --git a/examples/tiny/s0/local/test.sh b/examples/tiny/asr0/local/test.sh similarity index 100% rename from examples/tiny/s0/local/test.sh rename to examples/tiny/asr0/local/test.sh diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/asr0/local/train.sh similarity index 100% rename from examples/tiny/s0/local/train.sh rename to examples/tiny/asr0/local/train.sh diff --git a/examples/tiny/s0/path.sh b/examples/tiny/asr0/path.sh similarity index 100% rename from examples/tiny/s0/path.sh rename to examples/tiny/asr0/path.sh diff --git a/examples/tiny/s0/run.sh b/examples/tiny/asr0/run.sh similarity index 100% rename from examples/tiny/s0/run.sh rename to examples/tiny/asr0/run.sh diff --git a/examples/tiny/s1/.gitignore b/examples/tiny/asr1/.gitignore similarity index 100% rename from 
examples/tiny/s1/.gitignore rename to examples/tiny/asr1/.gitignore diff --git a/examples/tiny/s1/conf/augmentation.json b/examples/tiny/asr1/conf/augmentation.json similarity index 100% rename from examples/tiny/s1/conf/augmentation.json rename to examples/tiny/asr1/conf/augmentation.json diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml similarity index 100% rename from examples/tiny/s1/conf/chunk_confermer.yaml rename to examples/tiny/asr1/conf/chunk_confermer.yaml diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml similarity index 100% rename from examples/tiny/s1/conf/chunk_transformer.yaml rename to examples/tiny/asr1/conf/chunk_transformer.yaml diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml similarity index 100% rename from examples/tiny/s1/conf/conformer.yaml rename to examples/tiny/asr1/conf/conformer.yaml diff --git a/examples/tiny/s1/conf/preprocess.yaml b/examples/tiny/asr1/conf/preprocess.yaml similarity index 100% rename from examples/tiny/s1/conf/preprocess.yaml rename to examples/tiny/asr1/conf/preprocess.yaml diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml similarity index 100% rename from examples/tiny/s1/conf/transformer.yaml rename to examples/tiny/asr1/conf/transformer.yaml diff --git a/examples/tiny/s1/local/align.sh b/examples/tiny/asr1/local/align.sh similarity index 100% rename from examples/tiny/s1/local/align.sh rename to examples/tiny/asr1/local/align.sh diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/asr1/local/data.sh similarity index 100% rename from examples/tiny/s1/local/data.sh rename to examples/tiny/asr1/local/data.sh diff --git a/examples/tiny/s1/local/export.sh b/examples/tiny/asr1/local/export.sh similarity index 100% rename from examples/tiny/s1/local/export.sh rename to examples/tiny/asr1/local/export.sh diff --git a/examples/tiny/s1/local/test.sh b/examples/tiny/asr1/local/test.sh similarity index 100% rename from examples/tiny/s1/local/test.sh rename to examples/tiny/asr1/local/test.sh diff --git a/examples/tiny/s1/local/train.sh b/examples/tiny/asr1/local/train.sh similarity index 100% rename from examples/tiny/s1/local/train.sh rename to examples/tiny/asr1/local/train.sh diff --git a/examples/tiny/s1/path.sh b/examples/tiny/asr1/path.sh similarity index 100% rename from examples/tiny/s1/path.sh rename to examples/tiny/asr1/path.sh diff --git a/examples/tiny/s1/run.sh b/examples/tiny/asr1/run.sh similarity index 100% rename from examples/tiny/s1/run.sh rename to examples/tiny/asr1/run.sh diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md index 3d93677f..0cb0f354 100644 --- a/examples/wenetspeech/README.md +++ b/examples/wenetspeech/README.md @@ -1,6 +1,8 @@ -# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech) +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature -* asr1 - u2 asr model +# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech) A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition From 2d808a3c648806ac1664dc77f6c3682e7c44dc3b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 23 Nov 2021 06:48:33 +0000 Subject: [PATCH 23/25] fix urls --- README.md | 4 ++-- demos/metaverse/run.sh | 4 ++-- demos/story_talker/run.sh | 4 ++-- demos/style_fs2/run.sh | 4 ++-- 
docs/source/released_model.md | 32 ++++++++++++++-------------- docs/source/tts/demo.rst | 28 ++++++++++++------------ examples/aishell3/tts3/README.md | 4 ++-- examples/aishell3/vc0/README.md | 2 +- examples/aishell3/vc1/README.md | 6 +++--- examples/aishell3/voc1/README.md | 2 +- examples/csmsc/tts2/README.md | 6 +++--- examples/csmsc/tts3/README.md | 6 +++--- examples/csmsc/voc1/README.md | 4 ++-- examples/csmsc/voc3/README.md | 8 +++---- examples/ljspeech/tts0/README.md | 4 ++-- examples/ljspeech/tts1/README.md | 4 ++-- examples/ljspeech/tts3/README.md | 4 ++-- examples/ljspeech/voc0/README.md | 2 +- examples/ljspeech/voc1/README.md | 2 +- examples/other/ge2e/README.md | 2 +- examples/vctk/tts3/README.md | 4 ++-- examples/vctk/voc1/README.md | 2 +- tests/chains/speedyspeech/prepare.sh | 4 ++-- 23 files changed, 71 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 2f9d9928..ec2d0f30 100644 --- a/README.md +++ b/README.md @@ -128,9 +128,9 @@ For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC: ```shell cd examples/csmsc/tts3 # download the pretrained models and unzip them -wget https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip pwg_baker_ckpt_0.4.zip -wget https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip fastspeech2_nosil_baker_ckpt_0.4.zip # source the environment source path.sh diff --git a/demos/metaverse/run.sh b/demos/metaverse/run.sh index ea7f683c..ba7d7980 100755 --- a/demos/metaverse/run.sh +++ b/demos/metaverse/run.sh @@ -25,9 +25,9 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/demos/story_talker/run.sh b/demos/story_talker/run.sh index 069ec12e..44259cd3 100755 --- a/demos/story_talker/run.sh +++ b/demos/story_talker/run.sh @@ -19,9 +19,9 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/demos/style_fs2/run.sh b/demos/style_fs2/run.sh index f035dd1b..6f6d6068 100755 --- a/demos/style_fs2/run.sh +++ b/demos/style_fs2/run.sh @@ -14,9 +14,9 @@ mkdir -p download if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # download pretrained tts models and unzip - wget -P
download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 78f5c92f..ca04f6a7 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,4 +1,3 @@ - # Released Models ## Speech-to-Text Models @@ -32,27 +31,28 @@ Language Model | Training Data | Token-based | Size | Descriptions ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: -Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)||| -TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)||| -SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB| -FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB| -FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| -FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| -FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| +Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| +TransformerTTS| LJSpeech|
[transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| +SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| +FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| +FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| +FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| +FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: -WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)||| -Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB| -Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)||| -Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)||| -Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)||| -|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | 
[mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB| +WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| +Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB| +Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)||| +Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)||| +Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)||| +|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB| ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: -GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip) -GE2E + Tacotron2| AISHELL-3 |[ge2e-tacotron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip) +GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip) +GE2E + Tacotron2| AISHELL-3 |[ge2e-tacotron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip) +GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst index f47c0892..4c2f86b1 100644 --- a/docs/source/tts/demo.rst +++ b/docs/source/tts/demo.rst @@ -52,7 +52,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -72,7 +72,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -91,7 +91,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -110,7 +110,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -129,7 +129,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -281,7 +281,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -300,7 +300,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -320,7 +320,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -341,7 +341,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -361,7 +361,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -381,7 +381,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -401,7 +401,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -421,7 +421,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -441,7 +441,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index fe4887b9..056f35ba 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -97,7 +97,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
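The hunks above and below all make one substitution: a flat `Parakeet/<archive>.zip` link becomes `Parakeet/released_models/<family>/<archive>.zip`. A change this regular could be applied mechanically; the sketch below is a hypothetical rewrite pass, with the prefix-to-family map inferred from the hunks in this patch rather than taken from any actual tooling (and the `sed` patterns treat the dots in the URL loosely):

```bash
#!/usr/bin/env bash
# Hypothetical URL-migration pass (illustrative only): move flat Parakeet/
# archive links into the released_models/<family>/ layout.
set -e

base='https://paddlespeech.bj.bcebos.com/Parakeet'

# archive-name prefix -> released_models subdirectory (inferred from hunks)
declare -A family=(
    [pwg_]=pwgan                 [fastspeech2_]=fastspeech2
    [speedyspeech_]=speedyspeech [mb_melgan_]=mb_melgan
    [tacotron2_]=tacotron2       [transformer_tts_]=transformer_tts
    [waveflow_]=waveflow         [ge2e_]=ge2e
)

# rewrite every file that still carries an old-style link
grep -rl "$base/" README.md demos docs examples tests | while read -r f; do
    for prefix in "${!family[@]}"; do
        sed -i "s|$base/${prefix}|$base/released_models/${family[$prefix]}/${prefix}|g" "$f"
    done
done
```

One hunk resists the pattern: the `ge2e_ckpt_0.3.zip` link in `examples/aishell3/vc1/README.md` lands on `bj.bcebos.com/paddlespeech/...` rather than `paddlespeech.bj.bcebos.com/...`, exactly the kind of stray a mechanical pass would either normalize or flag for review.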
-Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_aishell3_ckpt_0.5.zip ``` @@ -202,7 +202,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip) FastSpeech2 checkpoint contains files listed below. diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 2f1b37ee..756fbde6 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -86,4 +86,4 @@ In addition, in order to accelerate the convergence of the model, we add `guided CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} ``` ## Pretrained Model -[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip). +[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip). diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 834942fa..ae53443e 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -22,7 +22,7 @@ You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech ## Pretrained GE2E model We use pretrained GE2E model to generate speaker embedding for each sentence. -Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and `unzip` it. +Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. @@ -84,7 +84,7 @@ The training step is very similar to that one of [tts3](https://github.com/Paddl ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash unzip pwg_aishell3_ckpt_0.5.zip ``` @@ -115,7 +115,7 @@ ref_audio CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ``` ## Pretrained Model -[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) +[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. (There is no need for `speaker_id_map.txt` here) diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index d67af726..bc28bba1 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -132,7 +132,7 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip). +Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip). Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 61c4972b..5ebf3cf4 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -90,7 +90,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. ```bash unzip pwg_baker_ckpt_0.4.zip ``` @@ -208,9 +208,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained SpeedySpeech model with no silence in the edge of audios[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip). +Pretrained SpeedySpeech model with no silence in the edge of audios [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip). -Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip). +Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip). SpeedySpeech checkpoint contains files listed below. ```text diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 6570d33d..104964c8 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -88,7 +88,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. ```bash unzip pwg_baker_ckpt_0.4.zip ``` @@ -199,9 +199,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip). +Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip). -Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip). +Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index b9c8a465..86114a42 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -122,9 +122,9 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip). +Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip). -Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip). +Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip). Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index a72f60f1..4925b649 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -113,7 +113,7 @@ The length of mel-spectrograms should align with the length of wavs, so we shoul But since we are fine-tuning, we should use the statistics computed during training step. -You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it. +You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it. Assume the path to the dump-dir of training step is `dump`. Assume the path to the duration result of CSMSC is `durations.txt` (generated during training step's preprocessing). @@ -147,11 +147,11 @@ TODO: The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set). ## Pretrained Models -Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip). 
+Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip). -Finetuned model can ben downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_finetune_ckpt_0.5.zip). +Finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). -Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) +Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) Multi Band MelGAN checkpoint contains files listed below. diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index 09fd0c13..305add20 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -80,6 +80,6 @@ optional arguments: ## Pretrained Models Pretrained Models can be downloaded from links below. We provide 2 models with different configurations. -1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip) +1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip) -2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip) +2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative.zip) diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 12e43e2e..8a43ecd9 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -79,7 +79,7 @@ optional arguments: ## Synthesize We use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder. -Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it. +Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip) and unzip it. ```bash unzip waveflow_ljspeech_ckpt_0.3.zip ``` @@ -173,7 +173,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model can be downloaded here.
[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip) +Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip) TransformerTTS checkpoint contains files listed below. ```text diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index cda53541..5bdaf4b8 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -87,7 +87,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_ljspeech_ckpt_0.5.zip ``` @@ -191,7 +191,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md index 09856c36..0d4e6c51 100644 --- a/examples/ljspeech/voc0/README.md +++ b/examples/ljspeech/voc0/README.md @@ -48,4 +48,4 @@ Synthesize waveform. 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip). +Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip). diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 0506d5d8..24f6dbca 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -123,7 +123,7 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) +Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/other/ge2e/README.md b/examples/other/ge2e/README.md index d86c8c13..d58ca513 100644 --- a/examples/other/ge2e/README.md +++ b/examples/other/ge2e/README.md @@ -95,7 +95,7 @@ In `${BIN_DIR}/inference.py`: ## Pretrained Model The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps. 
-Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip). +Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip). ## References diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 334372f9..894d6b14 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -90,7 +90,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)and unzip it. +Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_vctk_ckpt_0.5.zip ``` @@ -196,7 +196,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 5063b869..8692f010 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -127,7 +127,7 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip). +Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip). Parallel WaveGAN checkpoint contains files listed below.
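Every README hunk above makes the same mechanical change: checkpoint archives moved under `released_models/<model_type>/` on the same host. A minimal sketch of the download-and-unzip pattern these READMEs describe, shown here for the AISHELL-3 Parallel WaveGAN archive from the hunks above (any of the updated URLs substitutes directly):

```bash
# Fetch one of the relocated checkpoint archives and unpack it.
# -nc (no-clobber) skips the download if the archive already exists locally.
wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip
unzip pwg_aishell3_ckpt_0.5.zip
```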
diff --git a/tests/chains/speedyspeech/prepare.sh b/tests/chains/speedyspeech/prepare.sh index fb6ef285..1ddcd677 100755 --- a/tests/chains/speedyspeech/prepare.sh +++ b/tests/chains/speedyspeech/prepare.sh @@ -32,7 +32,7 @@ trainer_list=$(func_parser_value "${lines[14]}") # MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer'] if [ ${MODE} = "lite_train_infer" ];then # pretrain lite train data - wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip (cd ./pretrain_models && unzip pwg_baker_ckpt_0.4.zip) # download data rm -rf ./train_data/mini_BZNSYP @@ -40,7 +40,7 @@ if [ ${MODE} = "lite_train_infer" ];then cd ./train_data/ && tar xzf mini_BZNSYP.tar.gz cd ../ elif [ ${MODE} = "whole_train_infer" ];then - wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip + wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip (cd ./pretrain_models && unzip speedyspeech_nosil_baker_ckpt_0.5.zip && unzip pwg_baker_ckpt_0.4.zip) rm -rf ./train_data/processed_BZNSYP From 4537e900efe16c94d41864c0aa9eef7e29717041 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 23 Nov 2021 15:02:01 +0800 Subject: [PATCH 24/25] Update README.md --- examples/aishell3/vc0/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 756fbde6..376d4a33 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -41,7 +41,7 @@ We use Montreal Forced Aligner 1.0. The label in aishell3 include pinyin,so th We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. -You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) in our repo.
```bash if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then From 8aebfeac812e4324ef3cd43ba99b7027f7e5eb6d Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 23 Nov 2021 07:44:55 +0000 Subject: [PATCH 25/25] fix the prc-commit --- examples/dataset/aishell/aishell.py | 2 +- examples/dataset/ted_en_zh/ted_en_zh.py | 3 +-- examples/dataset/thchs30/thchs30.py | 2 +- .../timit/timit_kaldi_standard_split.py | 2 +- examples/librispeech/asr1/README.md | 2 +- examples/timit/README.md | 2 +- examples/wenetspeech/README.md | 2 +- examples/wenetspeech/asr1/RESULTS.md | 2 +- .../wenetspeech/asr1/local/extract_meta.py | 25 +++++++++++++------ .../wenetspeech/asr1/local/process_opus.py | 22 +++++++++++----- paddlespeech/s2t/exps/deepspeech2/model.py | 14 +++++------ paddlespeech/s2t/exps/u2/model.py | 7 +----- 12 files changed, 50 insertions(+), 35 deletions(-) diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py index 95ed0408..7431fc08 100644 --- a/examples/dataset/aishell/aishell.py +++ b/examples/dataset/aishell/aishell.py @@ -82,7 +82,7 @@ def create_manifest(data_dir, manifest_path_prefix): # if no transcription for audio then skipped if audio_id not in transcript_dict: continue - + utt2spk = Path(audio_path).parent.name audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py index a8cbb837..9a3ba3b3 100644 --- a/examples/dataset/ted_en_zh/ted_en_zh.py +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -73,7 +73,6 @@ def create_manifest(data_dir, manifest_path_prefix): audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) - translation_str = " ".join(translation.split()) trancription_str = " ".join(trancription.split()) json_lines.append( @@ -82,7 +81,7 @@ def create_manifest(data_dir, manifest_path_prefix): 'utt': utt, 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': [translation_str, trancription_str], + 'text': [translation_str, trancription_str], }, ensure_ascii=False)) diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py index 2ec4ddab..cdfc0a75 100644 --- a/examples/dataset/thchs30/thchs30.py +++ b/examples/dataset/thchs30/thchs30.py @@ -124,7 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, - 'utt2spk', spk, + 'utt2spk': spk, 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': word_text, # charactor diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/examples/dataset/timit/timit_kaldi_standard_split.py index 26aa76c7..473fc856 100644 --- a/examples/dataset/timit/timit_kaldi_standard_split.py +++ b/examples/dataset/timit/timit_kaldi_standard_split.py @@ -22,9 +22,9 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile -from pathlib import Path parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( diff --git a/examples/librispeech/asr1/README.md b/examples/librispeech/asr1/README.md index 20255db8..73f0863e 100644 --- a/examples/librispeech/asr1/README.md +++ b/examples/librispeech/asr1/README.md @@ -24,4 +24,4 @@ | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098, | 0.049795 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098, | 0.054892 | | transformer | 32.52 M | 
conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098, | 0.054531 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098, | 0.042244 | \ No newline at end of file +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098, | 0.042244 | diff --git a/examples/timit/README.md b/examples/timit/README.md index 77839874..51fcfd57 100644 --- a/examples/timit/README.md +++ b/examples/timit/README.md @@ -4,4 +4,4 @@ asr model with phone unit * asr0 - deepspeech2 Streaming/Non-Streaming * asr1 - transformer/conformer Streaming/Non-Streaming -* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature \ No newline at end of file +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md index 0cb0f354..cbd01eb8 100644 --- a/examples/wenetspeech/README.md +++ b/examples/wenetspeech/README.md @@ -55,4 +55,4 @@ As shown in the following table, we provide 3 training subsets, namely `S`, `M` |-----------------|-------|--------------|-----------------------------------------------------------------------------------------| | DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training | | TEST\_NET | 23 | Internet | Match test | -| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset | \ No newline at end of file +| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset | diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index 5aff041f..5c2b8143 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -21,4 +21,4 @@ Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wen | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 | | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 | -| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | \ No newline at end of file +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py index 4de0b7d4..0e1b2727 100644 --- a/examples/wenetspeech/asr1/local/extract_meta.py +++ b/examples/wenetspeech/asr1/local/extract_meta.py @@ -1,6 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) # Mobvoi Inc(Author: Di Wu, Binbin Zhang) - # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,11 +24,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import sys -import os import argparse import json +import os +import sys def get_args(): @@ -85,13 +96,13 @@ def meta_analysis(input_json, output_dir): else: utt2text.write(f'{sid}\t{text}\n') segments.write( - f'{sid}\t{aid}\t{start_time}\t{end_time}\n' - ) + f'{sid}\t{aid}\t{start_time}\t{end_time}\n') utt2dur.write(f'{sid}\t{dur}\n') segment_sub_names = " ".join(segment_subsets) utt2subsets.write( f'{sid}\t{segment_sub_names}\n') + def main(): args = get_args() @@ -99,4 +110,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/examples/wenetspeech/asr1/local/process_opus.py b/examples/wenetspeech/asr1/local/process_opus.py index 603e0082..f1b9287e 100644 --- a/examples/wenetspeech/asr1/local/process_opus.py +++ b/examples/wenetspeech/asr1/local/process_opus.py @@ -1,5 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # Copyright 2021 NPU, ASLP Group (Author: Qijie Shao) - # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,14 +23,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- # process_opus.py: segmentation and downsampling of opus audio - # usage: python3 process_opus.py wav.scp segments output_wav.scp +import os +import sys from pydub import AudioSegment -import sys -import os def read_file(wav_scp, segments): @@ -86,4 +96,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 177d710b..e827414d 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -409,7 +409,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): @paddle.no_grad() def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: from paddlespeech.s2t.utils.log import Autolog self.autolog = Autolog( batch_size=self.config.decoding.batch_size, @@ -438,7 +438,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): msg += "Final error rate [%s] (%d/%d) = %f" % ( error_rate_type, num_ins, num_ins, errors_sum / len_refs) logger.info(msg) - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: self.autolog.report() def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): @@ -512,7 +512,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): x_len_list = np.split(x_len_batch, batch_size, axis=0) for x, x_len in zip(x_list, x_len_list): - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: self.autolog.times.start() x_len = x_len[0] assert (chunk_size <= x_len) @@ -547,7 +547,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): probs_chunk_list = [] probs_chunk_lens_list = [] - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: # record the model preprocessing time self.autolog.times.stamp() @@ -606,7 +606,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): [output_probs, output_probs_padding], axis=1) output_probs_list.append(output_probs) output_lens_list.append(output_lens) - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: # record the model inference time self.autolog.times.stamp() # record the post processing time @@ -641,12 +641,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): audio_len_handle.reshape(x_len.shape) audio_len_handle.copy_from_cpu(x_len) - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: self.autolog.times.start() # record the prefix processing time self.autolog.times.stamp() self.predictor.run() - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: # record the model inference time self.autolog.times.stamp() # record the post processing time diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 9f5448cc..27bc47d2 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -24,15 +24,10 @@ import jsonlines import numpy as np import paddle from paddle import distributed as dist -from paddle.io import DataLoader from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer import TextFeaturizer -from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataloader import BatchDataLoader -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.io.sampler import SortagradBatchSampler -from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler from paddlespeech.s2t.models.u2 
import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.reporter import ObsScope @@ -215,7 +210,7 @@ class U2Trainer(Trainer): msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}" msg += f" {k.split(',')[1]}" if len( - k.split(',')) == 2 else f"" + k.split(',')) == 2 else "" msg += "," msg = msg[:-1] # remove the last "," if (batch_index + 1
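The dataset-script hunks above (`aishell.py`, `ted_en_zh.py`, `thchs30.py`) emit JSON-lines manifests that now carry a `utt2spk` field alongside `utt`, `feat`, `feat_shape`, and `text`. A minimal sketch of how one such manifest record is built, assuming an AISHELL-style layout where the speaker ID is the audio file's parent directory; the helper function itself is hypothetical:

```python
import json
from pathlib import Path

import soundfile


def manifest_line(audio_path: str, text: str) -> str:
    """Hypothetical helper mirroring the manifest fields in the diffs above."""
    audio_data, samplerate = soundfile.read(audio_path)
    duration = float(len(audio_data)) / samplerate
    return json.dumps(
        {
            'utt': Path(audio_path).stem,
            # AISHELL layout: .../S0002/BAC009S0002W0122.wav -> speaker "S0002"
            'utt2spk': Path(audio_path).parent.name,
            'feat': audio_path,
            'feat_shape': (duration, ),  # second
            'text': text,
        },
        ensure_ascii=False)
```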