From bf3eb4981890a19415865257ee5b5f959bb1ff76 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Thu, 29 Sep 2022 09:47:05 +0800 Subject: [PATCH] add different tess feature config --- examples/tess/README.md | 34 +++++++++++++++++++ .../cls0/conf/panns_logmelspectrogram.yaml | 32 +++++++++++++++++ .../tess/cls0/conf/panns_melspectrogram.yaml | 32 +++++++++++++++++ examples/tess/cls0/conf/panns_mfcc.yaml | 33 ++++++++++++++++++ .../tess/cls0/conf/panns_spectrogram.yaml | 28 +++++++++++++++ examples/tess/cls0/local/train.py | 24 ++++++++----- 6 files changed, 174 insertions(+), 9 deletions(-) create mode 100644 examples/tess/README.md create mode 100644 examples/tess/cls0/conf/panns_logmelspectrogram.yaml create mode 100644 examples/tess/cls0/conf/panns_melspectrogram.yaml create mode 100644 examples/tess/cls0/conf/panns_mfcc.yaml create mode 100644 examples/tess/cls0/conf/panns_spectrogram.yaml diff --git a/examples/tess/README.md b/examples/tess/README.md new file mode 100644 index 000000000..f56ab8d40 --- /dev/null +++ b/examples/tess/README.md @@ -0,0 +1,34 @@ +# 背景 + +模型任务与模型简介请参见 examples/esc50, 本目录是为了校验和测试 paddle.audio 的feature, backend等相关模块而建立. + +## 数据集 + +[TESS: Toronto emotional speech set](https://tspace.library.utoronto.ca/handle/1807/24487) 是一个包含有 200 个目标词的时长为 2 ~ 3 秒的音频,七种情绪的数据集。由两个女演员录制(24岁和64岁),其中情绪分别是愤怒,恶心,害怕,高兴,惊喜,伤心,平淡. + +## 模型指标 + +根据 `TESS` 提供的fold信息,对数据集进行 5-fold 的 fine-tune 2 epoch 训练和评估,dev准确率如下: + +|Model|feat_type|Acc| +|--|--|--| +|CNN14| mfcc | 0.8304 | +|CNN14| logmelspectrogram | 0.9893 | +|CNN14| spectrogram| 0.1304 | +|CNN14| melspectrogram| 0.1339 | + +因为是功能验证,所以 config 中只训练了 2 个epoch. +logmelspectrogram feature 在迭代 3 个epoch后, acc可以达到 0.9983. +mfcc feature 在迭代 3 个epoch后, acc可以达到 0.9983. +spectrogram feature 在迭代 11 个epoch后, acc可达 0.95. +melspectrogram feature 在迭代 17 个epoch后, acc可到 0.9375.
+ +### 模型训练 + +启动训练: +```shell +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_mfcc.yaml +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_logmelspectrogram.yaml +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_melspectrogram.yaml +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_spectrogram.yaml +``` diff --git a/examples/tess/cls0/conf/panns_logmelspectrogram.yaml b/examples/tess/cls0/conf/panns_logmelspectrogram.yaml new file mode 100644 index 000000000..ba953c235 --- /dev/null +++ b/examples/tess/cls0/conf/panns_logmelspectrogram.yaml @@ -0,0 +1,32 @@ +data: + dataset: 'paddle.audio.datasets:TESS' + num_classes: 7 + train: + mode: 'train' + split: 1 + feat_type: 'logmelspectrogram' + dev: + mode: 'dev' + split: 1 + feat_type: 'logmelspectrogram' + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + n_fft: 1024 + hop_length: 320 + window: 'hann' + win_length: 1024 + f_min: 50.0 + f_max: 14000.0 + n_mels: 64 + +training: + epochs: 2 + learning_rate: 0.0005 + num_workers: 2 + batch_size: 128 + checkpoint_dir: './checkpoint_logmelspectrogram' + save_freq: 1 + log_freq: 1 diff --git a/examples/tess/cls0/conf/panns_melspectrogram.yaml b/examples/tess/cls0/conf/panns_melspectrogram.yaml new file mode 100644 index 000000000..a5d53d3d6 --- /dev/null +++ b/examples/tess/cls0/conf/panns_melspectrogram.yaml @@ -0,0 +1,32 @@ +data: + dataset: 'paddle.audio.datasets:TESS' + num_classes: 7 + train: + mode: 'train' + split: 1 + feat_type: 'melspectrogram' + dev: + mode: 'dev' + split: 1 + feat_type: 'melspectrogram' + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + n_fft: 1024 + hop_length: 320 + window: 'hann' + win_length: 1024 + f_min: 50.0 + f_max: 14000.0 + n_mels: 64 + +training: + epochs: 2 + learning_rate: 0.0005 + num_workers: 2 + batch_size: 128 + checkpoint_dir: './checkpoint_melspectrogram' + save_freq: 1 + log_freq: 1 diff --git a/examples/tess/cls0/conf/panns_mfcc.yaml b/examples/tess/cls0/conf/panns_mfcc.yaml new file mode 100644 index
000000000..08b1387d9 --- /dev/null +++ b/examples/tess/cls0/conf/panns_mfcc.yaml @@ -0,0 +1,33 @@ +data: + dataset: 'paddle.audio.datasets:TESS' + num_classes: 7 + train: + mode: 'train' + split: 1 + feat_type: 'mfcc' + dev: + mode: 'dev' + split: 1 + feat_type: 'mfcc' + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + n_fft: 1024 + hop_length: 320 + window: 'hann' + win_length: 1024 + f_min: 50.0 + f_max: 14000.0 + n_mfcc: 64 + n_mels: 64 + +training: + epochs: 2 + learning_rate: 0.0005 + num_workers: 2 + batch_size: 128 + checkpoint_dir: './checkpoint_mfcc' + save_freq: 1 + log_freq: 1 diff --git a/examples/tess/cls0/conf/panns_spectrogram.yaml b/examples/tess/cls0/conf/panns_spectrogram.yaml new file mode 100644 index 000000000..a4a6a7bc7 --- /dev/null +++ b/examples/tess/cls0/conf/panns_spectrogram.yaml @@ -0,0 +1,28 @@ +data: + dataset: 'paddle.audio.datasets:TESS' + num_classes: 7 + train: + mode: 'train' + split: 1 + feat_type: 'spectrogram' + dev: + mode: 'dev' + split: 1 + feat_type: 'spectrogram' + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + n_fft: 126 + hop_length: 320 + window: 'hann' + +training: + epochs: 2 + learning_rate: 0.0005 + num_workers: 2 + batch_size: 128 + checkpoint_dir: './checkpoint_spectrogram' + save_freq: 1 + log_freq: 1 diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py index c1f0e7e43..3e6062414 100644 --- a/examples/tess/cls0/local/train.py +++ b/examples/tess/cls0/local/train.py @@ -22,6 +22,7 @@ from paddleaudio.utils import Timer from paddlespeech.cls.models import SoundClassifier from paddlespeech.utils.dynamic_import import dynamic_import + # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--cfg_path", type=str, required=True) @@ -61,12 +62,17 @@ if __name__ == "__main__": model_conf = config['model'] data_conf = config['data'] feat_conf = config['feature'] + feat_type = data_conf['train']['feat_type'] training_conf = 
config['training'] # Dataset + + # set audio backend, make sure paddleaudio >= 1.0.2 installed. + paddle.audio.backends.set_backend('soundfile') + ds_class = dynamic_import(data_conf['dataset']) - train_ds = ds_class(**data_conf['train']) - dev_ds = ds_class(**data_conf['dev']) + train_ds = ds_class(**data_conf['train'], **feat_conf) + dev_ds = ds_class(**data_conf['dev'], **feat_conf) train_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=training_conf['batch_size'], @@ -101,7 +107,7 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - feats, labels, length = batch # feats(N, length, n_mels) + feats, labels, length = batch # feats-->(N, length, n_mels) logits = model(feats) @@ -129,7 +135,7 @@ if __name__ == "__main__": avg_loss /= training_conf['log_freq'] avg_acc = num_corrects / num_samples - print_msg = 'Epoch={}/{}, Step={}/{}'.format( + print_msg = feat_type + ' Epoch={}/{}, Step={}/{}'.format( epoch, training_conf['epochs'], batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) @@ -153,23 +159,23 @@ if __name__ == "__main__": dev_ds, batch_sampler=dev_sampler, num_workers=training_conf['num_workers'], - return_list=True, ) + return_list=True, + use_buffer_reader=True, + collate_fn=_collate_features) model.eval() num_corrects = 0 num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - waveforms, labels = batch - feats = feature_extractor(waveforms) - + feats, labels, length = batch logits = model(feats) preds = paddle.argmax(logits, axis=1) num_corrects += (preds == labels).numpy().sum() num_samples += feats.shape[0] - print_msg = '[Evaluation result]' + print_msg = '[Evaluation result] ' + str(feat_type) print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) logger.eval(print_msg)