From 5819770914e9c9865cc775d7773f25851d391f51 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Wed, 28 Sep 2022 00:01:52 +0800 Subject: [PATCH 1/4] test tess dataset&& esc50 dataset by paddle2.4 --- examples/esc50/cls0/conf/panns.yaml | 4 +- examples/tess/cls0/local/train.py | 184 +++++++++++++++++++++++++++ examples/tess/cls0/local/train.sh | 12 ++ examples/tess/cls0/path.sh | 13 ++ examples/tess/cls0/run.sh | 35 +++++ paddlespeech/cls/exps/panns/train.py | 4 +- 6 files changed, 248 insertions(+), 4 deletions(-) create mode 100644 examples/tess/cls0/local/train.py create mode 100755 examples/tess/cls0/local/train.sh create mode 100644 examples/tess/cls0/path.sh create mode 100755 examples/tess/cls0/run.sh diff --git a/examples/esc50/cls0/conf/panns.yaml b/examples/esc50/cls0/conf/panns.yaml index 3a9d42aa5..a0668b27f 100644 --- a/examples/esc50/cls0/conf/panns.yaml +++ b/examples/esc50/cls0/conf/panns.yaml @@ -1,5 +1,5 @@ data: - dataset: 'paddleaudio.datasets:ESC50' + dataset: 'paddle.audio.datasets:ESC50' num_classes: 50 train: mode: 'train' @@ -33,4 +33,4 @@ training: predicting: audio_file: '/audio/dog.wav' top_k: 10 - checkpoint: './checkpoint/epoch_50/model.pdparams' \ No newline at end of file + checkpoint: './checkpoint/epoch_50/model.pdparams' diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py new file mode 100644 index 000000000..c1f0e7e43 --- /dev/null +++ b/examples/tess/cls0/local/train.py @@ -0,0 +1,184 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os + +import paddle +import yaml + +from paddleaudio.utils import logger +from paddleaudio.utils import Timer +from paddlespeech.cls.models import SoundClassifier +from paddlespeech.utils.dynamic_import import dynamic_import + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--cfg_path", type=str, required=True) +args = parser.parse_args() +# yapf: enable + +def _collate_features(batch): + # (feat, label) + # (( n_mels, length), label) + feats = [] + labels = [] + lengths = [] + for sample in batch: + feats.append(paddle.transpose(sample[0], perm=[1,0])) + lengths.append(sample[0].shape[1]) + labels.append(sample[1]) + + max_length = max(lengths) + for i in range(len(feats)): + feats[i] = paddle.nn.functional.pad( + feats[i], [0, max_length - feats[i].shape[0], 0, 0], + data_format='NLC') + + return paddle.stack(feats), paddle.to_tensor( + labels), paddle.to_tensor(lengths) + +if __name__ == "__main__": + nranks = paddle.distributed.get_world_size() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() + local_rank = paddle.distributed.get_rank() + + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) + + model_conf = config['model'] + data_conf = config['data'] + feat_conf = config['feature'] + training_conf = config['training'] + + # Dataset + ds_class = dynamic_import(data_conf['dataset']) + train_ds = ds_class(**data_conf['train']) + dev_ds = ds_class(**data_conf['dev']) + train_sampler = paddle.io.DistributedBatchSampler( + train_ds, + batch_size=training_conf['batch_size'], + shuffle=True, + drop_last=False) + train_loader = paddle.io.DataLoader( + train_ds, + batch_sampler=train_sampler, + num_workers=training_conf['num_workers'], + return_list=True, + use_buffer_reader=True, + 
collate_fn=_collate_features) + + # Model + backbone_class = dynamic_import(model_conf['backbone']) + backbone = backbone_class(pretrained=True, extract_embedding=True) + model = SoundClassifier(backbone, num_class=data_conf['num_classes']) + model = paddle.DataParallel(model) + optimizer = paddle.optimizer.Adam( + learning_rate=training_conf['learning_rate'], + parameters=model.parameters()) + criterion = paddle.nn.loss.CrossEntropyLoss() + + steps_per_epoch = len(train_sampler) + timer = Timer(steps_per_epoch * training_conf['epochs']) + timer.start() + + for epoch in range(1, training_conf['epochs'] + 1): + model.train() + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + for batch_idx, batch in enumerate(train_loader): + feats, labels, length = batch # feats(N, length, n_mels) + + logits = model(feats) + + loss = criterion(logits, labels) + loss.backward() + optimizer.step() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + optimizer.clear_grad() + + # Calculate loss + avg_loss += loss.numpy()[0] + + # Calculate metrics + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + timer.count() + + if (batch_idx + 1 + ) % training_conf['log_freq'] == 0 and local_rank == 0: + lr = optimizer.get_lr() + avg_loss /= training_conf['log_freq'] + avg_acc = num_corrects / num_samples + + print_msg = 'Epoch={}/{}, Step={}/{}'.format( + epoch, training_conf['epochs'], batch_idx + 1, + steps_per_epoch) + print_msg += ' loss={:.4f}'.format(avg_loss) + print_msg += ' acc={:.4f}'.format(avg_acc) + print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( + lr, timer.timing, timer.eta) + logger.train(print_msg) + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + + if epoch % training_conf[ + 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: + dev_sampler = paddle.io.BatchSampler( + dev_ds, + 
batch_size=training_conf['batch_size'], + shuffle=False, + drop_last=False) + dev_loader = paddle.io.DataLoader( + dev_ds, + batch_sampler=dev_sampler, + num_workers=training_conf['num_workers'], + return_list=True, ) + + model.eval() + num_corrects = 0 + num_samples = 0 + with logger.processing('Evaluation on validation dataset'): + for batch_idx, batch in enumerate(dev_loader): + waveforms, labels = batch + feats = feature_extractor(waveforms) + + logits = model(feats) + + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + print_msg = '[Evaluation result]' + print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) + + logger.eval(print_msg) + + # Save model + save_dir = os.path.join(training_conf['checkpoint_dir'], + 'epoch_{}'.format(epoch)) + logger.info('Saving model checkpoint to {}'.format(save_dir)) + paddle.save(model.state_dict(), + os.path.join(save_dir, 'model.pdparams')) + paddle.save(optimizer.state_dict(), + os.path.join(save_dir, 'model.pdopt')) diff --git a/examples/tess/cls0/local/train.sh b/examples/tess/cls0/local/train.sh new file mode 100755 index 000000000..953c56bf8 --- /dev/null +++ b/examples/tess/cls0/local/train.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +ngpu=$1 +cfg_path=$2 + +if [ ${ngpu} -gt 0 ]; then + python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \ + --cfg_path ${cfg_path} +else + python3 local/train.py \ + --cfg_path ${cfg_path} +fi diff --git a/examples/tess/cls0/path.sh b/examples/tess/cls0/path.sh new file mode 100644 index 000000000..3eff28e48 --- /dev/null +++ b/examples/tess/cls0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + 
+MODEL=panns +export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL} \ No newline at end of file diff --git a/examples/tess/cls0/run.sh b/examples/tess/cls0/run.sh new file mode 100755 index 000000000..0e407b40e --- /dev/null +++ b/examples/tess/cls0/run.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -e +source path.sh + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') + +stage=$1 +stop_stage=100 + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + cfg_path=$2 + ./local/train.sh ${ngpu} ${cfg_path} || exit -1 + exit 0 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + cfg_path=$2 + ./local/infer.sh ${cfg_path} || exit -1 + exit 0 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + ckpt=$2 + output_dir=$3 + ./local/export.sh ${ckpt} ${output_dir} || exit -1 + exit 0 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + infer_device=$2 + graph_dir=$3 + audio_file=$4 + ./local/static_model_infer.sh ${infer_device} ${graph_dir} ${audio_file} || exit -1 + exit 0 +fi diff --git a/paddlespeech/cls/exps/panns/train.py b/paddlespeech/cls/exps/panns/train.py index 9258ab516..ab942b2a3 100644 --- a/paddlespeech/cls/exps/panns/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -17,9 +17,9 @@ import os import paddle import yaml -from paddleaudio.features import LogMelSpectrogram +from paddle.audio.features import LogMelSpectrogram from paddleaudio.utils import logger -from paddlesaudio.utils import Timer +from paddleaudio.utils import Timer from paddlespeech.cls.models import SoundClassifier from paddlespeech.utils.dynamic_import import dynamic_import From bf3eb4981890a19415865257ee5b5f959bb1ff76 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Thu, 29 Sep 2022 09:47:05 +0800 Subject: [PATCH 2/4] add different tess feature config --- examples/tess/README.md | 34 +++++++++++++++++++ .../cls0/conf/panns_logmelspectrogram.yaml | 32 +++++++++++++++++ .../tess/cls0/conf/panns_melspectrogram.yaml | 32 +++++++++++++++++ 
examples/tess/cls0/conf/panns_mfcc.yaml | 33 ++++++++++++++++++ .../tess/cls0/conf/panns_spectrogram.yaml | 28 +++++++++++++++ examples/tess/cls0/local/train.py | 24 ++++++++----- 6 files changed, 174 insertions(+), 9 deletions(-) create mode 100644 examples/tess/README.md create mode 100644 examples/tess/cls0/conf/panns_logmelspectrogram.yaml create mode 100644 examples/tess/cls0/conf/panns_melspectrogram.yaml create mode 100644 examples/tess/cls0/conf/panns_mfcc.yaml create mode 100644 examples/tess/cls0/conf/panns_spectrogram.yaml diff --git a/examples/tess/README.md b/examples/tess/README.md new file mode 100644 index 000000000..f56ab8d40 --- /dev/null +++ b/examples/tess/README.md @@ -0,0 +1,34 @@ +# 背景 + +模型任务与模型间接请参见 examples/esc50, 本目录是为了校验和测试 paddle.audio 的feature, backend等相关模块而建立. + +## 数据集 + +[TESS: Toronto emotional speech set](https://tspace.library.utoronto.ca/handle/1807/24487) 是一个包含有 200 个目标词的时长为 2 ~ 3 秒的音频,七种情绪的数据集。由两个女演员录制(24岁和64岁),其中情绪分别是愤怒,恶心,害怕,高兴,惊喜,伤心,平淡. + +## 模型指标 + +根据 `TESS` 提供的fold信息,对数据集进行 5-fold 的 fine-tune 2 epoch 训练和评估,dev准确率如下: + +|Model|feat_type|Acc| +|--|--|--| +|CNN14| mfcc | 0.8304 | +|CNN14| logmelspectrogram | 0.9893 | +|CNN14| spectrogram| 0.1304 | +|CNN14| melspectrogram| 0.1339 | + +因为是功能验证,所以只config中训练了 2 个epoch. +log_melspectrogram feature 在迭代 3 个epoch后, acc可以达到0.9983%. +mfcc feature 在迭代3个epoch后, acc可以达到0.9983%. +spectrogram feature 在迭代11个epoch后,acc可达0.95%. +melspectrogram feature 在迭代17个epoch后,acc可到0.9375%. 
+ +### 模型训练 + +启动训练: +```shell +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_mfcc.yaml +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_logmelspectrogram.yaml +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_melspectrogram.yaml +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_spectrogram.yaml +``` diff --git a/examples/tess/cls0/conf/panns_logmelspectrogram.yaml b/examples/tess/cls0/conf/panns_logmelspectrogram.yaml new file mode 100644 index 000000000..ba953c235 --- /dev/null +++ b/examples/tess/cls0/conf/panns_logmelspectrogram.yaml @@ -0,0 +1,32 @@ +data: + dataset: 'paddle.audio.datasets:TESS' + num_classes: 7 + train: + mode: 'train' + split: 1 + feat_type: 'logmelspectrogram' + dev: + mode: 'dev' + split: 1 + feat_type: 'logmelspectrogram' + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + n_fft: 1024 + hop_length: 320 + window: 'hann' + win_length: 1024 + f_min: 50.0 + f_max: 14000.0 + n_mels: 64 + +training: + epochs: 2 + learning_rate: 0.0005 + num_workers: 2 + batch_size: 128 + checkpoint_dir: './checkpoint_logmelspectrogram' + save_freq: 1 + log_freq: 1 diff --git a/examples/tess/cls0/conf/panns_melspectrogram.yaml b/examples/tess/cls0/conf/panns_melspectrogram.yaml new file mode 100644 index 000000000..a5d53d3d6 --- /dev/null +++ b/examples/tess/cls0/conf/panns_melspectrogram.yaml @@ -0,0 +1,32 @@ +data: + dataset: 'paddle.audio.datasets:TESS' + num_classes: 7 + train: + mode: 'train' + split: 1 + feat_type: 'melspectrogram' + dev: + mode: 'dev' + split: 1 + feat_type: 'melspectrogram' + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + n_fft: 1024 + hop_length: 320 + window: 'hann' + win_length: 1024 + f_min: 50.0 + f_max: 14000.0 + n_mels: 64 + +training: + epochs: 2 + learning_rate: 0.0005 + num_workers: 2 + batch_size: 128 + checkpoint_dir: './checkpoint_melspectrogram' + save_freq: 1 + log_freq: 1 diff --git a/examples/tess/cls0/conf/panns_mfcc.yaml b/examples/tess/cls0/conf/panns_mfcc.yaml new file mode 100644 index 
000000000..08b1387d9 --- /dev/null +++ b/examples/tess/cls0/conf/panns_mfcc.yaml @@ -0,0 +1,33 @@ +data: + dataset: 'paddle.audio.datasets:TESS' + num_classes: 7 + train: + mode: 'train' + split: 1 + feat_type: 'mfcc' + dev: + mode: 'dev' + split: 1 + feat_type: 'mfcc' + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + n_fft: 1024 + hop_length: 320 + window: 'hann' + win_length: 1024 + f_min: 50.0 + f_max: 14000.0 + n_mfcc: 64 + n_mels: 64 + +training: + epochs: 2 + learning_rate: 0.0005 + num_workers: 2 + batch_size: 128 + checkpoint_dir: './checkpoint_mfcc' + save_freq: 1 + log_freq: 1 diff --git a/examples/tess/cls0/conf/panns_spectrogram.yaml b/examples/tess/cls0/conf/panns_spectrogram.yaml new file mode 100644 index 000000000..a4a6a7bc7 --- /dev/null +++ b/examples/tess/cls0/conf/panns_spectrogram.yaml @@ -0,0 +1,28 @@ +data: + dataset: 'paddle.audio.datasets:TESS' + num_classes: 7 + train: + mode: 'train' + split: 1 + feat_type: 'spectrogram' + dev: + mode: 'dev' + split: 1 + feat_type: 'spectrogram' + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + n_fft: 126 + hop_length: 320 + window: 'hann' + +training: + epochs: 2 + learning_rate: 0.0005 + num_workers: 2 + batch_size: 128 + checkpoint_dir: './checkpoint_spectrogram' + save_freq: 1 + log_freq: 1 diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py index c1f0e7e43..3e6062414 100644 --- a/examples/tess/cls0/local/train.py +++ b/examples/tess/cls0/local/train.py @@ -22,6 +22,7 @@ from paddleaudio.utils import Timer from paddlespeech.cls.models import SoundClassifier from paddlespeech.utils.dynamic_import import dynamic_import + # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--cfg_path", type=str, required=True) @@ -61,12 +62,17 @@ if __name__ == "__main__": model_conf = config['model'] data_conf = config['data'] feat_conf = config['feature'] + feat_type = data_conf['train']['feat_type'] training_conf = 
config['training'] # Dataset + + # set audio backend, make sure paddleaudio >= 1.0.2 installed. + paddle.audio.backends.set_backend('soundfile') + ds_class = dynamic_import(data_conf['dataset']) - train_ds = ds_class(**data_conf['train']) - dev_ds = ds_class(**data_conf['dev']) + train_ds = ds_class(**data_conf['train'], **feat_conf) + dev_ds = ds_class(**data_conf['dev'], **feat_conf) train_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=training_conf['batch_size'], @@ -101,7 +107,7 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - feats, labels, length = batch # feats(N, length, n_mels) + feats, labels, length = batch # feats-->(N, length, n_mels) logits = model(feats) @@ -129,7 +135,7 @@ if __name__ == "__main__": avg_loss /= training_conf['log_freq'] avg_acc = num_corrects / num_samples - print_msg = 'Epoch={}/{}, Step={}/{}'.format( + print_msg = feat_type + ' Epoch={}/{}, Step={}/{}'.format( epoch, training_conf['epochs'], batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) @@ -153,23 +159,23 @@ if __name__ == "__main__": dev_ds, batch_sampler=dev_sampler, num_workers=training_conf['num_workers'], - return_list=True, ) + return_list=True, + use_buffer_reader=True, + collate_fn=_collate_features) model.eval() num_corrects = 0 num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - waveforms, labels = batch - feats = feature_extractor(waveforms) - + feats, labels, length = batch logits = model(feats) preds = paddle.argmax(logits, axis=1) num_corrects += (preds == labels).numpy().sum() num_samples += feats.shape[0] - print_msg = '[Evaluation result]' + print_msg = '[Evaluation result] ' + str(feat_type) print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) logger.eval(print_msg) From 382503fcb33603d61e16bfd8dca8d795362e0f7c Mon Sep 17 00:00:00 2001 From: YangZhou Date: Thu, 29 
Sep 2022 12:46:27 +0800 Subject: [PATCH 3/4] update READMe --- examples/tess/README.md | 28 +++++++++---------- .../cls0/conf/panns_logmelspectrogram.yaml | 2 +- .../tess/cls0/conf/panns_melspectrogram.yaml | 2 +- examples/tess/cls0/conf/panns_mfcc.yaml | 2 +- .../tess/cls0/conf/panns_spectrogram.yaml | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/tess/README.md b/examples/tess/README.md index f56ab8d40..ef180f84d 100644 --- a/examples/tess/README.md +++ b/examples/tess/README.md @@ -1,6 +1,12 @@ # 背景 -模型任务与模型间接请参见 examples/esc50, 本目录是为了校验和测试 paddle.audio 的feature, backend等相关模块而建立. +TESS音频情绪分类任务. +从而校验和测试 paddle.audio 的feature, backend等相关模块. + +本实验采用了PaddleSpeech提供了PANNs的CNN14的预训练模型进行finetune: +- CNN14: 该模型主要包含12个卷积层和2个全连接层,模型参数的数量为 79.6M,embbedding维度是 2048。 + +`PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf))是基于Audioset数据集训练的声音分类/识别的模型。经过预训练后,模型可以用于提取音频的embbedding。本示例将使用`PANNs`的预训练模型Finetune完成声音分类的任务。 ## 数据集 @@ -8,20 +14,14 @@ ## 模型指标 -根据 `TESS` 提供的fold信息,对数据集进行 5-fold 的 fine-tune 2 epoch 训练和评估,dev准确率如下: - -|Model|feat_type|Acc| -|--|--|--| -|CNN14| mfcc | 0.8304 | -|CNN14| logmelspectrogram | 0.9893 | -|CNN14| spectrogram| 0.1304 | -|CNN14| melspectrogram| 0.1339 | +根据 `TESS` 提供的fold信息,对数据集进行 5-fold 的 fine-tune 训练和评估,dev准确率如下: -因为是功能验证,所以只config中训练了 2 个epoch. -log_melspectrogram feature 在迭代 3 个epoch后, acc可以达到0.9983%. -mfcc feature 在迭代3个epoch后, acc可以达到0.9983%. -spectrogram feature 在迭代11个epoch后,acc可达0.95%. -melspectrogram feature 在迭代17个epoch后,acc可到0.9375%. 
+|Model|feat_type|Acc| note | +|--|--|--| -- | +|CNN14| mfcc | 0.9929 |3 epoch | +|CNN14| logmelspectrogram | 0.9983 | 3 epoch | +|CNN14| spectrogram| 0.95 | 11 epoch | +|CNN14| melspectrogram| 0.9375 | 17 epoch | ### 模型训练 diff --git a/examples/tess/cls0/conf/panns_logmelspectrogram.yaml b/examples/tess/cls0/conf/panns_logmelspectrogram.yaml index ba953c235..c48e517ea 100644 --- a/examples/tess/cls0/conf/panns_logmelspectrogram.yaml +++ b/examples/tess/cls0/conf/panns_logmelspectrogram.yaml @@ -23,7 +23,7 @@ feature: n_mels: 64 training: - epochs: 2 + epochs: 5 learning_rate: 0.0005 num_workers: 2 batch_size: 128 diff --git a/examples/tess/cls0/conf/panns_melspectrogram.yaml b/examples/tess/cls0/conf/panns_melspectrogram.yaml index a5d53d3d6..66aa4a717 100644 --- a/examples/tess/cls0/conf/panns_melspectrogram.yaml +++ b/examples/tess/cls0/conf/panns_melspectrogram.yaml @@ -23,7 +23,7 @@ feature: n_mels: 64 training: - epochs: 2 + epochs: 10 learning_rate: 0.0005 num_workers: 2 batch_size: 128 diff --git a/examples/tess/cls0/conf/panns_mfcc.yaml b/examples/tess/cls0/conf/panns_mfcc.yaml index 08b1387d9..6800e3abc 100644 --- a/examples/tess/cls0/conf/panns_mfcc.yaml +++ b/examples/tess/cls0/conf/panns_mfcc.yaml @@ -24,7 +24,7 @@ feature: n_mels: 64 training: - epochs: 2 + epochs: 5 learning_rate: 0.0005 num_workers: 2 batch_size: 128 diff --git a/examples/tess/cls0/conf/panns_spectrogram.yaml b/examples/tess/cls0/conf/panns_spectrogram.yaml index a4a6a7bc7..8d88f41c4 100644 --- a/examples/tess/cls0/conf/panns_spectrogram.yaml +++ b/examples/tess/cls0/conf/panns_spectrogram.yaml @@ -19,7 +19,7 @@ feature: window: 'hann' training: - epochs: 2 + epochs: 10 learning_rate: 0.0005 num_workers: 2 batch_size: 128 From a9f918842e722b641d60714f78e08ca8669c66e5 Mon Sep 17 00:00:00 2001 From: YangZhou <56786796+SmileGoat@users.noreply.github.com> Date: Thu, 29 Sep 2022 14:32:03 +0800 Subject: [PATCH 4/4] fix punc in readme --- examples/tess/README.md | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/examples/tess/README.md b/examples/tess/README.md index ef180f84d..0439841ca 100644 --- a/examples/tess/README.md +++ b/examples/tess/README.md @@ -1,7 +1,7 @@ # 背景 -TESS音频情绪分类任务. -从而校验和测试 paddle.audio 的feature, backend等相关模块. +TESS音频情绪分类任务。 +从而校验和测试 paddle.audio 的feature, backend等相关模块。 本实验采用了PaddleSpeech提供了PANNs的CNN14的预训练模型进行finetune: - CNN14: 该模型主要包含12个卷积层和2个全连接层,模型参数的数量为 79.6M,embbedding维度是 2048。 @@ -10,7 +10,7 @@ TESS音频情绪分类任务. ## 数据集 -[TESS: Toronto emotional speech set](https://tspace.library.utoronto.ca/handle/1807/24487) 是一个包含有 200 个目标词的时长为 2 ~ 3 秒的音频,七种情绪的数据集。由两个女演员录制(24岁和64岁),其中情绪分别是愤怒,恶心,害怕,高兴,惊喜,伤心,平淡. +[TESS: Toronto emotional speech set](https://tspace.library.utoronto.ca/handle/1807/24487) 是一个包含有 200 个目标词的时长为 2 ~ 3 秒的音频,七种情绪的数据集。由两个女演员录制(24岁和64岁),其中情绪分别是愤怒,恶心,害怕,高兴,惊喜,伤心,平淡。 ## 模型指标