Merge pull request #2482 from SmileGoat/check_ec50

[Example] add tess example, testing paddle.audio features,backend.(paddle >= 2.4)
pull/2399/head
TianYuan 3 years ago committed by GitHub
commit 62fe3d444d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,5 +1,5 @@
data:
dataset: 'paddleaudio.datasets:ESC50'
dataset: 'paddle.audio.datasets:ESC50'
num_classes: 50
train:
mode: 'train'

@ -0,0 +1,34 @@
# 背景
本示例为 TESS 音频情绪分类任务,
用于校验和测试 paddle.audio 的 feature、backend 等相关模块。
本实验采用了PaddleSpeech提供的PANNs的CNN14预训练模型进行finetune:
- CNN14: 该模型主要包含12个卷积层和2个全连接层,模型参数的数量为 79.6M,embedding维度是 2048。
`PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf))是基于Audioset数据集训练的声音分类/识别的模型。经过预训练后,模型可以用于提取音频的embedding。本示例将使用`PANNs`的预训练模型Finetune完成声音分类的任务。
## 数据集
[TESS: Toronto emotional speech set](https://tspace.library.utoronto.ca/handle/1807/24487) 是一个包含有 200 个目标词的时长为 2 ~ 3 秒的音频,七种情绪的数据集。由两个女演员录制(24岁和64岁),其中情绪分别是愤怒,恶心,害怕,高兴,惊喜,伤心,平淡。
## 模型指标
根据 `TESS` 提供的fold信息对数据集进行 5-fold 的 fine-tune 训练和评估,dev准确率如下:
|Model|feat_type|Acc| note |
|--|--|--| -- |
|CNN14| mfcc | 0.9929 |3 epoch |
|CNN14| logmelspectrogram | 0.9983 | 3 epoch |
|CNN14| spectrogram| 0.95 | 11 epoch |
|CNN14| melspectrogram| 0.9375 | 17 epoch |
### 模型训练
启动训练:
```shell
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_mfcc.yaml
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_logmelspectrogram.yaml
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_melspectrogram.yaml
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_spectrogram.yaml
```

@ -0,0 +1,32 @@
data:
dataset: 'paddle.audio.datasets:TESS'
num_classes: 7
train:
mode: 'train'
split: 1
feat_type: 'logmelspectrogram'
dev:
mode: 'dev'
split: 1
feat_type: 'logmelspectrogram'
model:
backbone: 'paddlespeech.cls.models:cnn14'
feature:
n_fft: 1024
hop_length: 320
window: 'hann'
win_length: 1024
f_min: 50.0
f_max: 14000.0
n_mels: 64
training:
epochs: 5
learning_rate: 0.0005
num_workers: 2
batch_size: 128
checkpoint_dir: './checkpoint_logmelspectrogram'
save_freq: 1
log_freq: 1

@ -0,0 +1,32 @@
data:
dataset: 'paddle.audio.datasets:TESS'
num_classes: 7
train:
mode: 'train'
split: 1
feat_type: 'melspectrogram'
dev:
mode: 'dev'
split: 1
feat_type: 'melspectrogram'
model:
backbone: 'paddlespeech.cls.models:cnn14'
feature:
n_fft: 1024
hop_length: 320
window: 'hann'
win_length: 1024
f_min: 50.0
f_max: 14000.0
n_mels: 64
training:
epochs: 10
learning_rate: 0.0005
num_workers: 2
batch_size: 128
checkpoint_dir: './checkpoint_melspectrogram'
save_freq: 1
log_freq: 1

@ -0,0 +1,33 @@
data:
dataset: 'paddle.audio.datasets:TESS'
num_classes: 7
train:
mode: 'train'
split: 1
feat_type: 'mfcc'
dev:
mode: 'dev'
split: 1
feat_type: 'mfcc'
model:
backbone: 'paddlespeech.cls.models:cnn14'
feature:
n_fft: 1024
hop_length: 320
window: 'hann'
win_length: 1024
f_min: 50.0
f_max: 14000.0
n_mfcc: 64
n_mels: 64
training:
epochs: 5
learning_rate: 0.0005
num_workers: 2
batch_size: 128
checkpoint_dir: './checkpoint_mfcc'
save_freq: 1
log_freq: 1

@ -0,0 +1,28 @@
data:
dataset: 'paddle.audio.datasets:TESS'
num_classes: 7
train:
mode: 'train'
split: 1
feat_type: 'spectrogram'
dev:
mode: 'dev'
split: 1
feat_type: 'spectrogram'
model:
backbone: 'paddlespeech.cls.models:cnn14'
feature:
n_fft: 126
hop_length: 320
window: 'hann'
training:
epochs: 10
learning_rate: 0.0005
num_workers: 2
batch_size: 128
checkpoint_dir: './checkpoint_spectrogram'
save_freq: 1
log_freq: 1

@ -0,0 +1,190 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
import yaml
from paddleaudio.utils import logger
from paddleaudio.utils import Timer
from paddlespeech.cls.models import SoundClassifier
from paddlespeech.utils.dynamic_import import dynamic_import
# yapf: disable
# NOTE: arguments are parsed at import time, so this module is intended to be
# executed as a script rather than imported.
# The module docstring (if any) is passed as the description; passing it
# positionally would set `prog` instead.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--cfg_path", type=str, required=True,
    help="Path to the YAML configuration file for training.")
args = parser.parse_args()
# yapf: enable
def _collate_features(batch):
    """Collate ``(feat, label)`` samples into padded batch tensors.

    Per the original note, each ``feat`` is laid out as ``(n_mels, length)``.
    Every feature is transposed with ``perm=[1, 0]`` and then padded at the
    end of its first axis up to the largest length found in the batch.

    Args:
        batch: Sequence of ``(feat, label)`` pairs.

    Returns:
        A tuple ``(feats, labels, lengths)``: the stacked padded features,
        the labels as a tensor, and the pre-padding lengths as a tensor.
    """
    # Lengths are read from axis 1 of the untransposed feature.
    lengths = [item[0].shape[1] for item in batch]
    labels = [item[1] for item in batch]
    longest = max(lengths)

    padded = []
    for item in batch:
        feat = paddle.transpose(item[0], perm=[1, 0])
        # Pad only the trailing side of the first axis; second axis untouched.
        padded.append(
            paddle.nn.functional.pad(
                feat, [0, longest - feat.shape[0], 0, 0],
                data_format='NLC'))

    return (paddle.stack(padded), paddle.to_tensor(labels),
            paddle.to_tensor(lengths))
if __name__ == "__main__":
    # Script entry point: load the YAML config, build the datasets and model,
    # then train. Rank 0 logs every `log_freq` steps and, on the last step of
    # every `save_freq`-th epoch, evaluates on the dev set and saves a
    # checkpoint.
    nranks = paddle.distributed.get_world_size()
    if nranks > 1:
        paddle.distributed.init_parallel_env()
    local_rank = paddle.distributed.get_rank()

    args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path))
    # Explicit encoding so the config parses the same regardless of locale.
    with open(args.cfg_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    model_conf = config['model']
    data_conf = config['data']
    feat_conf = config['feature']
    feat_type = data_conf['train']['feat_type']
    training_conf = config['training']

    # Dataset

    # set audio backend, make sure paddleaudio >= 1.0.2 installed.
    paddle.audio.backends.set_backend('soundfile')

    ds_class = dynamic_import(data_conf['dataset'])
    train_ds = ds_class(**data_conf['train'], **feat_conf)
    dev_ds = ds_class(**data_conf['dev'], **feat_conf)
    train_sampler = paddle.io.DistributedBatchSampler(
        train_ds,
        batch_size=training_conf['batch_size'],
        shuffle=True,
        drop_last=False)
    train_loader = paddle.io.DataLoader(
        train_ds,
        batch_sampler=train_sampler,
        num_workers=training_conf['num_workers'],
        return_list=True,
        use_buffer_reader=True,
        collate_fn=_collate_features)

    # Model
    backbone_class = dynamic_import(model_conf['backbone'])
    backbone = backbone_class(pretrained=True, extract_embedding=True)
    model = SoundClassifier(backbone, num_class=data_conf['num_classes'])
    model = paddle.DataParallel(model)
    optimizer = paddle.optimizer.Adam(
        learning_rate=training_conf['learning_rate'],
        parameters=model.parameters())
    criterion = paddle.nn.loss.CrossEntropyLoss()

    steps_per_epoch = len(train_sampler)
    timer = Timer(steps_per_epoch * training_conf['epochs'])
    timer.start()

    for epoch in range(1, training_conf['epochs'] + 1):
        model.train()

        # Running statistics, reset after every log line.
        avg_loss = 0
        num_corrects = 0
        num_samples = 0
        for batch_idx, batch in enumerate(train_loader):
            feats, labels, length = batch  # feats-->(N, length, n_mels)

            logits = model(feats)

            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            optimizer.clear_grad()

            # Calculate loss
            # `item()` works for both shape-[1] and 0-D loss tensors, whereas
            # `numpy()[0]` raises on the 0-D loss of newer Paddle releases.
            avg_loss += loss.item()

            # Calculate metrics
            preds = paddle.argmax(logits, axis=1)
            num_corrects += (preds == labels).numpy().sum()
            num_samples += feats.shape[0]

            timer.count()

            if (batch_idx + 1
                ) % training_conf['log_freq'] == 0 and local_rank == 0:
                lr = optimizer.get_lr()
                avg_loss /= training_conf['log_freq']
                avg_acc = num_corrects / num_samples

                print_msg = feat_type + ' Epoch={}/{}, Step={}/{}'.format(
                    epoch, training_conf['epochs'], batch_idx + 1,
                    steps_per_epoch)
                print_msg += ' loss={:.4f}'.format(avg_loss)
                print_msg += ' acc={:.4f}'.format(avg_acc)
                print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
                    lr, timer.timing, timer.eta)
                logger.train(print_msg)

                avg_loss = 0
                num_corrects = 0
                num_samples = 0

            # Evaluate and checkpoint on the last step of a save epoch.
            if epoch % training_conf[
                    'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
                dev_sampler = paddle.io.BatchSampler(
                    dev_ds,
                    batch_size=training_conf['batch_size'],
                    shuffle=False,
                    drop_last=False)
                dev_loader = paddle.io.DataLoader(
                    dev_ds,
                    batch_sampler=dev_sampler,
                    num_workers=training_conf['num_workers'],
                    return_list=True,
                    use_buffer_reader=True,
                    collate_fn=_collate_features)

                model.eval()
                # Separate counters and loop variable so the training loop's
                # `batch_idx` and running statistics are not overwritten.
                dev_corrects = 0
                dev_samples = 0
                # Gradients are not needed for evaluation.
                with logger.processing(
                        'Evaluation on validation dataset'), paddle.no_grad():
                    for dev_batch in dev_loader:
                        feats, labels, length = dev_batch
                        logits = model(feats)

                        preds = paddle.argmax(logits, axis=1)
                        dev_corrects += (preds == labels).numpy().sum()
                        dev_samples += feats.shape[0]

                print_msg = '[Evaluation result] ' + str(feat_type)
                print_msg += ' dev_acc={:.4f}'.format(
                    dev_corrects / dev_samples)

                logger.eval(print_msg)

                # Save model
                save_dir = os.path.join(training_conf['checkpoint_dir'],
                                        'epoch_{}'.format(epoch))
                logger.info('Saving model checkpoint to {}'.format(save_dir))
                paddle.save(model.state_dict(),
                            os.path.join(save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(save_dir, 'model.pdopt'))

@ -0,0 +1,12 @@
#!/bin/bash
# Launch local/train.py with the given config.
# Usage: <this script> <ngpu> <cfg_path>
#   ngpu > 0 : run through paddle.distributed.launch on $CUDA_VISIBLE_DEVICES
#   ngpu = 0 : run local/train.py directly

if [ $# -lt 2 ]; then
    echo "usage: $0 <ngpu> <cfg_path>" >&2
    exit 1
fi

ngpu=$1
cfg_path=$2

# Expansions are quoted so config paths containing spaces are passed intact.
if [ "${ngpu}" -gt 0 ]; then
    python3 -m paddle.distributed.launch --gpus "${CUDA_VISIBLE_DEVICES}" local/train.py \
        --cfg_path "${cfg_path}"
else
    python3 local/train.py \
        --cfg_path "${cfg_path}"
fi

@ -0,0 +1,13 @@
#!/bin/bash
# Environment setup for this example. MAIN_ROOT is resolved relative to the
# current working directory (three levels up), so this must be loaded from
# the example directory.
# Paths are quoted so a checkout location containing spaces still works.
export MAIN_ROOT="$(realpath "${PWD}/../../../")"

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"

MODEL=panns
export BIN_DIR="${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL}"

@ -0,0 +1,35 @@
#!/bin/bash
# Stage runner. The first argument selects the starting stage; the first
# matching stage runs and the script exits, so one stage runs per invocation.
#   stage 1: <cfg_path>                              -> ./local/train.sh
#   stage 2: <cfg_path>                              -> ./local/infer.sh
#   stage 3: <ckpt> <output_dir>                     -> ./local/export.sh
#   stage 4: <infer_device> <graph_dir> <audio_file> -> ./local/static_model_infer.sh
set -e
source path.sh

# Without a stage the numeric tests below would all error out and the script
# would exit 0 having done nothing, so fail early with a usage message.
if [ $# -lt 1 ]; then
    echo "usage: $0 <stage> [stage arguments...]" >&2
    exit 1
fi

# Number of comma-separated entries in CUDA_VISIBLE_DEVICES (0 when empty).
ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')

stage=$1
stop_stage=100

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    cfg_path=$2
    ./local/train.sh "${ngpu}" "${cfg_path}" || exit -1
    exit 0
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    cfg_path=$2
    ./local/infer.sh "${cfg_path}" || exit -1
    exit 0
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    ckpt=$2
    output_dir=$3
    ./local/export.sh "${ckpt}" "${output_dir}" || exit -1
    exit 0
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    infer_device=$2
    graph_dir=$3
    audio_file=$4
    ./local/static_model_infer.sh "${infer_device}" "${graph_dir}" "${audio_file}" || exit -1
    exit 0
fi

@ -17,9 +17,9 @@ import os
import paddle
import yaml
from paddleaudio.features import LogMelSpectrogram
from paddle.audio.features import LogMelSpectrogram
from paddleaudio.utils import logger
from paddlesaudio.utils import Timer
from paddleaudio.utils import Timer
from paddlespeech.cls.models import SoundClassifier
from paddlespeech.utils.dynamic_import import dynamic_import

Loading…
Cancel
Save