diff --git a/examples/esc50/cls0/conf/panns.yaml b/examples/esc50/cls0/conf/panns.yaml
index 3a9d42aa5..a0668b27f 100644
--- a/examples/esc50/cls0/conf/panns.yaml
+++ b/examples/esc50/cls0/conf/panns.yaml
@@ -1,5 +1,5 @@
 data:
-  dataset: 'paddleaudio.datasets:ESC50'
+  dataset: 'paddle.audio.datasets:ESC50'
   num_classes: 50
   train:
     mode: 'train'
@@ -33,4 +33,4 @@ training:
 predicting:
   audio_file: '/audio/dog.wav'
   top_k: 10
-  checkpoint: './checkpoint/epoch_50/model.pdparams'
\ No newline at end of file
+  checkpoint: './checkpoint/epoch_50/model.pdparams'
diff --git a/examples/tess/README.md b/examples/tess/README.md
new file mode 100644
index 000000000..0439841ca
--- /dev/null
+++ b/examples/tess/README.md
@@ -0,0 +1,34 @@
+# Background
+
+This example implements the TESS audio emotion classification task,
+which also serves to verify and test the feature, backend, and related modules of paddle.audio.
+
+The experiment fine-tunes the pretrained PANNs CNN14 model provided by PaddleSpeech:
+- CNN14: the model consists mainly of 12 convolutional layers and 2 fully connected layers, with 79.6M parameters and an embedding dimension of 2048.
+
+`PANNs` ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on the AudioSet dataset. After pretraining, the models can be used to extract audio embeddings. This example fine-tunes a pretrained `PANNs` model to accomplish the sound classification task.
+
+## Dataset
+
+[TESS: Toronto emotional speech set](https://tspace.library.utoronto.ca/handle/1807/24487) is a dataset of 2-3 second audio clips covering 200 target words and seven emotions, recorded by two actresses (aged 26 and 64). The emotions are anger, disgust, fear, happiness, pleasant surprise, sadness, and neutral.
+
+## Model metrics
+
+Using the fold information provided for `TESS`, the dataset is fine-tuned and evaluated with 5-fold cross-validation. Dev accuracy is as follows:
+
+| Model | feat_type | Acc | note |
+| -- | -- | -- | -- |
+| CNN14 | mfcc | 0.9929 | 3 epochs |
+| CNN14 | logmelspectrogram | 0.9983 | 3 epochs |
+| CNN14 | spectrogram | 0.95 | 11 epochs |
+| CNN14 | melspectrogram | 0.9375 | 17 epochs |
+
+### Model training
+
+Start training:
+```shell
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_mfcc.yaml
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_logmelspectrogram.yaml
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_melspectrogram.yaml
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_spectrogram.yaml
+```
diff --git a/examples/tess/cls0/conf/panns_logmelspectrogram.yaml b/examples/tess/cls0/conf/panns_logmelspectrogram.yaml
new file mode 100644
index 000000000..c48e517ea
--- /dev/null
+++ b/examples/tess/cls0/conf/panns_logmelspectrogram.yaml
@@ -0,0 +1,32 @@
+data:
+  dataset: 'paddle.audio.datasets:TESS'
+  num_classes: 7
+  train:
+    mode: 'train'
+    split: 1
+    feat_type: 'logmelspectrogram'
+  dev:
+    mode: 'dev'
+    split: 1
+    feat_type: 'logmelspectrogram'
+
+model:
+  backbone: 'paddlespeech.cls.models:cnn14'
+
+feature:
+  n_fft: 1024
+  hop_length: 320
+  window: 'hann'
+  win_length: 1024
+  f_min: 50.0
+  f_max: 14000.0
+  n_mels: 64
+
+training:
+  epochs: 5
+  learning_rate: 0.0005
+  num_workers: 2
+  batch_size: 128
+  checkpoint_dir: './checkpoint_logmelspectrogram'
+  save_freq: 1
+  log_freq: 1
diff --git a/examples/tess/cls0/conf/panns_melspectrogram.yaml b/examples/tess/cls0/conf/panns_melspectrogram.yaml
new file mode 100644
index 000000000..66aa4a717
--- /dev/null
+++ b/examples/tess/cls0/conf/panns_melspectrogram.yaml
@@ -0,0 +1,32 @@
+data:
+  dataset: 'paddle.audio.datasets:TESS'
+  num_classes: 7
+  train:
+    mode: 'train'
+    split: 1
+    feat_type: 'melspectrogram'
+  dev:
+    mode: 'dev'
+    split: 1
+    feat_type: 'melspectrogram'
+
+model:
+  backbone: 'paddlespeech.cls.models:cnn14'
+
+feature:
+  n_fft: 1024
+  hop_length: 320
+  window: 'hann'
+  win_length: 1024
+  f_min: 50.0
+  f_max: 14000.0
+  n_mels: 64
+
+training:
+  epochs: 10
+  learning_rate: 0.0005
+  num_workers: 2
+  batch_size: 128
+  checkpoint_dir: './checkpoint_melspectrogram'
+  save_freq: 1
+  log_freq: 1
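> Note: the `data` and `feature` sections of these configs are passed straight through to the dataset constructor (see `local/train.py` below). A minimal sketch of that wiring, assuming paddle >= 2.4 (which provides `paddle.audio.datasets.TESS`) and that the TESS data has been downloaded:

```python
# Sketch of how a config above is consumed; mirrors local/train.py below.
# 'split' selects which fold of the 5-fold setup is held out for dev.
import yaml

from paddlespeech.utils.dynamic_import import dynamic_import

with open('conf/panns_melspectrogram.yaml') as f:
    config = yaml.safe_load(f)

# dynamic_import resolves the 'module:attribute' string from the config.
ds_class = dynamic_import(config['data']['dataset'])  # paddle.audio.datasets:TESS
train_ds = ds_class(**config['data']['train'], **config['feature'])

feat, label = train_ds[0]  # feat: (n_mels, num_frames); label in [0, 6]
```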
diff --git a/examples/tess/cls0/conf/panns_mfcc.yaml b/examples/tess/cls0/conf/panns_mfcc.yaml
new file mode 100644
index 000000000..6800e3abc
--- /dev/null
+++ b/examples/tess/cls0/conf/panns_mfcc.yaml
@@ -0,0 +1,33 @@
+data:
+  dataset: 'paddle.audio.datasets:TESS'
+  num_classes: 7
+  train:
+    mode: 'train'
+    split: 1
+    feat_type: 'mfcc'
+  dev:
+    mode: 'dev'
+    split: 1
+    feat_type: 'mfcc'
+
+model:
+  backbone: 'paddlespeech.cls.models:cnn14'
+
+feature:
+  n_fft: 1024
+  hop_length: 320
+  window: 'hann'
+  win_length: 1024
+  f_min: 50.0
+  f_max: 14000.0
+  n_mfcc: 64
+  n_mels: 64
+
+training:
+  epochs: 5
+  learning_rate: 0.0005
+  num_workers: 2
+  batch_size: 128
+  checkpoint_dir: './checkpoint_mfcc'
+  save_freq: 1
+  log_freq: 1
diff --git a/examples/tess/cls0/conf/panns_spectrogram.yaml b/examples/tess/cls0/conf/panns_spectrogram.yaml
new file mode 100644
index 000000000..8d88f41c4
--- /dev/null
+++ b/examples/tess/cls0/conf/panns_spectrogram.yaml
@@ -0,0 +1,28 @@
+data:
+  dataset: 'paddle.audio.datasets:TESS'
+  num_classes: 7
+  train:
+    mode: 'train'
+    split: 1
+    feat_type: 'spectrogram'
+  dev:
+    mode: 'dev'
+    split: 1
+    feat_type: 'spectrogram'
+
+model:
+  backbone: 'paddlespeech.cls.models:cnn14'
+
+feature:
+  n_fft: 126
+  hop_length: 320
+  window: 'hann'
+
+training:
+  epochs: 10
+  learning_rate: 0.0005
+  num_workers: 2
+  batch_size: 128
+  checkpoint_dir: './checkpoint_spectrogram'
+  save_freq: 1
+  log_freq: 1
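> A note on `n_fft: 126` in the spectrogram config: a linear spectrogram has `n_fft // 2 + 1` frequency bins, and `126 // 2 + 1 == 64`, which matches the 64 feature bins (`n_mels: 64`) the other configs feed the CNN14 backbone. A quick check, assuming paddle >= 2.4 with `paddle.audio.features.Spectrogram`:

```python
# Sanity check that n_fft=126 yields 64 frequency bins (dummy waveform only).
import paddle
from paddle.audio.features import Spectrogram

extractor = Spectrogram(n_fft=126, hop_length=320, window='hann')
waveform = paddle.randn([1, 16000])  # 1 s of random audio at an assumed 16 kHz
spec = extractor(waveform)
print(spec.shape)  # [1, 64, num_frames] -> n_fft // 2 + 1 == 64 bins
```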
diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py
new file mode 100644
index 000000000..3e6062414
--- /dev/null
+++ b/examples/tess/cls0/local/train.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+import yaml
+
+from paddleaudio.utils import logger
+from paddleaudio.utils import Timer
+from paddlespeech.cls.models import SoundClassifier
+from paddlespeech.utils.dynamic_import import dynamic_import
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--cfg_path", type=str, required=True)
+args = parser.parse_args()
+# yapf: enable
+
+
+def _collate_features(batch):
+    # Each sample is (feat, label) with feat of shape (n_mels, num_frames).
+    # Transpose to (num_frames, n_mels) and right-pad the time axis so every
+    # feature in the batch shares the same length.
+    feats = []
+    labels = []
+    lengths = []
+    for sample in batch:
+        feats.append(paddle.transpose(sample[0], perm=[1, 0]))
+        lengths.append(sample[0].shape[1])
+        labels.append(sample[1])
+
+    max_length = max(lengths)
+    for i in range(len(feats)):
+        feats[i] = paddle.nn.functional.pad(
+            feats[i], [0, max_length - feats[i].shape[0], 0, 0],
+            data_format='NLC')
+
+    return paddle.stack(feats), paddle.to_tensor(
+        labels), paddle.to_tensor(lengths)
+
+
+if __name__ == "__main__":
+    nranks = paddle.distributed.get_world_size()
+    if nranks > 1:
+        paddle.distributed.init_parallel_env()
+    local_rank = paddle.distributed.get_rank()
+
+    args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path))
+    with open(args.cfg_path, 'r') as f:
+        config = yaml.safe_load(f)
+
+    model_conf = config['model']
+    data_conf = config['data']
+    feat_conf = config['feature']
+    feat_type = data_conf['train']['feat_type']
+    training_conf = config['training']
+
+    # Dataset
+    # Set the audio backend; requires paddleaudio >= 1.0.2 to be installed.
+    paddle.audio.backends.set_backend('soundfile')
+
+    ds_class = dynamic_import(data_conf['dataset'])
+    train_ds = ds_class(**data_conf['train'], **feat_conf)
+    dev_ds = ds_class(**data_conf['dev'], **feat_conf)
+    train_sampler = paddle.io.DistributedBatchSampler(
+        train_ds,
+        batch_size=training_conf['batch_size'],
+        shuffle=True,
+        drop_last=False)
+    train_loader = paddle.io.DataLoader(
+        train_ds,
+        batch_sampler=train_sampler,
+        num_workers=training_conf['num_workers'],
+        return_list=True,
+        use_buffer_reader=True,
+        collate_fn=_collate_features)
+
+    # Model
+    backbone_class = dynamic_import(model_conf['backbone'])
+    backbone = backbone_class(pretrained=True, extract_embedding=True)
+    model = SoundClassifier(backbone, num_class=data_conf['num_classes'])
+    model = paddle.DataParallel(model)
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=training_conf['learning_rate'],
+        parameters=model.parameters())
+    criterion = paddle.nn.loss.CrossEntropyLoss()
+
+    steps_per_epoch = len(train_sampler)
+    timer = Timer(steps_per_epoch * training_conf['epochs'])
+    timer.start()
+
+    for epoch in range(1, training_conf['epochs'] + 1):
+        model.train()
+
+        avg_loss = 0
+        num_corrects = 0
+        num_samples = 0
+        for batch_idx, batch in enumerate(train_loader):
+            feats, labels, lengths = batch  # feats: (N, num_frames, n_mels)
+
+            logits = model(feats)
+
+            loss = criterion(logits, labels)
+            loss.backward()
+            optimizer.step()
+            if isinstance(optimizer._learning_rate,
+                          paddle.optimizer.lr.LRScheduler):
+                optimizer._learning_rate.step()
+            optimizer.clear_grad()
+
+            # Calculate loss
+            avg_loss += loss.numpy()[0]
+
+            # Calculate metrics
+            preds = paddle.argmax(logits, axis=1)
+            num_corrects += (preds == labels).numpy().sum()
+            num_samples += feats.shape[0]
+
+            timer.count()
+
+            if (batch_idx + 1
+                ) % training_conf['log_freq'] == 0 and local_rank == 0:
+                lr = optimizer.get_lr()
+                avg_loss /= training_conf['log_freq']
+                avg_acc = num_corrects / num_samples
+
+                print_msg = feat_type + ' Epoch={}/{}, Step={}/{}'.format(
+                    epoch, training_conf['epochs'], batch_idx + 1,
+                    steps_per_epoch)
+                print_msg += ' loss={:.4f}'.format(avg_loss)
+                print_msg += ' acc={:.4f}'.format(avg_acc)
+                print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
+                    lr, timer.timing, timer.eta)
+                logger.train(print_msg)
+
+                avg_loss = 0
+                num_corrects = 0
+                num_samples = 0
+
+            if epoch % training_conf[
+                    'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
+                dev_sampler = paddle.io.BatchSampler(
+                    dev_ds,
+                    batch_size=training_conf['batch_size'],
+                    shuffle=False,
+                    drop_last=False)
+                dev_loader = paddle.io.DataLoader(
+                    dev_ds,
+                    batch_sampler=dev_sampler,
+                    num_workers=training_conf['num_workers'],
+                    return_list=True,
+                    use_buffer_reader=True,
+                    collate_fn=_collate_features)
+
+                model.eval()
+                num_corrects = 0
+                num_samples = 0
+                with logger.processing('Evaluation on validation dataset'):
+                    for batch_idx, batch in enumerate(dev_loader):
+                        feats, labels, lengths = batch
+                        logits = model(feats)
+
+                        preds = paddle.argmax(logits, axis=1)
+                        num_corrects += (preds == labels).numpy().sum()
+                        num_samples += feats.shape[0]
+
+                print_msg = '[Evaluation result] ' + str(feat_type)
+                print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
+
+                logger.eval(print_msg)
+
+                # Save model
+                save_dir = os.path.join(training_conf['checkpoint_dir'],
+                                        'epoch_{}'.format(epoch))
+                logger.info('Saving model checkpoint to {}'.format(save_dir))
+                paddle.save(model.state_dict(),
+                            os.path.join(save_dir, 'model.pdparams'))
+                paddle.save(optimizer.state_dict(),
+                            os.path.join(save_dir, 'model.pdopt'))
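> The collate function above is the main piece that differs from the ESC50 example: TESS clips vary in length, so features are padded per batch. A toy illustration of the expected shapes, assuming `_collate_features` is imported from `local/train.py` (hypothetical import path and dummy shapes):

```python
# Two (feat, label) samples with different frame counts: each feat is
# transposed to (num_frames, n_mels), right-padded to the longer length,
# and stacked into a single batch tensor.
import paddle

from train import _collate_features  # i.e. local/train.py above

batch = [(paddle.randn([64, 180]), 0), (paddle.randn([64, 203]), 3)]
feats, labels, lengths = _collate_features(batch)
print(feats.shape)      # expected [2, 203, 64] -> (N, max_frames, n_mels)
print(lengths.numpy())  # expected [180 203] -> original frame counts
```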
diff --git a/examples/tess/cls0/local/train.sh b/examples/tess/cls0/local/train.sh
new file mode 100755
index 000000000..953c56bf8
--- /dev/null
+++ b/examples/tess/cls0/local/train.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+ngpu=$1
+cfg_path=$2
+
+if [ ${ngpu} -gt 0 ]; then
+    python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \
+        --cfg_path ${cfg_path}
+else
+    python3 local/train.py \
+        --cfg_path ${cfg_path}
+fi
diff --git a/examples/tess/cls0/path.sh b/examples/tess/cls0/path.sh
new file mode 100644
index 000000000..3eff28e48
--- /dev/null
+++ b/examples/tess/cls0/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=panns
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/tess/cls0/run.sh b/examples/tess/cls0/run.sh
new file mode 100755
index 000000000..0e407b40e
--- /dev/null
+++ b/examples/tess/cls0/run.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -e
+source path.sh
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+stage=$1
+stop_stage=100
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    cfg_path=$2
+    ./local/train.sh ${ngpu} ${cfg_path} || exit -1
+    exit 0
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    cfg_path=$2
+    ./local/infer.sh ${cfg_path} || exit -1
+    exit 0
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    ckpt=$2
+    output_dir=$3
+    ./local/export.sh ${ckpt} ${output_dir} || exit -1
+    exit 0
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    infer_device=$2
+    graph_dir=$3
+    audio_file=$4
+    ./local/static_model_infer.sh ${infer_device} ${graph_dir} ${audio_file} || exit -1
+    exit 0
+fi
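> For reference, typical stage-1 (training) invocations, run from `examples/tess/cls0`. `run.sh` derives `ngpu` from `CUDA_VISIBLE_DEVICES` and forwards the config to `local/train.sh`, which switches between a plain and a distributed launch:

```shell
$ cd examples/tess/cls0
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_mfcc.yaml    # single GPU
$ CUDA_VISIBLE_DEVICES=0,1 ./run.sh 1 conf/panns_mfcc.yaml  # ngpu=2, distributed launch
```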
diff --git a/paddlespeech/cls/exps/panns/train.py b/paddlespeech/cls/exps/panns/train.py
index 9258ab516..ab942b2a3 100644
--- a/paddlespeech/cls/exps/panns/train.py
+++ b/paddlespeech/cls/exps/panns/train.py
@@ -17,9 +17,9 @@ import os
 import paddle
 import yaml
-from paddleaudio.features import LogMelSpectrogram
+from paddle.audio.features import LogMelSpectrogram
 from paddleaudio.utils import logger
-from paddlesaudio.utils import Timer
+from paddleaudio.utils import Timer
 
 from paddlespeech.cls.models import SoundClassifier
 from paddlespeech.utils.dynamic_import import dynamic_import
 
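> The last hunk moves the existing ESC50 trainer to the `paddle.audio` feature API (and fixes the `paddlesaudio` typo in the `Timer` import). A sketch of the renamed import in use, assuming paddle >= 2.4; the parameters mirror the `feature` sections of the configs above, and the 16 kHz sample rate is an assumption for this sketch:

```python
import paddle
from paddle.audio.features import LogMelSpectrogram  # formerly paddleaudio.features

extractor = LogMelSpectrogram(
    sr=16000,        # assumed sample rate, not taken from the configs
    n_fft=1024,
    hop_length=320,
    win_length=1024,
    window='hann',
    f_min=50.0,
    f_max=14000.0,
    n_mels=64)
logmel = extractor(paddle.randn([1, 16000]))  # -> [1, 64, num_frames]
```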