diff --git a/docs/tutorial/cls/cls_tutorial.ipynb b/docs/tutorial/cls/cls_tutorial.ipynb
index 9b8bfc119..56b488adc 100644
--- a/docs/tutorial/cls/cls_tutorial.ipynb
+++ b/docs/tutorial/cls/cls_tutorial.ipynb
@@ -2,9 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "source": [
     "\"Fork\n",
     "\n",
@@ -32,9 +30,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "%%HTML\n",
@@ -45,9 +41,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "source": [
     "# 2. 音频和特征提取"
    ]
   },
@@ -55,9 +49,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# 环境准备：安装paddlespeech和paddleaudio\n",
@@ -67,9 +59,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import warnings\n",
@@ -82,9 +72,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "source": [
     "\n",
     "\n",
@@ -98,9 +86,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# 获取示例音频\n",
@@ -111,9 +97,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from paddleaudio import load\n",
@@ -130,9 +114,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "!paddlespeech cls --input ./dog.wav"
    ]
@@ -140,9 +122,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "source": [
     "## 2.2 音频特征提取\n",
     "\n",
@@ -162,21 +142,20 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import paddle\n",
     "import numpy as np\n",
     "\n",
+    "data, sr = load(file='./dog.wav', sr=32000, mono=True, dtype='float32')\n",
     "x = paddle.to_tensor(data)\n",
     "n_fft = 1024\n",
     "win_length = 1024\n",
-    "hop_length = 512\n",
+    "hop_length = 320\n",
     "\n",
     "# [D, T]\n",
-    "spectrogram = paddle.signal.stft(x, n_fft=1024, win_length=1024, hop_length=512, onesided=True) \n",
+    "spectrogram = paddle.signal.stft(x, n_fft=n_fft, win_length=win_length, hop_length=hop_length, onesided=True) \n",
     "print('spectrogram.shape: {}'.format(spectrogram.shape))\n",
     "print('spectrogram.dtype: {}'.format(spectrogram.dtype))\n",
     "\n",
@@ -190,9 +169,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "source": [
     "### 2.2.2 LogFBank\n",
     "\n",
@@ -220,13 +197,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from paddleaudio.features import LogMelSpectrogram\n",
     "\n",
+    "f_min=50.0\n",
+    "f_max=14000.0\n",
+    "n_mels=64\n",
+    "\n",
     "# - sr: 音频文件的采样率。\n",
     "# - n_fft: FFT样本点个数。\n",
     "# - hop_length: 音频帧之间的间隔。\n",
@@ -239,7 +218,9 @@
     " hop_length=hop_length, \n",
     " win_length=win_length, \n",
     " window='hann', \n",
-    " n_mels=64)\n",
+    " f_min=f_min,\n",
+    " f_max=f_max,\n",
+    " n_mels=n_mels)\n",
     "\n",
     "x = paddle.to_tensor(data).unsqueeze(0) # [B, L]\n",
     "log_fbank = feature_extractor2(x) # [B, D, T]\n",
@@ -253,9 +234,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
"source": [ "## 2.3 声音分类方法\n", "\n", @@ -272,9 +251,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 2.3.2 深度学习方法\n", "传统机器学习方法可以捕捉声音特征的差异(例如男声和女声的声音在音高上往往差异较大)并实现分类任务。\n", @@ -288,9 +265,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 2.3.3 Pretrain + Finetune\n", "\n", @@ -315,9 +290,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "# 3. 实践:环境声音分类\n", "\n", @@ -361,22 +334,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio.datasets import ESC50\n", "\n", - "train_ds = ESC50(mode='train')\n", - "dev_ds = ESC50(mode='dev')" + "train_ds = ESC50(mode='train', sample_rate=sr)\n", + "dev_ds = ESC50(mode='dev', sample_rate=sr)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 3.1.2 特征提取\n", "通过下列代码,用 `paddleaudio.features.LogMelSpectrogram` 初始化一个音频特征提取器,在训练过程中实时提取音频的 LogFBank 特征,其中主要的参数如下: " @@ -385,19 +354,23 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "feature_extractor = LogMelSpectrogram(sr=44100, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window='hann', n_mels=64)" + "feature_extractor = LogMelSpectrogram(\n", + " sr=sr, \n", + " n_fft=n_fft, \n", + " hop_length=hop_length, \n", + " win_length=win_length, \n", + " window='hann', \n", + " f_min=f_min,\n", + " f_max=f_max,\n", + " n_mels=n_mels)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3.2 模型\n", "\n", @@ -409,9 +382,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddlespeech.cls.models import cnn14\n", @@ -420,9 +391,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 3.2.2 构建分类模型\n", "\n", @@ -432,9 +401,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import paddle.nn as nn\n", @@ -461,18 +428,14 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3.3 Finetune" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "1. 创建 DataLoader " ] @@ -480,9 +443,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "batch_size = 16\n", @@ -492,9 +453,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "2. 定义优化器和 Loss" ] @@ -502,9 +461,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "optimizer = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.parameters())\n", @@ -513,19 +470,15 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "3. 
启动模型训练 " ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio.utils import logger\n", @@ -603,9 +556,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3.4 音频预测\n", "\n", @@ -615,16 +566,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "top_k = 10\n", "wav_file = './dog.wav'\n", "\n", - "waveform, sr = load(wav_file)\n", - "feature_extractor = LogMelSpectrogram(sr=sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window='hann', n_mels=64)\n", + "waveform, _ = load(wav_file, sr)\n", "feats = feature_extractor(paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))\n", "feats = paddle.transpose(feats, [0, 2, 1]) # [B, N, T] -> [B, T, N]\n", "print(feats.shape)\n", @@ -635,16 +583,14 @@ "sorted_indices = probs[0].argsort()\n", "\n", "msg = f'[{wav_file}]\\n'\n", - "for idx in sorted_indices[-top_k:]:\n", + "for idx in sorted_indices[-1:-top_k-1:-1]:\n", " msg += f'{ESC50.label_list[idx]}: {probs[0][idx]:.5f}\\n'\n", "print(msg)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "# 4. 作业\n", "1. 使用开发模式安装 [PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech) \n", @@ -653,6 +599,7 @@ "1. 在 [MusicSpeech](http://marsyas.info/downloads/datasets.html) 数据集上完成 music/speech 二分类。 \n", "2. 在 [GTZAN Genre Collection](http://marsyas.info/downloads/datasets.html) 音乐分类数据集上利用 PANNs 预训练模型实现音乐类别十分类。\n", "\n", + "关于如何自定义分类数据集,请参考文档 [PaddleSpeech/docs/source/cls/custom_dataset.md](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/cls/custom_dataset.md)\n", "\n", "# 5. 关注 PaddleSpeech\n", "\n", @@ -681,9 +628,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "py37", "language": "python", - "name": "py35-paddle1.2.0" + "name": "py37" }, "language_info": { "codemirror_mode": { @@ -695,7 +642,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.7.7" } }, "nbformat": 4,
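Reviewer note, not part of the patch: the change threads one set of front-end parameters (`sr=32000`, `n_fft=1024`, `win_length=1024`, `hop_length=320`, `f_min=50.0`, `f_max=14000.0`, `n_mels=64`) through feature extraction, `ESC50`, and the prediction cell instead of hard-coding different values in each cell. A minimal standalone sketch assembled from the cells touched above (assuming paddlespeech/paddleaudio are installed and `./dog.wav` has been downloaded as in the notebook):

```python
# Sketch of the feature-extraction settings introduced by this patch
# (assumes paddleaudio is installed and ./dog.wav exists, as in the notebook).
import paddle
from paddleaudio import load
from paddleaudio.features import LogMelSpectrogram

sr = 32000            # sample rate used consistently after this change
n_fft = 1024
win_length = 1024
hop_length = 320      # was 512 before the patch
f_min, f_max, n_mels = 50.0, 14000.0, 64

data, _ = load(file='./dog.wav', sr=sr, mono=True, dtype='float32')
x = paddle.to_tensor(data).unsqueeze(0)   # [B, L]

feature_extractor = LogMelSpectrogram(
    sr=sr,
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    window='hann',
    f_min=f_min,
    f_max=f_max,
    n_mels=n_mels)

log_fbank = feature_extractor(x)          # [B, n_mels, T]
print(log_fbank.shape)
```

The other behavioural change is the prediction loop: `sorted_indices[-1:-top_k-1:-1]` walks the argsorted indices from highest to lowest probability, so the top-1 label is printed first, whereas the old `sorted_indices[-top_k:]` printed the top-k in ascending order. A tiny illustration with plain NumPy:

```python
import numpy as np

probs = np.array([0.10, 0.50, 0.20, 0.15, 0.05])
sorted_indices = probs.argsort()             # ascending: [4, 0, 3, 2, 1]
top_k = 3
print(list(sorted_indices[-top_k:]))         # [3, 2, 1]  (ascending probability)
print(list(sorted_indices[-1:-top_k-1:-1]))  # [1, 2, 3]  (descending probability)
```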