diff --git a/.notebook/audio_feature.ipynb b/.notebook/audio_feature.ipynb new file mode 100644 index 000000000..5febb0aef --- /dev/null +++ b/.notebook/audio_feature.ipynb @@ -0,0 +1,224 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 13, + "id": "matched-camera", + "metadata": {}, + "outputs": [], + "source": [ + "from nnAudio import Spectrogram\n", + "from scipy.io import wavfile\n", + "import torch\n", + "import soundfile as sf\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "middle-salem", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16000\n", + "[43 75 69 ... 7 6 3]\n", + "(83792,)\n", + "int16\n", + "sampling rate = 16000. Please make sure the sampling rate is correct in order toget a valid freq range\n", + "STFT kernels created, time used = 0.2142 seconds\n", + "tensor([[[[-4.0940e+03, 1.2600e+04],\n", + " [ 8.5108e+03, -5.4930e+03],\n", + " [-3.3631e+03, -1.7904e+03],\n", + " ...,\n", + " [ 8.2279e+03, -9.3340e+03],\n", + " [-3.1990e+03, 2.0969e+03],\n", + " [-1.2669e+03, 4.4488e+03]],\n", + "\n", + " [[ 3.4886e+03, -9.9620e+03],\n", + " [-4.5364e+03, 4.1907e+02],\n", + " [ 2.5074e+03, 7.1339e+03],\n", + " ...,\n", + " [-5.4819e+03, 3.9258e+01],\n", + " [ 4.7221e+03, 6.5887e+01],\n", + " [ 9.6492e+02, -3.4386e+03]],\n", + "\n", + " [[-3.4947e+03, 9.2981e+03],\n", + " [-7.5164e+03, 8.1856e+02],\n", + " [-5.3766e+03, -9.0889e+03],\n", + " ...,\n", + " [ 1.4317e+03, 5.7447e+03],\n", + " [-3.1178e+03, 3.0740e+03],\n", + " [-3.4351e+03, 5.6900e+02]],\n", + "\n", + " ...,\n", + "\n", + " [[ 6.7112e+01, -4.5737e+00],\n", + " [-9.6295e+00, 3.5554e+01],\n", + " [ 1.8527e+00, -1.0491e+01],\n", + " ...,\n", + " [-1.1157e+01, 3.4423e+00],\n", + " [ 3.1193e+00, -4.4388e+00],\n", + " [-8.8242e+00, 8.0324e+00]],\n", + "\n", + " [[-6.5080e+01, 2.9543e+00],\n", + " [ 3.9992e+01, -1.3836e+01],\n", + " [-9.2803e+00, 1.0318e+01],\n", + " ...,\n", + " [ 4.2928e+00, 9.2397e+00],\n", + " [ 3.6642e+00, 9.4680e+00],\n", + " [ 4.8932e+00, -2.5199e+01]],\n", + "\n", + " [[ 4.7264e+01, -1.0721e+00],\n", + " [-6.0516e+00, -1.4589e+01],\n", + " [ 1.3127e+01, 1.4995e+00],\n", + " ...,\n", + " [ 1.7333e+01, -1.4380e+01],\n", + " [-3.6046e+00, -6.1019e+00],\n", + " [ 1.3321e+01, 2.3184e+01]]]])\n" + ] + } + ], + "source": [ + "sr, song = wavfile.read('./BAC009S0764W0124.wav') # Loading your audio\n", + "print(sr)\n", + "print(song)\n", + "print(song.shape)\n", + "print(song.dtype)\n", + "x = song\n", + "x = torch.tensor(x).float() # casting the array into a PyTorch Tensor\n", + "\n", + "spec_layer = Spectrogram.STFT(n_fft=2048, freq_bins=None, hop_length=512,\n", + " window='hann', freq_scale='linear', center=True, pad_mode='reflect',\n", + " fmin=50,fmax=8000, sr=sr) # Initializing the model\n", + "\n", + "spec = spec_layer(x) # Feed-forward your waveform to get the spectrogram\n", + "print(spec)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "finished-sterling", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16000\n", + "[43 75 69 ... 7 6 3]\n", + "(83792,)\n", + "int16\n", + "True\n", + "sampling rate = 16000. Please make sure the sampling rate is correct in order toget a valid freq range\n", + "STFT kernels created, time used = 0.2495 seconds\n", + "torch.Size([1, 1025, 164, 2])\n", + "tensor([[[[-4.0940e+03, 1.2600e+04],\n", + " [ 8.5108e+03, -5.4930e+03],\n", + " [-3.3631e+03, -1.7904e+03],\n", + " ...,\n", + " [ 8.2279e+03, -9.3340e+03],\n", + " [-3.1990e+03, 2.0969e+03],\n", + " [-1.2669e+03, 4.4488e+03]],\n", + "\n", + " [[ 3.4886e+03, -9.9620e+03],\n", + " [-4.5364e+03, 4.1907e+02],\n", + " [ 2.5074e+03, 7.1339e+03],\n", + " ...,\n", + " [-5.4819e+03, 3.9258e+01],\n", + " [ 4.7221e+03, 6.5887e+01],\n", + " [ 9.6492e+02, -3.4386e+03]],\n", + "\n", + " [[-3.4947e+03, 9.2981e+03],\n", + " [-7.5164e+03, 8.1856e+02],\n", + " [-5.3766e+03, -9.0889e+03],\n", + " ...,\n", + " [ 1.4317e+03, 5.7447e+03],\n", + " [-3.1178e+03, 3.0740e+03],\n", + " [-3.4351e+03, 5.6900e+02]],\n", + "\n", + " ...,\n", + "\n", + " [[ 6.7112e+01, -4.5737e+00],\n", + " [-9.6295e+00, 3.5554e+01],\n", + " [ 1.8527e+00, -1.0491e+01],\n", + " ...,\n", + " [-1.1157e+01, 3.4423e+00],\n", + " [ 3.1193e+00, -4.4388e+00],\n", + " [-8.8242e+00, 8.0324e+00]],\n", + "\n", + " [[-6.5080e+01, 2.9543e+00],\n", + " [ 3.9992e+01, -1.3836e+01],\n", + " [-9.2803e+00, 1.0318e+01],\n", + " ...,\n", + " [ 4.2928e+00, 9.2397e+00],\n", + " [ 3.6642e+00, 9.4680e+00],\n", + " [ 4.8932e+00, -2.5199e+01]],\n", + "\n", + " [[ 4.7264e+01, -1.0721e+00],\n", + " [-6.0516e+00, -1.4589e+01],\n", + " [ 1.3127e+01, 1.4995e+00],\n", + " ...,\n", + " [ 1.7333e+01, -1.4380e+01],\n", + " [-3.6046e+00, -6.1019e+00],\n", + " [ 1.3321e+01, 2.3184e+01]]]])\n", + "True\n" + ] + } + ], + "source": [ + "wav, sr = sf.read('./BAC009S0764W0124.wav', dtype='int16')\n", + "print(sr)\n", + "print(wav)\n", + "print(wav.shape)\n", + "print(wav.dtype)\n", + "print(np.allclose(wav, song))\n", + "\n", + "x = wav\n", + "x = torch.tensor(x).float() # casting the array into a PyTorch Tensor\n", + "\n", + "spec_layer = Spectrogram.STFT(n_fft=2048, freq_bins=None, hop_length=512,\n", + " window='hann', freq_scale='linear', center=True, pad_mode='reflect',\n", + " fmin=50,fmax=8000, sr=sr) # Initializing the model\n", + "\n", + "wav_spec = spec_layer(x) # Feed-forward your waveform to get the spectrogram\n", + "print(wav_spec.shape)\n", + "print(wav_spec)\n", + "print(np.allclose(wav_spec, spec))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "running-technology", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}