You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
225 lines
7.3 KiB
225 lines
7.3 KiB
3 years ago
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 13,
|
||
|
"id": "matched-camera",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from nnAudio import Spectrogram\n",
|
||
|
"from scipy.io import wavfile\n",
|
||
|
"import torch\n",
|
||
|
"import soundfile as sf\n",
|
||
|
"import numpy as np"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 19,
|
||
|
"id": "middle-salem",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"16000\n",
|
||
|
"[43 75 69 ... 7 6 3]\n",
|
||
|
"(83792,)\n",
|
||
|
"int16\n",
|
||
|
"sampling rate = 16000. Please make sure the sampling rate is correct in order toget a valid freq range\n",
|
||
|
"STFT kernels created, time used = 0.2142 seconds\n",
|
||
|
"tensor([[[[-4.0940e+03, 1.2600e+04],\n",
|
||
|
" [ 8.5108e+03, -5.4930e+03],\n",
|
||
|
" [-3.3631e+03, -1.7904e+03],\n",
|
||
|
" ...,\n",
|
||
|
" [ 8.2279e+03, -9.3340e+03],\n",
|
||
|
" [-3.1990e+03, 2.0969e+03],\n",
|
||
|
" [-1.2669e+03, 4.4488e+03]],\n",
|
||
|
"\n",
|
||
|
" [[ 3.4886e+03, -9.9620e+03],\n",
|
||
|
" [-4.5364e+03, 4.1907e+02],\n",
|
||
|
" [ 2.5074e+03, 7.1339e+03],\n",
|
||
|
" ...,\n",
|
||
|
" [-5.4819e+03, 3.9258e+01],\n",
|
||
|
" [ 4.7221e+03, 6.5887e+01],\n",
|
||
|
" [ 9.6492e+02, -3.4386e+03]],\n",
|
||
|
"\n",
|
||
|
" [[-3.4947e+03, 9.2981e+03],\n",
|
||
|
" [-7.5164e+03, 8.1856e+02],\n",
|
||
|
" [-5.3766e+03, -9.0889e+03],\n",
|
||
|
" ...,\n",
|
||
|
" [ 1.4317e+03, 5.7447e+03],\n",
|
||
|
" [-3.1178e+03, 3.0740e+03],\n",
|
||
|
" [-3.4351e+03, 5.6900e+02]],\n",
|
||
|
"\n",
|
||
|
" ...,\n",
|
||
|
"\n",
|
||
|
" [[ 6.7112e+01, -4.5737e+00],\n",
|
||
|
" [-9.6295e+00, 3.5554e+01],\n",
|
||
|
" [ 1.8527e+00, -1.0491e+01],\n",
|
||
|
" ...,\n",
|
||
|
" [-1.1157e+01, 3.4423e+00],\n",
|
||
|
" [ 3.1193e+00, -4.4388e+00],\n",
|
||
|
" [-8.8242e+00, 8.0324e+00]],\n",
|
||
|
"\n",
|
||
|
" [[-6.5080e+01, 2.9543e+00],\n",
|
||
|
" [ 3.9992e+01, -1.3836e+01],\n",
|
||
|
" [-9.2803e+00, 1.0318e+01],\n",
|
||
|
" ...,\n",
|
||
|
" [ 4.2928e+00, 9.2397e+00],\n",
|
||
|
" [ 3.6642e+00, 9.4680e+00],\n",
|
||
|
" [ 4.8932e+00, -2.5199e+01]],\n",
|
||
|
"\n",
|
||
|
" [[ 4.7264e+01, -1.0721e+00],\n",
|
||
|
" [-6.0516e+00, -1.4589e+01],\n",
|
||
|
" [ 1.3127e+01, 1.4995e+00],\n",
|
||
|
" ...,\n",
|
||
|
" [ 1.7333e+01, -1.4380e+01],\n",
|
||
|
" [-3.6046e+00, -6.1019e+00],\n",
|
||
|
" [ 1.3321e+01, 2.3184e+01]]]])\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
"# Load the utterance with SciPy; wavfile.read returns (sample_rate, samples).\n",
"sr, song = wavfile.read('./BAC009S0764W0124.wav')\n",
"\n",
"# Show the rate, the raw samples, their shape and their dtype, one per line.\n",
"print(sr, song, song.shape, song.dtype, sep='\\n')\n",
"\n",
"# Copy the samples into a float PyTorch tensor for the STFT layer.\n",
"x = torch.tensor(song).float()\n",
"\n",
"# Configure the nnAudio STFT layer, using the sample rate read from the file.\n",
"spec_layer = Spectrogram.STFT(\n",
"    n_fft=2048,\n",
"    freq_bins=None,\n",
"    hop_length=512,\n",
"    window='hann',\n",
"    freq_scale='linear',\n",
"    center=True,\n",
"    pad_mode='reflect',\n",
"    fmin=50,\n",
"    fmax=8000,\n",
"    sr=sr,\n",
")\n",
"\n",
"# Forward pass: waveform in, spectrogram out (kept as `spec` for the next cell).\n",
"spec = spec_layer(x)\n",
"print(spec)"
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 22,
|
||
|
"id": "finished-sterling",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"16000\n",
|
||
|
"[43 75 69 ... 7 6 3]\n",
|
||
|
"(83792,)\n",
|
||
|
"int16\n",
|
||
|
"True\n",
|
||
|
"sampling rate = 16000. Please make sure the sampling rate is correct in order toget a valid freq range\n",
|
||
|
"STFT kernels created, time used = 0.2495 seconds\n",
|
||
|
"torch.Size([1, 1025, 164, 2])\n",
|
||
|
"tensor([[[[-4.0940e+03, 1.2600e+04],\n",
|
||
|
" [ 8.5108e+03, -5.4930e+03],\n",
|
||
|
" [-3.3631e+03, -1.7904e+03],\n",
|
||
|
" ...,\n",
|
||
|
" [ 8.2279e+03, -9.3340e+03],\n",
|
||
|
" [-3.1990e+03, 2.0969e+03],\n",
|
||
|
" [-1.2669e+03, 4.4488e+03]],\n",
|
||
|
"\n",
|
||
|
" [[ 3.4886e+03, -9.9620e+03],\n",
|
||
|
" [-4.5364e+03, 4.1907e+02],\n",
|
||
|
" [ 2.5074e+03, 7.1339e+03],\n",
|
||
|
" ...,\n",
|
||
|
" [-5.4819e+03, 3.9258e+01],\n",
|
||
|
" [ 4.7221e+03, 6.5887e+01],\n",
|
||
|
" [ 9.6492e+02, -3.4386e+03]],\n",
|
||
|
"\n",
|
||
|
" [[-3.4947e+03, 9.2981e+03],\n",
|
||
|
" [-7.5164e+03, 8.1856e+02],\n",
|
||
|
" [-5.3766e+03, -9.0889e+03],\n",
|
||
|
" ...,\n",
|
||
|
" [ 1.4317e+03, 5.7447e+03],\n",
|
||
|
" [-3.1178e+03, 3.0740e+03],\n",
|
||
|
" [-3.4351e+03, 5.6900e+02]],\n",
|
||
|
"\n",
|
||
|
" ...,\n",
|
||
|
"\n",
|
||
|
" [[ 6.7112e+01, -4.5737e+00],\n",
|
||
|
" [-9.6295e+00, 3.5554e+01],\n",
|
||
|
" [ 1.8527e+00, -1.0491e+01],\n",
|
||
|
" ...,\n",
|
||
|
" [-1.1157e+01, 3.4423e+00],\n",
|
||
|
" [ 3.1193e+00, -4.4388e+00],\n",
|
||
|
" [-8.8242e+00, 8.0324e+00]],\n",
|
||
|
"\n",
|
||
|
" [[-6.5080e+01, 2.9543e+00],\n",
|
||
|
" [ 3.9992e+01, -1.3836e+01],\n",
|
||
|
" [-9.2803e+00, 1.0318e+01],\n",
|
||
|
" ...,\n",
|
||
|
" [ 4.2928e+00, 9.2397e+00],\n",
|
||
|
" [ 3.6642e+00, 9.4680e+00],\n",
|
||
|
" [ 4.8932e+00, -2.5199e+01]],\n",
|
||
|
"\n",
|
||
|
" [[ 4.7264e+01, -1.0721e+00],\n",
|
||
|
" [-6.0516e+00, -1.4589e+01],\n",
|
||
|
" [ 1.3127e+01, 1.4995e+00],\n",
|
||
|
" ...,\n",
|
||
|
" [ 1.7333e+01, -1.4380e+01],\n",
|
||
|
" [-3.6046e+00, -6.1019e+00],\n",
|
||
|
" [ 1.3321e+01, 2.3184e+01]]]])\n",
|
||
|
"True\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
"# Re-read the same file with soundfile to cross-check the SciPy loader used in\n",
"# the previous cell. sf.read returns (samples, sample_rate), the reverse order\n",
"# of wavfile.read. This cell relies on `song` and `spec` from the previous cell.\n",
"wav, sr = sf.read('./BAC009S0764W0124.wav', dtype='int16')\n",
"print(sr)\n",
"print(wav)\n",
"print(wav.shape)\n",
"print(wav.dtype)\n",
"# Integer PCM from both loaders must match sample-for-sample. array_equal is an\n",
"# exact comparison and yields False on a shape mismatch, whereas allclose\n",
"# applies floating-point tolerances and silently broadcasts.\n",
"print(np.array_equal(wav, song))\n",
"\n",
"x = torch.tensor(wav).float()  # casting the array into a PyTorch Tensor\n",
"\n",
"# Rebuild the layer with the same settings as the SciPy run, but with the\n",
"# sample rate reported by soundfile.\n",
"spec_layer = Spectrogram.STFT(n_fft=2048, freq_bins=None, hop_length=512,\n",
"                              window='hann', freq_scale='linear', center=True, pad_mode='reflect',\n",
"                              fmin=50, fmax=8000, sr=sr)  # Initializing the model\n",
"\n",
"wav_spec = spec_layer(x)  # Feed-forward your waveform to get the spectrogram\n",
"print(wav_spec.shape)\n",
"print(wav_spec)\n",
"# Both operands are torch tensors, so compare them in torch instead of going\n",
"# through an implicit NumPy conversion. The explicit shape check keeps\n",
"# broadcasting from hiding a size mismatch.\n",
"print(wav_spec.shape == spec.shape and torch.allclose(wav_spec, spec))"
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "running-technology",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.7.0"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|