Add KWS example.

pull/1558/head
KP 2 years ago
parent 521e222db8
commit e01abc5099

@ -13,6 +13,7 @@
# limitations under the License.
from .esc50 import ESC50
from .gtzan import GTZAN
from .hey_snips import HeySnips
from .rirs_noises import OpenRIRNoise
from .tess import TESS
from .urban_sound import UrbanSound8K

@ -17,6 +17,8 @@ import numpy as np
import paddle
from ..backends import load as load_audio
from ..compliance.kaldi import fbank as kaldi_fbank
from ..compliance.kaldi import mfcc as kaldi_mfcc
from ..compliance.librosa import melspectrogram
from ..compliance.librosa import mfcc
@ -24,6 +26,8 @@ feat_funcs = {
'raw': None,
'melspectrogram': melspectrogram,
'mfcc': mfcc,
'kaldi_fbank': kaldi_fbank,
'kaldi_mfcc': kaldi_mfcc,
}
@ -73,16 +77,24 @@ class AudioClassificationDataset(paddle.io.Dataset):
feat_func = feat_funcs[self.feat_type]
record = {}
record['feat'] = feat_func(
waveform, sample_rate,
**self.feat_config) if feat_func else waveform
if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
waveform = paddle.to_tensor(waveform).unsqueeze(0) # (C, T)
record['feat'] = feat_func(
waveform=waveform, sr=self.sample_rate, **self.feat_config)
else:
record['feat'] = feat_func(
waveform, sample_rate,
**self.feat_config) if feat_func else waveform
record['label'] = label
return record
# Return the dataset item at position `idx`, built from `_convert_to_record(idx)`.
# NOTE(review): this span comes from a rendered diff whose indentation and +/-
# markers were stripped. The unconditional two-line `return` directly below
# `record = ...` appears to be the pre-change statement shown next to its
# replacement (the `if`/`else` that follows) — confirm against the committed file.
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return np.array(record['feat']).transpose(), np.array(
record['label'], dtype=np.int64)
# Kaldi feature types: hand back (key, feat, label) without converting feat or
# label to numpy arrays.
if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
return self.keys[idx], record['feat'], record['label']
else:
# Every other feature type: transposed feature array plus the label as an
# int64 numpy array.
return np.array(record['feat']).transpose(), np.array(
record['label'], dtype=np.int64)
def __len__(self):
    """Return how many entries ``self.files`` holds."""
    file_list = self.files
    return len(file_list)

@ -0,0 +1,72 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import json
import os
from typing import List
from typing import Tuple
from .dataset import AudioClassificationDataset
__all__ = ['HeySnips']
class HeySnips(AudioClassificationDataset):
    """Keyword-spotting dataset described by per-split JSON manifests.

    Each split is read from ``<data_dir>/<mode>.json``, a JSON list whose
    items carry the keys ``id``, ``is_hotword``, ``duration`` and
    ``audio_file_path``. Items with ``is_hotword == 1`` get label ``0``;
    all other items get label ``-1``. Items whose ``duration`` is not
    positive are dropped.
    """

    # One manifest entry: utterance id, integer label, duration, wav path.
    meta_info = collections.namedtuple('META_INFO',
                                       ('key', 'label', 'duration', 'wav'))

    def __init__(self,
                 data_dir: os.PathLike,
                 mode: str = 'train',
                 feat_type: str = 'kaldi_fbank',
                 sample_rate: int = 16000,
                 **kwargs):
        """Load the manifest for ``mode`` and initialise the parent dataset.

        Args:
            data_dir: Directory holding ``<mode>.json``; the audio paths in
                the manifest are joined onto this directory.
            mode: Manifest name without the ``.json`` suffix.
            feat_type: Feature type forwarded to the parent class.
            sample_rate: Sample rate forwarded to the parent class.
            **kwargs: Extra keyword arguments forwarded unchanged to
                ``AudioClassificationDataset.__init__``.
        """
        self.data_dir = data_dir
        # _get_data also fills self.keys as a side effect.
        files, labels = self._get_data(mode)
        super(HeySnips, self).__init__(
            files=files,
            labels=labels,
            feat_type=feat_type,
            sample_rate=sample_rate,
            **kwargs)

    def _get_meta_info(self, mode) -> List[tuple]:
        """Parse ``<data_dir>/<mode>.json`` into ``meta_info`` tuples.

        Returns:
            One ``meta_info`` per manifest item with ``duration > 0``, in
            manifest order.

        Raises:
            KeyError: If a kept item lacks one of the expected keys.
        """
        manifest_path = os.path.join(self.data_dir, '{}.json'.format(mode))
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(manifest_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return [
            self.meta_info(
                key=item['id'],
                label=0 if item['is_hotword'] == 1 else -1,
                duration=item['duration'],
                wav=os.path.join(self.data_dir, item['audio_file_path']))
            for item in data if item['duration'] > 0
        ]

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """Return ``(files, labels)`` for ``mode`` and store ``self.keys``.

        ``self.keys`` holds the utterance ids, index-aligned with the
        returned lists.
        """
        meta_info = self._get_meta_info(mode)
        self.keys = [sample.key for sample in meta_info]
        files = [sample.wav for sample in meta_info]
        labels = [int(sample.label) for sample in meta_info]
        return files, labels

@ -11,3 +11,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .models.mdtc import MDTC

@ -179,6 +179,7 @@ class MDTC(nn.Layer):
causal))
self.receptive_fields += self.blocks[-1].receptive_fields
self.half_receptive_fields = self.receptive_fields // 2
self.hidden_dim = res_channels
def forward(self, x: paddle.Tensor):
if self.causal:
@ -216,3 +217,30 @@ class MDTC(nn.Layer):
outputs += x
outputs = outputs.transpose([0, 2, 1])
return outputs, None
class KWSModel(nn.Layer):
    """Keyword-spotting model: backbone -> linear projection -> sigmoid.

    Args:
        backbone: Layer that produces the features. It must expose a
            ``hidden_dim`` attribute, which is used as the input size of the
            linear layer.
        num_keywords: Output size of the linear layer.
    """

    def __init__(self, backbone, num_keywords):
        super(KWSModel, self).__init__()
        self.backbone = backbone
        self.linear = nn.Linear(self.backbone.hidden_dim, num_keywords)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through the backbone, the linear layer and the sigmoid."""
        outputs = self.backbone(x)
        # MDTC.forward in this file returns ``(outputs, None)``; keep only the
        # first element so the linear layer is not handed a tuple. Backbones
        # that return a bare tensor are passed through untouched.
        if isinstance(outputs, (tuple, list)):
            outputs = outputs[0]
        outputs = self.linear(outputs)
        return self.activation(outputs)
if __name__ == '__main__':
    # Smoke test: push random audio through a log-mel front end and an MDTC
    # backbone on CPU, printing the feature and output shapes.
    paddle.set_device('cpu')
    from paddleaudio.features import LogMelSpectrogram

    sample_rate = 16000
    num_mels = 80

    # NOTE(review): presumably the third positional argument (80) is the input
    # feature size and must match ``num_mels`` — confirm against MDTC.__init__.
    model = MDTC(3, 4, 80, 32, 5, causal=True)

    waveforms = paddle.randn(shape=(32, sample_rate * 5))
    to_log_mel = LogMelSpectrogram(sr=sample_rate, n_fft=512, n_mels=num_mels)

    # Swap the last two axes of the extractor output before feeding the model.
    features = to_log_mel(waveforms).transpose([0, 2, 1])
    print(features.shape)

    # MDTC.forward returns a pair; only the first element is inspected here.
    result, _ = model(features)
    print(result.shape)

Loading…
Cancel
Save