diff --git a/audio/paddleaudio/datasets/__init__.py b/audio/paddleaudio/datasets/__init__.py
index ebd4af98..f95fad30 100644
--- a/audio/paddleaudio/datasets/__init__.py
+++ b/audio/paddleaudio/datasets/__init__.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from .esc50 import ESC50
 from .gtzan import GTZAN
+from .hey_snips import HeySnips
 from .rirs_noises import OpenRIRNoise
 from .tess import TESS
 from .urban_sound import UrbanSound8K
diff --git a/audio/paddleaudio/datasets/dataset.py b/audio/paddleaudio/datasets/dataset.py
index 06e2df6d..488187a6 100644
--- a/audio/paddleaudio/datasets/dataset.py
+++ b/audio/paddleaudio/datasets/dataset.py
@@ -17,6 +17,8 @@ import numpy as np
 import paddle
 
 from ..backends import load as load_audio
+from ..compliance.kaldi import fbank as kaldi_fbank
+from ..compliance.kaldi import mfcc as kaldi_mfcc
 from ..compliance.librosa import melspectrogram
 from ..compliance.librosa import mfcc
 
@@ -24,6 +26,8 @@ feat_funcs = {
     'raw': None,
     'melspectrogram': melspectrogram,
     'mfcc': mfcc,
+    'kaldi_fbank': kaldi_fbank,
+    'kaldi_mfcc': kaldi_mfcc,
 }
 
 
@@ -73,16 +77,28 @@ class AudioClassificationDataset(paddle.io.Dataset):
         feat_func = feat_funcs[self.feat_type]
 
         record = {}
-        record['feat'] = feat_func(
-            waveform, sample_rate,
-            **self.feat_config) if feat_func else waveform
+        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
+            # The kaldi feature functions get a paddle tensor with a channel axis.
+            waveform = paddle.to_tensor(waveform).unsqueeze(0)  # (C, T)
+            # Use the rate of the loaded waveform, as the librosa branch does.
+            record['feat'] = feat_func(
+                waveform=waveform, sr=sample_rate, **self.feat_config)
+        else:
+            record['feat'] = feat_func(
+                waveform, sample_rate,
+                **self.feat_config) if feat_func else waveform
         record['label'] = label
         return record
 
     def __getitem__(self, idx):
         record = self._convert_to_record(idx)
-        return np.array(record['feat']).transpose(), np.array(
-            record['label'], dtype=np.int64)
+        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
+            # NOTE(review): `self.keys` is only assigned by HeySnips in this
+            # patch -- confirm other datasets define it before using kaldi_*.
+            return self.keys[idx], record['feat'], record['label']
+        else:
+            return np.array(record['feat']).transpose(), np.array(
+                record['label'], dtype=np.int64)
 
     def __len__(self):
         return len(self.files)
diff --git a/audio/paddleaudio/datasets/hey_snips.py b/audio/paddleaudio/datasets/hey_snips.py
new file mode 100644
index 00000000..53aebdf8
--- /dev/null
+++ b/audio/paddleaudio/datasets/hey_snips.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import json
+import os
+from typing import List
+from typing import Tuple
+
+from .dataset import AudioClassificationDataset
+
+__all__ = ['HeySnips']
+
+
+class HeySnips(AudioClassificationDataset):
+    """Hotword dataset listed in the `{data_dir}/{mode}.json` metadata file.
+
+    Each JSON entry provides `id`, `is_hotword`, `duration` and
+    `audio_file_path`; entries whose duration is not positive are skipped.
+    Hotword utterances get label 0 and all other utterances get label -1.
+    """
+
+    meta_info = collections.namedtuple('META_INFO',
+                                       ('key', 'label', 'duration', 'wav'))
+
+    def __init__(self,
+                 data_dir: os.PathLike,
+                 mode: str='train',
+                 feat_type: str='kaldi_fbank',
+                 sample_rate: int=16000,
+                 **kwargs):
+        self.data_dir = data_dir
+        # `_get_data` also populates `self.keys` as a side effect.
+        files, labels = self._get_data(mode)
+        super(HeySnips, self).__init__(
+            files=files,
+            labels=labels,
+            feat_type=feat_type,
+            sample_rate=sample_rate,
+            **kwargs)
+
+    def _get_meta_info(self, mode) -> List[collections.namedtuple]:
+        """Return one `meta_info` record per usable entry of `{mode}.json`."""
+        with open(os.path.join(self.data_dir, '{}.json'.format(mode)),
+                  'r') as f:
+            data = json.load(f)
+
+        ret = []
+        for item in data:
+            if item['duration'] > 0:
+                ret.append(
+                    self.meta_info(
+                        key=item['id'],
+                        label=0 if item['is_hotword'] == 1 else -1,
+                        duration=item['duration'],
+                        wav=os.path.join(self.data_dir,
+                                         item['audio_file_path'])))
+        return ret
+
+    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
+        """Return (wav paths, int labels); utterance ids go to `self.keys`."""
+        meta_info = self._get_meta_info(mode)
+
+        files = []
+        labels = []
+        self.keys = []
+        for sample in meta_info:
+            key, target, _, wav = sample
+            files.append(wav)
+            labels.append(int(target))
+            self.keys.append(key)
+
+        return files, labels
diff --git a/paddlespeech/kws/__init__.py b/paddlespeech/kws/__init__.py
index 97043fd7..9c6e278e 100644
--- a/paddlespeech/kws/__init__.py
+++ b/paddlespeech/kws/__init__.py
@@ -11,3 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .models.mdtc import MDTC
diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py
index 25b79baf..2cb14305 100644
--- a/paddlespeech/kws/models/mdtc.py
+++ b/paddlespeech/kws/models/mdtc.py
@@ -179,6 +179,7 @@ class MDTC(nn.Layer):
                          causal))
             self.receptive_fields += self.blocks[-1].receptive_fields
         self.half_receptive_fields = self.receptive_fields // 2
+        self.hidden_dim = res_channels
 
     def forward(self, x: paddle.Tensor):
         if self.causal:
@@ -216,3 +217,39 @@ class MDTC(nn.Layer):
             outputs += x
         outputs = outputs.transpose([0, 2, 1])
         return outputs, None
+
+
+class KWSModel(nn.Layer):
+    """Backbone followed by a linear layer and a sigmoid activation.
+
+    `backbone` must expose a `hidden_dim` attribute, which is used as the
+    input size of the linear layer; `num_keywords` is its output size.
+    """
+
+    def __init__(self, backbone, num_keywords):
+        super(KWSModel, self).__init__()
+        self.backbone = backbone
+        self.linear = nn.Linear(self.backbone.hidden_dim, num_keywords)
+        self.activation = nn.Sigmoid()
+
+    def forward(self, x):
+        outputs = self.backbone(x)
+        # MDTC.forward returns `(outputs, None)`; nn.Linear needs the tensor.
+        if isinstance(outputs, (tuple, list)):
+            outputs = outputs[0]
+        outputs = self.linear(outputs)
+        return self.activation(outputs)
+
+
+if __name__ == '__main__':
+    paddle.set_device('cpu')
+    from paddleaudio.features import LogMelSpectrogram
+    mdtc = MDTC(3, 4, 80, 32, 5, causal=True)
+
+    x = paddle.randn(shape=(32, 16000 * 5))
+    feature_extractor = LogMelSpectrogram(sr=16000, n_fft=512, n_mels=80)
+    feats = feature_extractor(x).transpose([0, 2, 1])
+    print(feats.shape)
+
+    res, _ = mdtc(feats)
+    print(res.shape)