From 521e222db8eab16754d4e7f9985924a317add2fd Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Fri, 11 Mar 2022 17:35:51 +0800 Subject: [PATCH 1/6] Add mdtc model. --- paddlespeech/kws/__init__.py | 13 ++ paddlespeech/kws/models/__init__.py | 13 ++ paddlespeech/kws/models/mdtc.py | 218 ++++++++++++++++++++++++++++ 3 files changed, 244 insertions(+) create mode 100644 paddlespeech/kws/__init__.py create mode 100644 paddlespeech/kws/models/__init__.py create mode 100644 paddlespeech/kws/models/mdtc.py diff --git a/paddlespeech/kws/__init__.py b/paddlespeech/kws/__init__.py new file mode 100644 index 00000000..97043fd7 --- /dev/null +++ b/paddlespeech/kws/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/kws/models/__init__.py b/paddlespeech/kws/models/__init__.py new file mode 100644 index 00000000..97043fd7 --- /dev/null +++ b/paddlespeech/kws/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py new file mode 100644 index 00000000..25b79baf --- /dev/null +++ b/paddlespeech/kws/models/mdtc.py @@ -0,0 +1,218 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class DSDilatedConv1d(nn.Layer): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + dilation: int=1, + stride: int=1, + bias: bool=True, ): + super(DSDilatedConv1d, self).__init__() + self.receptive_fields = dilation * (kernel_size - 1) + self.conv = nn.Conv1D( + in_channels, + in_channels, + kernel_size, + padding=0, + dilation=dilation, + stride=stride, + groups=in_channels, + bias_attr=bias, ) + self.bn = nn.BatchNorm1D(in_channels) + self.pointwise = nn.Conv1D( + in_channels, + out_channels, + kernel_size=1, + padding=0, + dilation=1, + bias_attr=bias) + + def forward(self, inputs: paddle.Tensor): + outputs = self.conv(inputs) + outputs = self.bn(outputs) + outputs = self.pointwise(outputs) + return outputs + + +class TCNBlock(nn.Layer): + def __init__( + self, + in_channels: int, + res_channels: int, + kernel_size: int, + dilation: int, + causal: bool, ): + super(TCNBlock, self).__init__() + self.in_channels = in_channels + self.res_channels = res_channels + self.kernel_size = kernel_size + self.dilation = dilation + self.causal = causal + self.receptive_fields = dilation * (kernel_size - 1) + self.half_receptive_fields = self.receptive_fields // 2 + self.conv1 = DSDilatedConv1d( + in_channels=in_channels, + out_channels=res_channels, + kernel_size=kernel_size, + dilation=dilation, ) + self.bn1 = nn.BatchNorm1D(res_channels) + self.relu1 = nn.ReLU() + + self.conv2 = nn.Conv1D( + in_channels=res_channels, out_channels=res_channels, kernel_size=1) + self.bn2 = nn.BatchNorm1D(res_channels) + self.relu2 = nn.ReLU() + + def forward(self, inputs: paddle.Tensor): + outputs = self.relu1(self.bn1(self.conv1(inputs))) + outputs = self.bn2(self.conv2(outputs)) + if self.causal: + inputs = inputs[:, :, self.receptive_fields:] + else: + inputs = inputs[:, :, self.half_receptive_fields: + -self.half_receptive_fields] + if self.in_channels == self.res_channels: + res_out = self.relu2(outputs + inputs) + else: + res_out = self.relu2(outputs) + return res_out + + +class TCNStack(nn.Layer): + def __init__( + self, + in_channels: int, + stack_num: int, + stack_size: int, + res_channels: int, + kernel_size: int, + causal: bool, ): + super(TCNStack, self).__init__() + self.in_channels = in_channels + self.stack_num = stack_num + self.stack_size = stack_size + self.res_channels = res_channels + self.kernel_size = kernel_size + self.causal = causal + self.res_blocks = self.stack_tcn_blocks() + self.receptive_fields = self.calculate_receptive_fields() + self.res_blocks = nn.Sequential(*self.res_blocks) + + def calculate_receptive_fields(self): + receptive_fields = 0 + for block in self.res_blocks: + receptive_fields += block.receptive_fields + return receptive_fields + + def build_dilations(self): + dilations = [] + for s in range(0, self.stack_size): + for l in range(0, self.stack_num): + dilations.append(2**l) + return dilations + + def stack_tcn_blocks(self): + dilations = self.build_dilations() + res_blocks = nn.LayerList() + + res_blocks.append( + TCNBlock( + self.in_channels, + self.res_channels, + self.kernel_size, + dilations[0], + self.causal, )) + for dilation in dilations[1:]: + res_blocks.append( + TCNBlock( + self.res_channels, + self.res_channels, + self.kernel_size, + dilation, + self.causal, )) + return res_blocks + + def forward(self, inputs: paddle.Tensor): + outputs = self.res_blocks(inputs) + return outputs + + +class MDTC(nn.Layer): + def __init__( + self, + stack_num: int, + stack_size: int, + in_channels: int, + res_channels: int, + kernel_size: int, + causal: bool, ): + super(MDTC, self).__init__() + assert kernel_size % 2 == 1 + self.kernel_size = kernel_size + self.causal = causal + self.preprocessor = TCNBlock( + in_channels, res_channels, kernel_size, dilation=1, causal=causal) + self.relu = nn.ReLU() + self.blocks = nn.LayerList() + self.receptive_fields = self.preprocessor.receptive_fields + for i in range(stack_num): + self.blocks.append( + TCNStack(res_channels, stack_size, 1, res_channels, kernel_size, + causal)) + self.receptive_fields += self.blocks[-1].receptive_fields + self.half_receptive_fields = self.receptive_fields // 2 + + def forward(self, x: paddle.Tensor): + if self.causal: + outputs = F.pad(x, (0, 0, self.receptive_fields, 0, 0, 0), + 'constant') + else: + outputs = F.pad( + x, + (0, 0, self.half_receptive_fields, self.half_receptive_fields, + 0, 0), + 'constant', ) + outputs = outputs.transpose([0, 2, 1]) + outputs_list = [] + outputs = self.relu(self.preprocessor(outputs)) + for block in self.blocks: + outputs = block(outputs) + outputs_list.append(outputs) + + normalized_outputs = [] + output_size = outputs_list[-1].shape[-1] + for x in outputs_list: + remove_length = x.shape[-1] - output_size + if self.causal and remove_length > 0: + normalized_outputs.append(x[:, :, remove_length:]) + elif not self.causal and remove_length > 1: + half_remove_length = remove_length // 2 + normalized_outputs.append( + x[:, :, half_remove_length:-half_remove_length]) + else: + normalized_outputs.append(x) + + outputs = paddle.zeros_like( + outputs_list[-1], dtype=outputs_list[-1].dtype) + for x in normalized_outputs: + outputs += x + outputs = outputs.transpose([0, 2, 1]) + return outputs, None From e01abc50991df40033144a5649c670aaeae47ba8 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 7 Apr 2022 19:48:18 +0800 Subject: [PATCH 2/6] Add KWS example. --- audio/paddleaudio/datasets/__init__.py | 1 + audio/paddleaudio/datasets/dataset.py | 22 ++++++-- audio/paddleaudio/datasets/hey_snips.py | 72 +++++++++++++++++++++++++ paddlespeech/kws/__init__.py | 1 + paddlespeech/kws/models/mdtc.py | 28 ++++++++++ 5 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 audio/paddleaudio/datasets/hey_snips.py diff --git a/audio/paddleaudio/datasets/__init__.py b/audio/paddleaudio/datasets/__init__.py index ebd4af98..f95fad30 100644 --- a/audio/paddleaudio/datasets/__init__.py +++ b/audio/paddleaudio/datasets/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from .esc50 import ESC50 from .gtzan import GTZAN +from .hey_snips import HeySnips from .rirs_noises import OpenRIRNoise from .tess import TESS from .urban_sound import UrbanSound8K diff --git a/audio/paddleaudio/datasets/dataset.py b/audio/paddleaudio/datasets/dataset.py index 06e2df6d..488187a6 100644 --- a/audio/paddleaudio/datasets/dataset.py +++ b/audio/paddleaudio/datasets/dataset.py @@ -17,6 +17,8 @@ import numpy as np import paddle from ..backends import load as load_audio +from ..compliance.kaldi import fbank as kaldi_fbank +from ..compliance.kaldi import mfcc as kaldi_mfcc from ..compliance.librosa import melspectrogram from ..compliance.librosa import mfcc @@ -24,6 +26,8 @@ feat_funcs = { 'raw': None, 'melspectrogram': melspectrogram, 'mfcc': mfcc, + 'kaldi_fbank': kaldi_fbank, + 'kaldi_mfcc': kaldi_mfcc, } @@ -73,16 +77,24 @@ class AudioClassificationDataset(paddle.io.Dataset): feat_func = feat_funcs[self.feat_type] record = {} - record['feat'] = feat_func( - waveform, sample_rate, - **self.feat_config) if feat_func else waveform + if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']: + waveform = paddle.to_tensor(waveform).unsqueeze(0) # (C, T) + record['feat'] = feat_func( + waveform=waveform, sr=self.sample_rate, **self.feat_config) + else: + record['feat'] = feat_func( + waveform, sample_rate, + **self.feat_config) if feat_func else waveform record['label'] = label return record def __getitem__(self, idx): record = self._convert_to_record(idx) - return np.array(record['feat']).transpose(), np.array( - record['label'], dtype=np.int64) + if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']: + return self.keys[idx], record['feat'], record['label'] + else: + return np.array(record['feat']).transpose(), np.array( + record['label'], dtype=np.int64) def __len__(self): return len(self.files) diff --git a/audio/paddleaudio/datasets/hey_snips.py b/audio/paddleaudio/datasets/hey_snips.py new file mode 100644 index 00000000..53aebdf8 --- /dev/null +++ b/audio/paddleaudio/datasets/hey_snips.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import json +import os +from typing import List +from typing import Tuple + +from .dataset import AudioClassificationDataset + +__all__ = ['HeySnips'] + + +class HeySnips(AudioClassificationDataset): + meta_info = collections.namedtuple('META_INFO', + ('key', 'label', 'duration', 'wav')) + + def __init__(self, + data_dir: os.PathLike, + mode: str='train', + feat_type: str='kaldi_fbank', + sample_rate: int=16000, + **kwargs): + self.data_dir = data_dir + files, labels = self._get_data(mode) + super(HeySnips, self).__init__( + files=files, + labels=labels, + feat_type=feat_type, + sample_rate=sample_rate, + **kwargs) + + def _get_meta_info(self, mode) -> List[collections.namedtuple]: + ret = [] + with open(os.path.join(self.data_dir, '{}.json'.format(mode)), + 'r') as f: + data = json.load(f) + for item in data: + sample = collections.OrderedDict() + if item['duration'] > 0: + sample['key'] = item['id'] + sample['label'] = 0 if item['is_hotword'] == 1 else -1 + sample['duration'] = item['duration'] + sample['wav'] = os.path.join(self.data_dir, + item['audio_file_path']) + ret.append(self.meta_info(*sample.values())) + return ret + + def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: + meta_info = self._get_meta_info(mode) + + files = [] + labels = [] + self.keys = [] + for sample in meta_info: + key, target, _, wav = sample + files.append(wav) + labels.append(int(target)) + self.keys.append(key) + + return files, labels diff --git a/paddlespeech/kws/__init__.py b/paddlespeech/kws/__init__.py index 97043fd7..9c6e278e 100644 --- a/paddlespeech/kws/__init__.py +++ b/paddlespeech/kws/__init__.py @@ -11,3 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .models.mdtc import MDTC diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py index 25b79baf..2cb14305 100644 --- a/paddlespeech/kws/models/mdtc.py +++ b/paddlespeech/kws/models/mdtc.py @@ -179,6 +179,7 @@ class MDTC(nn.Layer): causal)) self.receptive_fields += self.blocks[-1].receptive_fields self.half_receptive_fields = self.receptive_fields // 2 + self.hidden_dim = res_channels def forward(self, x: paddle.Tensor): if self.causal: @@ -216,3 +217,30 @@ class MDTC(nn.Layer): outputs += x outputs = outputs.transpose([0, 2, 1]) return outputs, None + + +class KWSModel(nn.Layer): + def __init__(self, backbone, num_keywords): + super(KWSModel, self).__init__() + self.backbone = backbone + self.linear = nn.Linear(self.backbone.hidden_dim, num_keywords) + self.activation = nn.Sigmoid() + + def forward(self, x): + outputs = self.backbone(x) + outputs = self.linear(outputs) + return self.activation(outputs) + + +if __name__ == '__main__': + paddle.set_device('cpu') + from paddleaudio.features import LogMelSpectrogram + mdtc = MDTC(3, 4, 80, 32, 5, causal=True) + + x = paddle.randn(shape=(32, 16000 * 5)) + feature_extractor = LogMelSpectrogram(sr=16000, n_fft=512, n_mels=80) + feats = feature_extractor(x).transpose([0, 2, 1]) + print(feats.shape) + + res, _ = mdtc(feats) + print(res.shape) From b60b1daddeffd53be8c158e045049b94ae1d1ae9 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Fri, 8 Apr 2022 21:35:56 +0800 Subject: [PATCH 3/6] Add KWS example. --- examples/hey_snips/README.md | 0 examples/hey_snips/kws0/RESULTS.md | 0 examples/hey_snips/kws0/path.sh | 28 +++ examples/hey_snips/kws0/run.sh | 47 +++++ paddlespeech/kws/exps/mdtc/compute_det.py | 121 +++++++++++ paddlespeech/kws/exps/mdtc/plot_det_curve.py | 63 ++++++ paddlespeech/kws/exps/mdtc/score.py | 103 ++++++++++ paddlespeech/kws/exps/mdtc/train.py | 205 +++++++++++++++++++ 8 files changed, 567 insertions(+) create mode 100644 examples/hey_snips/README.md create mode 100644 examples/hey_snips/kws0/RESULTS.md create mode 100755 examples/hey_snips/kws0/path.sh create mode 100755 examples/hey_snips/kws0/run.sh create mode 100644 paddlespeech/kws/exps/mdtc/compute_det.py create mode 100644 paddlespeech/kws/exps/mdtc/plot_det_curve.py create mode 100644 paddlespeech/kws/exps/mdtc/score.py create mode 100644 paddlespeech/kws/exps/mdtc/train.py diff --git a/examples/hey_snips/README.md b/examples/hey_snips/README.md new file mode 100644 index 00000000..e69de29b diff --git a/examples/hey_snips/kws0/RESULTS.md b/examples/hey_snips/kws0/RESULTS.md new file mode 100644 index 00000000..e69de29b diff --git a/examples/hey_snips/kws0/path.sh b/examples/hey_snips/kws0/path.sh new file mode 100755 index 00000000..54a430d4 --- /dev/null +++ b/examples/hey_snips/kws0/path.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +MODEL=mdtc +export BIN_DIR=${MAIN_ROOT}/paddlespeech/kws/exps/${MODEL} \ No newline at end of file diff --git a/examples/hey_snips/kws0/run.sh b/examples/hey_snips/kws0/run.sh new file mode 100755 index 00000000..69b1ad6a --- /dev/null +++ b/examples/hey_snips/kws0/run.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +. ./path.sh +set -e + +stage=0 +stop_stage=50 + +# data directory +# if we set the variable ${dir}, we will store the wav info to this directory +# otherwise, we will store the wav info to vox1 and vox2 directory respectively +# vox2 wav path, we must convert the m4a format to wav format +dir=data/ # data info directory + +exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory +conf_path=conf/mdtc.yaml +gpus=0,1,2,3 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +mkdir -p ${exp_dir} + +if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav + bash ./local/data.sh ${dir} ${conf_path}|| exit -1; +fi + +if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then + CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path} +fi + +if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path} +fi diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py new file mode 100644 index 00000000..19a3fe14 --- /dev/null +++ b/paddlespeech/kws/exps/mdtc/compute_det.py @@ -0,0 +1,121 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import os +import sys + +from tqdm import tqdm + + +def load_label_and_score(keyword, label_file, score_file): + # score_table: {uttid: [keywordlist]} + score_table = {} + with open(score_file, 'r', encoding='utf8') as fin: + for line in fin: + arr = line.strip().split() + key = arr[0] + current_keyword = arr[1] + str_list = arr[2:] + if int(current_keyword) == keyword: + scores = list(map(float, str_list)) + if key not in score_table: + score_table.update({key: scores}) + keyword_table = {} + filler_table = {} + filler_duration = 0.0 + with open(label_file, 'r', encoding='utf8') as fin: + for line in fin: + obj = json.loads(line.strip()) + assert 'key' in obj + assert 'txt' in obj + assert 'duration' in obj + key = obj['key'] + index = obj['txt'] + duration = obj['duration'] + assert key in score_table + if index == keyword: + keyword_table[key] = score_table[key] + else: + filler_table[key] = score_table[key] + filler_duration += duration + return keyword_table, filler_table, filler_duration + + +class Args: + def __init__(self): + self.test_data = '/ssd3/chenxiaojie06/PaddleSpeech/DeepSpeech/paddlespeech/kws/models/data/test/data.list' + self.keyword = 0 + self.score_file = os.path.join( + os.path.abspath(sys.argv[1]), 'score.txt') + self.stats_file = os.path.join( + os.path.abspath(sys.argv[1]), 'stats.0.txt') + self.step = 0.01 + self.window_shift = 50 + + +args = Args() + +if __name__ == '__main__': + # parser = argparse.ArgumentParser(description='compute det curve') + # parser.add_argument('--test_data', required=True, help='label file') + # parser.add_argument('--keyword', type=int, default=0, help='keyword label') + # parser.add_argument('--score_file', required=True, help='score file') + # parser.add_argument('--step', type=float, default=0.01, + # help='threshold step') + # parser.add_argument('--window_shift', type=int, default=50, + # help='window_shift is used to skip the frames after triggered') + # parser.add_argument('--stats_file', + # required=True, + # help='false reject/alarm stats file') + # args = parser.parse_args() + + window_shift = args.window_shift + keyword_table, filler_table, filler_duration = load_label_and_score( + args.keyword, args.test_data, args.score_file) + print('Filler total duration Hours: {}'.format(filler_duration / 3600.0)) + pbar = tqdm(total=int(1.0 / args.step)) + with open(args.stats_file, 'w', encoding='utf8') as fout: + keyword_index = int(args.keyword) + threshold = 0.0 + while threshold <= 1.0: + num_false_reject = 0 + # transverse the all keyword_table + for key, score_list in keyword_table.items(): + # computer positive test sample, use the max score of list. + score = max(score_list) + if float(score) < threshold: + num_false_reject += 1 + num_false_alarm = 0 + # transverse the all filler_table + for key, score_list in filler_table.items(): + i = 0 + while i < len(score_list): + if score_list[i] >= threshold: + num_false_alarm += 1 + i += window_shift + else: + i += 1 + if len(keyword_table) != 0: + false_reject_rate = num_false_reject / len(keyword_table) + num_false_alarm = max(num_false_alarm, 1e-6) + if filler_duration != 0: + false_alarm_per_hour = num_false_alarm / \ + (filler_duration / 3600.0) + fout.write('{:.6f} {:.6f} {:.6f}\n'.format( + threshold, false_alarm_per_hour, false_reject_rate)) + threshold += args.step + pbar.update(1) + + pbar.close() + print('DET saved to: {}'.format(args.stats_file)) diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py new file mode 100644 index 00000000..7986574f --- /dev/null +++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py @@ -0,0 +1,63 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +import matplotlib.pyplot as plt +import numpy as np + + +def load_stats_file(stats_file): + values = [] + with open(stats_file, 'r', encoding='utf8') as fin: + for line in fin: + arr = line.strip().split() + threshold, fa_per_hour, frr = arr + values.append([float(fa_per_hour), float(frr) * 100]) + values.reverse() + return np.array(values) + + +def plot_det_curve(keywords, stats_dir, figure_file, xlim, x_step, ylim, + y_step): + plt.figure(dpi=200) + plt.rcParams['xtick.direction'] = 'in' + plt.rcParams['ytick.direction'] = 'in' + plt.rcParams['font.size'] = 12 + + for index, keyword in enumerate(keywords): + stats_file = os.path.join(stats_dir, 'stats.' + str(index) + '.txt') + values = load_stats_file(stats_file) + plt.plot(values[:, 0], values[:, 1], label=keyword) + + plt.xlim([0, xlim]) + plt.ylim([0, ylim]) + plt.xticks(range(0, xlim + x_step, x_step)) + plt.yticks(range(0, ylim + y_step, y_step)) + plt.xlabel('False Alarm Per Hour') + plt.ylabel('False Rejection Rate (\\%)') + plt.grid(linestyle='--') + plt.legend(loc='best', fontsize=16) + plt.savefig(figure_file) + + +if __name__ == '__main__': + + keywords = ['Hey_Snips'] + img_path = os.path.join(os.path.abspath(sys.argv[1]), 'det.png') + + plot_det_curve(keywords, + os.path.abspath(sys.argv[1]), img_path, 10, 2, 10, 2) + + print('DET curve image saved to: {}'.format(img_path)) diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py new file mode 100644 index 00000000..9fdbcf49 --- /dev/null +++ b/paddlespeech/kws/exps/mdtc/score.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +import time + +import paddle +from mdtc import KWSModel +from mdtc import MDTC +from tqdm import tqdm + +from paddleaudio.datasets import HeySnips + + +def collate_features(batch): + # (key, feat, label) in one sample + collate_start = time.time() + keys = [] + feats = [] + labels = [] + lengths = [] + for sample in batch: + keys.append(sample[0]) + feats.append(sample[1]) + labels.append(sample[2]) + lengths.append(sample[1].shape[0]) + + max_length = max(lengths) + for i in range(len(feats)): + feats[i] = paddle.nn.functional.pad( + feats[i], [0, max_length - feats[i].shape[0], 0, 0], + data_format='NLC') + + return keys, paddle.stack(feats), paddle.to_tensor( + labels), paddle.to_tensor(lengths) + + +if __name__ == '__main__': + # Dataset + feat_conf = { + # 'n_mfcc': 80, + 'n_mels': 80, + 'frame_shift': 10, + 'frame_length': 25, + # 'dither': 1.0, + } + test_ds = HeySnips( + mode='test', feat_type='kaldi_fbank', sample_rate=16000, **feat_conf) + test_sampler = paddle.io.BatchSampler( + test_ds, batch_size=32, drop_last=False) + test_loader = paddle.io.DataLoader( + test_ds, + batch_sampler=test_sampler, + num_workers=16, + return_list=True, + use_buffer_reader=True, + collate_fn=collate_features, ) + + # Model + backbone = MDTC( + stack_num=3, + stack_size=4, + in_channels=80, + res_channels=32, + kernel_size=5, + causal=True, ) + model = KWSModel(backbone=backbone, num_keywords=1) + model = paddle.DataParallel(model) + # kws_checkpoint = '/ssd3/chenxiaojie06/PaddleSpeech/DeepSpeech/paddlespeech/kws/models/checkpoint/epoch_10_0.8903940343290826/model.pdparams' + kws_checkpoint = os.path.join( + os.path.abspath(sys.argv[1]), 'model.pdparams') + model.set_state_dict(paddle.load(kws_checkpoint)) + model.eval() + + score_abs_path = os.path.join(os.path.abspath(sys.argv[1]), 'score.txt') + with paddle.no_grad(), open(score_abs_path, 'w', encoding='utf8') as fout: + for batch_idx, batch in enumerate( + tqdm(test_loader, total=len(test_loader))): + keys, feats, labels, lengths = batch + logits = model(feats) + num_keywords = logits.shape[2] + for i in range(len(keys)): + key = keys[i] + score = logits[i][:lengths[i]] + for keyword_i in range(num_keywords): + keyword_scores = score[:, keyword_i] + score_frames = ' '.join( + ['{:.6f}'.format(x) for x in keyword_scores.tolist()]) + fout.write( + '{} {} {}\n'.format(key, keyword_i, score_frames)) + + print('Scores saved to: {}'.format(score_abs_path)) diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py new file mode 100644 index 00000000..17a9acfc --- /dev/null +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -0,0 +1,205 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import time + +import paddle +from loss import max_pooling_loss +from mdtc import KWSModel +from mdtc import MDTC + +from paddleaudio.datasets import HeySnips +from paddleaudio.utils import logger +from paddleaudio.utils import Timer + + +def collate_features(batch): + # (key, feat, label) + collate_start = time.time() + keys = [] + feats = [] + labels = [] + lengths = [] + for sample in batch: + keys.append(sample[0]) + feats.append(sample[1]) + labels.append(sample[2]) + lengths.append(sample[1].shape[0]) + + max_length = max(lengths) + for i in range(len(feats)): + feats[i] = paddle.nn.functional.pad( + feats[i], [0, max_length - feats[i].shape[0], 0, 0], + data_format='NLC') + + return keys, paddle.stack(feats), paddle.to_tensor( + labels), paddle.to_tensor(lengths) + + +if __name__ == '__main__': + # Dataset + feat_conf = { + # 'n_mfcc': 80, + 'n_mels': 80, + 'frame_shift': 10, + 'frame_length': 25, + # 'dither': 1.0, + } + data_dir = '/ssd1/chenxiaojie06/datasets/hey_snips/hey_snips_research_6k_en_train_eval_clean_ter' + train_ds = HeySnips( + data_dir=data_dir, + mode='train', + feat_type='kaldi_fbank', + sample_rate=16000, + **feat_conf) + dev_ds = HeySnips( + data_dir=data_dir, + mode='dev', + feat_type='kaldi_fbank', + sample_rate=16000, + **feat_conf) + + training_conf = { + 'epochs': 100, + 'learning_rate': 0.001, + 'weight_decay': 0.00005, + 'num_workers': 16, + 'batch_size': 100, + 'checkpoint_dir': './checkpoint', + 'save_freq': 10, + 'log_freq': 10, + } + + train_sampler = paddle.io.BatchSampler( + train_ds, + batch_size=training_conf['batch_size'], + shuffle=True, + drop_last=False) + train_loader = paddle.io.DataLoader( + train_ds, + batch_sampler=train_sampler, + num_workers=training_conf['num_workers'], + return_list=True, + use_buffer_reader=True, + collate_fn=collate_features, ) + + # Model + backbone = MDTC( + stack_num=3, + stack_size=4, + in_channels=80, + res_channels=32, + kernel_size=5, + causal=True, ) + model = KWSModel(backbone=backbone, num_keywords=1) + model = paddle.DataParallel(model) + clip = paddle.nn.ClipGradByGlobalNorm(5.0) + optimizer = paddle.optimizer.Adam( + learning_rate=training_conf['learning_rate'], + weight_decay=training_conf['weight_decay'], + parameters=model.parameters(), + grad_clip=clip) + criterion = max_pooling_loss + + steps_per_epoch = len(train_sampler) + timer = Timer(steps_per_epoch * training_conf['epochs']) + timer.start() + + for epoch in range(1, training_conf['epochs'] + 1): + model.train() + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + batch_start = time.time() + for batch_idx, batch in enumerate(train_loader): + # print('Fetch one batch: {:.4f}'.format(time.time()-batch_start)) + keys, feats, labels, lengths = batch + logits = model(feats) + loss, corrects, acc = criterion(logits, labels, lengths) + loss.backward() + optimizer.step() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + optimizer.clear_grad() + + # Calculate loss + avg_loss += loss.numpy()[0] + + # Calculate metrics + num_corrects += corrects + num_samples += feats.shape[0] + + timer.count() + + if (batch_idx + 1) % training_conf['log_freq'] == 0: + lr = optimizer.get_lr() + avg_loss /= training_conf['log_freq'] + avg_acc = num_corrects / num_samples + + print_msg = 'Epoch={}/{}, Step={}/{}'.format( + epoch, training_conf['epochs'], batch_idx + 1, + steps_per_epoch) + print_msg += ' loss={:.4f}'.format(avg_loss) + print_msg += ' acc={:.4f}'.format(avg_acc) + print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( + lr, timer.timing, timer.eta) + logger.train(print_msg) + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + batch_start = time.time() + + if epoch % training_conf[ + 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch: + dev_sampler = paddle.io.BatchSampler( + dev_ds, + batch_size=training_conf['batch_size'], + shuffle=False, + drop_last=False) + dev_loader = paddle.io.DataLoader( + dev_ds, + batch_sampler=dev_sampler, + num_workers=training_conf['num_workers'], + return_list=True, + use_buffer_reader=True, + collate_fn=collate_features, ) + + model.eval() + num_corrects = 0 + num_samples = 0 + with logger.processing('Evaluation on validation dataset'): + for batch_idx, batch in enumerate(dev_loader): + keys, feats, labels, lengths = batch + logits = model(feats) + loss, corrects, acc = criterion(logits, labels, lengths) + num_corrects += corrects + num_samples += feats.shape[0] + + eval_acc = num_corrects / num_samples + print_msg = '[Evaluation result]' + print_msg += ' dev_acc={:.4f}'.format(eval_acc) + + logger.eval(print_msg) + + # Save model + save_dir = os.path.join(training_conf['checkpoint_dir'], + 'epoch_{}_{:.4f}'.format(epoch, eval_acc)) + logger.info('Saving model checkpoint to {}'.format(save_dir)) + paddle.save(model.state_dict(), + os.path.join(save_dir, 'model.pdparams')) + paddle.save(optimizer.state_dict(), + os.path.join(save_dir, 'model.pdopt')) From f9761d532ce2dd32d8f99f96794b1d7d5c6bf57e Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Tue, 19 Apr 2022 17:01:21 +0800 Subject: [PATCH 4/6] Add KWS example. --- audio/paddleaudio/datasets/hey_snips.py | 4 +- examples/hey_snips/README.md | 22 ++++ examples/hey_snips/RESULTS.md | 8 ++ examples/hey_snips/kws0/RESULTS.md | 0 examples/hey_snips/kws0/conf/mdtc.yaml | 39 +++++++ examples/hey_snips/kws0/local/plot.sh | 2 + examples/hey_snips/kws0/local/score.sh | 5 + examples/hey_snips/kws0/local/train.sh | 12 ++ examples/hey_snips/kws0/run.sh | 35 ++---- paddlespeech/kws/exps/mdtc/collate.py | 39 +++++++ paddlespeech/kws/exps/mdtc/compute_det.py | 90 +++++++-------- paddlespeech/kws/exps/mdtc/plot_det_curve.py | 29 +++-- paddlespeech/kws/exps/mdtc/score.py | 84 +++++--------- paddlespeech/kws/exps/mdtc/train.py | 111 +++++++------------ paddlespeech/kws/models/__init__.py | 2 + paddlespeech/kws/models/loss.py | 80 +++++++++++++ paddlespeech/kws/models/mdtc.py | 17 +-- 17 files changed, 354 insertions(+), 225 deletions(-) create mode 100644 examples/hey_snips/RESULTS.md delete mode 100644 examples/hey_snips/kws0/RESULTS.md create mode 100644 examples/hey_snips/kws0/conf/mdtc.yaml create mode 100755 examples/hey_snips/kws0/local/plot.sh create mode 100755 examples/hey_snips/kws0/local/score.sh create mode 100755 examples/hey_snips/kws0/local/train.sh create mode 100644 paddlespeech/kws/exps/mdtc/collate.py create mode 100644 paddlespeech/kws/models/loss.py diff --git a/audio/paddleaudio/datasets/hey_snips.py b/audio/paddleaudio/datasets/hey_snips.py index 53aebdf8..7a67b843 100644 --- a/audio/paddleaudio/datasets/hey_snips.py +++ b/audio/paddleaudio/datasets/hey_snips.py @@ -63,10 +63,12 @@ class HeySnips(AudioClassificationDataset): files = [] labels = [] self.keys = [] + self.durations = [] for sample in meta_info: - key, target, _, wav = sample + key, target, duration, wav = sample files.append(wav) labels.append(int(target)) self.keys.append(key) + self.durations.append(float(duration)) return files, labels diff --git a/examples/hey_snips/README.md b/examples/hey_snips/README.md index e69de29b..be8d142b 100644 --- a/examples/hey_snips/README.md +++ b/examples/hey_snips/README.md @@ -0,0 +1,22 @@ +# MDTC Keyword Spotting with HeySnips Dataset + +## Dataset + +Before running scripts, you **MUST** follow this instruction to download the dataset: https://github.com/sonos/keyword-spotting-research-datasets + +After you download and decompress the dataset archive, you should **REPLACE** the value of `data_dir` in `conf/*.yaml` to complete dataset config. + +## Get Started + +In this section, we will train the [MDTC](https://arxiv.org/pdf/2102.13552.pdf) model and evaluate on "Hey Snips" dataset. + +```sh +CUDA_VISIBLE_DEVICES=0,1 ./run.sh conf/mdtc.yaml +``` + +This script contains training and scoring steps. You can just set the `CUDA_VISIBLE_DEVICES` environment var to run on single gpu or multi-gpus. + +The vars `stage` and `stop_stage` in `./run.sh` controls the running steps: +- stage 1: Training from scratch. +- stage 2: Evaluating model on test dataset and computing detection error tradeoff(DET) of all trigger thresholds. +- stage 3: Plotting the DET cruve for visualizaiton. diff --git a/examples/hey_snips/RESULTS.md b/examples/hey_snips/RESULTS.md new file mode 100644 index 00000000..ba263906 --- /dev/null +++ b/examples/hey_snips/RESULTS.md @@ -0,0 +1,8 @@ + +## Metrics + +We mesure FRRs with fixing false alarms in one hour: + +|Model|False Alarm| False Reject Rate| +|--|--|--| +|MDTC| 1| 0.003559 | diff --git a/examples/hey_snips/kws0/RESULTS.md b/examples/hey_snips/kws0/RESULTS.md deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/hey_snips/kws0/conf/mdtc.yaml b/examples/hey_snips/kws0/conf/mdtc.yaml new file mode 100644 index 00000000..3ce9f9d0 --- /dev/null +++ b/examples/hey_snips/kws0/conf/mdtc.yaml @@ -0,0 +1,39 @@ +data: + data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter' + dataset: 'paddleaudio.datasets:HeySnips' + +model: + num_keywords: 1 + backbone: 'paddlespeech.kws.models:MDTC' + config: + stack_num: 3 + stack_size: 4 + in_channels: 80 + res_channels: 32 + kernel_size: 5 + +feature: + feat_type: 'kaldi_fbank' + sample_rate: 16000 + frame_shift: 10 + frame_length: 25 + n_mels: 80 + +training: + epochs: 100 + num_workers: 16 + batch_size: 100 + checkpoint_dir: './checkpoint' + save_freq: 10 + log_freq: 10 + learning_rate: 0.001 + weight_decay: 0.00005 + grad_clip: 5.0 + +scoring: + batch_size: 100 + num_workers: 16 + checkpoint: './checkpoint/epoch_100/model.pdparams' + score_file: './scores.txt' + stats_file: './stats.0.txt' + img_file: './det.png' \ No newline at end of file diff --git a/examples/hey_snips/kws0/local/plot.sh b/examples/hey_snips/kws0/local/plot.sh new file mode 100755 index 00000000..5869e50b --- /dev/null +++ b/examples/hey_snips/kws0/local/plot.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python3 ${BIN_DIR}/plot_det_curve.py --cfg_path=$1 --keyword HeySnips diff --git a/examples/hey_snips/kws0/local/score.sh b/examples/hey_snips/kws0/local/score.sh new file mode 100755 index 00000000..ed21d08c --- /dev/null +++ b/examples/hey_snips/kws0/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +python3 ${BIN_DIR}/score.py --cfg_path=$1 + +python3 ${BIN_DIR}/compute_det.py --cfg_path=$1 diff --git a/examples/hey_snips/kws0/local/train.sh b/examples/hey_snips/kws0/local/train.sh new file mode 100755 index 00000000..cab547b8 --- /dev/null +++ b/examples/hey_snips/kws0/local/train.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +ngpu=$1 +cfg_path=$2 + +if [ ${ngpu} -gt 0 ]; then + python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ + --cfg_path ${cfg_path} +else + python3 ${BIN_DIR}/train.py \ + --cfg_path ${cfg_path} +fi diff --git a/examples/hey_snips/kws0/run.sh b/examples/hey_snips/kws0/run.sh index 69b1ad6a..d6d1d878 100755 --- a/examples/hey_snips/kws0/run.sh +++ b/examples/hey_snips/kws0/run.sh @@ -13,35 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -. ./path.sh set -e +source path.sh -stage=0 -stop_stage=50 +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -# data directory -# if we set the variable ${dir}, we will store the wav info to this directory -# otherwise, we will store the wav info to vox1 and vox2 directory respectively -# vox2 wav path, we must convert the m4a format to wav format -dir=data/ # data info directory +stage=1 +stop_stage=3 -exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory -conf_path=conf/mdtc.yaml -gpus=0,1,2,3 +cfg_path=$1 -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; - -mkdir -p ${exp_dir} - -if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav - bash ./local/data.sh ${dir} ${conf_path}|| exit -1; +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + ./local/train.sh ${ngpu} ${cfg_path} || exit -1 fi -if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then - CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path} +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + ./local/score.sh ${cfg_path} || exit -1 fi -if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then - CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path} -fi +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + ./local/plot.sh ${cfg_path} || exit -1 +fi \ No newline at end of file diff --git a/paddlespeech/kws/exps/mdtc/collate.py b/paddlespeech/kws/exps/mdtc/collate.py new file mode 100644 index 00000000..dcc81123 --- /dev/null +++ b/paddlespeech/kws/exps/mdtc/collate.py @@ -0,0 +1,39 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time + +import paddle + + +def collate_features(batch): + # (key, feat, label) + collate_start = time.time() + keys = [] + feats = [] + labels = [] + lengths = [] + for sample in batch: + keys.append(sample[0]) + feats.append(sample[1]) + labels.append(sample[2]) + lengths.append(sample[1].shape[0]) + + max_length = max(lengths) + for i in range(len(feats)): + feats[i] = paddle.nn.functional.pad( + feats[i], [0, max_length - feats[i].shape[0], 0, 0], + data_format='NLC') + + return keys, paddle.stack(feats), paddle.to_tensor( + labels), paddle.to_tensor(lengths) diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py index 19a3fe14..91b02ff6 100644 --- a/paddlespeech/kws/exps/mdtc/compute_det.py +++ b/paddlespeech/kws/exps/mdtc/compute_det.py @@ -11,15 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import json +# Modified from wekws(https://github.com/wenet-e2e/wekws) +import argparse import os -import sys +import yaml from tqdm import tqdm +from paddlespeech.s2t.utils.dynamic_import import dynamic_import -def load_label_and_score(keyword, label_file, score_file): - # score_table: {uttid: [keywordlist]} +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--cfg_path", type=str, required=True) +parser.add_argument('--keyword', type=int, default=0, help='keyword label') +parser.add_argument('--step', type=float, default=0.01, help='threshold step') +parser.add_argument('--window_shift', type=int, default=50, help='window_shift is used to skip the frames after triggered') +args = parser.parse_args() +# yapf: enable + + +def load_label_and_score(keyword, ds, score_file): score_table = {} with open(score_file, 'r', encoding='utf8') as fin: for line in fin: @@ -34,59 +45,40 @@ def load_label_and_score(keyword, label_file, score_file): keyword_table = {} filler_table = {} filler_duration = 0.0 - with open(label_file, 'r', encoding='utf8') as fin: - for line in fin: - obj = json.loads(line.strip()) - assert 'key' in obj - assert 'txt' in obj - assert 'duration' in obj - key = obj['key'] - index = obj['txt'] - duration = obj['duration'] - assert key in score_table - if index == keyword: - keyword_table[key] = score_table[key] - else: - filler_table[key] = score_table[key] - filler_duration += duration + + for key, index, duration in zip(ds.keys, ds.labels, ds.durations): + assert key in score_table + if index == keyword: + keyword_table[key] = score_table[key] + else: + filler_table[key] = score_table[key] + filler_duration += duration + return keyword_table, filler_table, filler_duration -class Args: - def __init__(self): - self.test_data = '/ssd3/chenxiaojie06/PaddleSpeech/DeepSpeech/paddlespeech/kws/models/data/test/data.list' - self.keyword = 0 - self.score_file = os.path.join( - os.path.abspath(sys.argv[1]), 'score.txt') - self.stats_file = os.path.join( - os.path.abspath(sys.argv[1]), 'stats.0.txt') - self.step = 0.01 - self.window_shift = 50 +if __name__ == '__main__': + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) + data_conf = config['data'] + feat_conf = config['feature'] + scoring_conf = config['scoring'] -args = Args() + # Dataset + ds_class = dynamic_import(data_conf['dataset']) + test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) -if __name__ == '__main__': - # parser = argparse.ArgumentParser(description='compute det curve') - # parser.add_argument('--test_data', required=True, help='label file') - # parser.add_argument('--keyword', type=int, default=0, help='keyword label') - # parser.add_argument('--score_file', required=True, help='score file') - # parser.add_argument('--step', type=float, default=0.01, - # help='threshold step') - # parser.add_argument('--window_shift', type=int, default=50, - # help='window_shift is used to skip the frames after triggered') - # parser.add_argument('--stats_file', - # required=True, - # help='false reject/alarm stats file') - # args = parser.parse_args() + score_file = os.path.abspath(scoring_conf['score_file']) + stats_file = os.path.abspath(scoring_conf['stats_file']) - window_shift = args.window_shift keyword_table, filler_table, filler_duration = load_label_and_score( - args.keyword, args.test_data, args.score_file) + args.keyword, test_ds, score_file) print('Filler total duration Hours: {}'.format(filler_duration / 3600.0)) pbar = tqdm(total=int(1.0 / args.step)) - with open(args.stats_file, 'w', encoding='utf8') as fout: - keyword_index = int(args.keyword) + with open(stats_file, 'w', encoding='utf8') as fout: + keyword_index = args.keyword threshold = 0.0 while threshold <= 1.0: num_false_reject = 0 @@ -103,7 +95,7 @@ if __name__ == '__main__': while i < len(score_list): if score_list[i] >= threshold: num_false_alarm += 1 - i += window_shift + i += args.window_shift else: i += 1 if len(keyword_table) != 0: @@ -118,4 +110,4 @@ if __name__ == '__main__': pbar.update(1) pbar.close() - print('DET saved to: {}'.format(args.stats_file)) + print('DET saved to: {}'.format(stats_file)) diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py index 7986574f..ac920358 100644 --- a/paddlespeech/kws/exps/mdtc/plot_det_curve.py +++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py @@ -11,11 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# Modified from wekws(https://github.com/wenet-e2e/wekws) +import argparse import os -import sys import matplotlib.pyplot as plt import numpy as np +import yaml + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--cfg_path", type=str, required=True) +parser.add_argument("--keyword", type=str, required=True) +args = parser.parse_args() +# yapf: enable def load_stats_file(stats_file): @@ -29,7 +38,7 @@ def load_stats_file(stats_file): return np.array(values) -def plot_det_curve(keywords, stats_dir, figure_file, xlim, x_step, ylim, +def plot_det_curve(keywords, stats_file, figure_file, xlim, x_step, ylim, y_step): plt.figure(dpi=200) plt.rcParams['xtick.direction'] = 'in' @@ -37,7 +46,6 @@ def plot_det_curve(keywords, stats_dir, figure_file, xlim, x_step, ylim, plt.rcParams['font.size'] = 12 for index, keyword in enumerate(keywords): - stats_file = os.path.join(stats_dir, 'stats.' + str(index) + '.txt') values = load_stats_file(stats_file) plt.plot(values[:, 0], values[:, 1], label=keyword) @@ -53,11 +61,14 @@ def plot_det_curve(keywords, stats_dir, figure_file, xlim, x_step, ylim, if __name__ == '__main__': + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) - keywords = ['Hey_Snips'] - img_path = os.path.join(os.path.abspath(sys.argv[1]), 'det.png') - - plot_det_curve(keywords, - os.path.abspath(sys.argv[1]), img_path, 10, 2, 10, 2) + scoring_conf = config['scoring'] + img_file = os.path.abspath(scoring_conf['img_file']) + stats_file = os.path.abspath(scoring_conf['stats_file']) + keywords = [args.keyword] + plot_det_curve(keywords, stats_file, img_file, 10, 2, 10, 2) - print('DET curve image saved to: {}'.format(img_path)) + print('DET curve image saved to: {}'.format(img_file)) diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py index 9fdbcf49..7fe88ea3 100644 --- a/paddlespeech/kws/exps/mdtc/score.py +++ b/paddlespeech/kws/exps/mdtc/score.py @@ -11,80 +11,56 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# Modified from wekws(https://github.com/wenet-e2e/wekws) +import argparse import os -import sys -import time import paddle -from mdtc import KWSModel -from mdtc import MDTC +import yaml from tqdm import tqdm -from paddleaudio.datasets import HeySnips +from paddlespeech.kws.exps.mdtc.collate import collate_features +from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--cfg_path", type=str, required=True) +args = parser.parse_args() +# yapf: enable -def collate_features(batch): - # (key, feat, label) in one sample - collate_start = time.time() - keys = [] - feats = [] - labels = [] - lengths = [] - for sample in batch: - keys.append(sample[0]) - feats.append(sample[1]) - labels.append(sample[2]) - lengths.append(sample[1].shape[0]) - - max_length = max(lengths) - for i in range(len(feats)): - feats[i] = paddle.nn.functional.pad( - feats[i], [0, max_length - feats[i].shape[0], 0, 0], - data_format='NLC') - - return keys, paddle.stack(feats), paddle.to_tensor( - labels), paddle.to_tensor(lengths) +if __name__ == '__main__': + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) + model_conf = config['model'] + data_conf = config['data'] + feat_conf = config['feature'] + scoring_conf = config['scoring'] -if __name__ == '__main__': # Dataset - feat_conf = { - # 'n_mfcc': 80, - 'n_mels': 80, - 'frame_shift': 10, - 'frame_length': 25, - # 'dither': 1.0, - } - test_ds = HeySnips( - mode='test', feat_type='kaldi_fbank', sample_rate=16000, **feat_conf) + ds_class = dynamic_import(data_conf['dataset']) + test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) test_sampler = paddle.io.BatchSampler( - test_ds, batch_size=32, drop_last=False) + test_ds, batch_size=scoring_conf['batch_size'], drop_last=False) test_loader = paddle.io.DataLoader( test_ds, batch_sampler=test_sampler, - num_workers=16, + num_workers=scoring_conf['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) # Model - backbone = MDTC( - stack_num=3, - stack_size=4, - in_channels=80, - res_channels=32, - kernel_size=5, - causal=True, ) - model = KWSModel(backbone=backbone, num_keywords=1) - model = paddle.DataParallel(model) - # kws_checkpoint = '/ssd3/chenxiaojie06/PaddleSpeech/DeepSpeech/paddlespeech/kws/models/checkpoint/epoch_10_0.8903940343290826/model.pdparams' - kws_checkpoint = os.path.join( - os.path.abspath(sys.argv[1]), 'model.pdparams') - model.set_state_dict(paddle.load(kws_checkpoint)) + backbone_class = dynamic_import(model_conf['backbone']) + backbone = backbone_class(**model_conf['config']) + model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) + model.set_state_dict(paddle.load(scoring_conf['checkpoint'])) model.eval() - score_abs_path = os.path.join(os.path.abspath(sys.argv[1]), 'score.txt') - with paddle.no_grad(), open(score_abs_path, 'w', encoding='utf8') as fout: + with paddle.no_grad(), open( + scoring_conf['score_file'], 'w', encoding='utf8') as fout: for batch_idx, batch in enumerate( tqdm(test_loader, total=len(test_loader))): keys, feats, labels, lengths = batch @@ -100,4 +76,4 @@ if __name__ == '__main__': fout.write( '{} {} {}\n'.format(key, keyword_i, score_frames)) - print('Scores saved to: {}'.format(score_abs_path)) + print('Result saved to: {}'.format(scoring_conf['score_file'])) diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py index 17a9acfc..99e72871 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -11,77 +11,47 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import argparse import os -import time import paddle -from loss import max_pooling_loss -from mdtc import KWSModel -from mdtc import MDTC +import yaml -from paddleaudio.datasets import HeySnips from paddleaudio.utils import logger from paddleaudio.utils import Timer +from paddlespeech.kws.exps.mdtc.collate import collate_features +from paddlespeech.kws.models.loss import max_pooling_loss +from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--cfg_path", type=str, required=True) +args = parser.parse_args() +# yapf: enable -def collate_features(batch): - # (key, feat, label) - collate_start = time.time() - keys = [] - feats = [] - labels = [] - lengths = [] - for sample in batch: - keys.append(sample[0]) - feats.append(sample[1]) - labels.append(sample[2]) - lengths.append(sample[1].shape[0]) - - max_length = max(lengths) - for i in range(len(feats)): - feats[i] = paddle.nn.functional.pad( - feats[i], [0, max_length - feats[i].shape[0], 0, 0], - data_format='NLC') +if __name__ == '__main__': + nranks = paddle.distributed.get_world_size() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() + local_rank = paddle.distributed.get_rank() - return keys, paddle.stack(feats), paddle.to_tensor( - labels), paddle.to_tensor(lengths) + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) + model_conf = config['model'] + data_conf = config['data'] + feat_conf = config['feature'] + training_conf = config['training'] -if __name__ == '__main__': # Dataset - feat_conf = { - # 'n_mfcc': 80, - 'n_mels': 80, - 'frame_shift': 10, - 'frame_length': 25, - # 'dither': 1.0, - } - data_dir = '/ssd1/chenxiaojie06/datasets/hey_snips/hey_snips_research_6k_en_train_eval_clean_ter' - train_ds = HeySnips( - data_dir=data_dir, - mode='train', - feat_type='kaldi_fbank', - sample_rate=16000, - **feat_conf) - dev_ds = HeySnips( - data_dir=data_dir, - mode='dev', - feat_type='kaldi_fbank', - sample_rate=16000, - **feat_conf) - - training_conf = { - 'epochs': 100, - 'learning_rate': 0.001, - 'weight_decay': 0.00005, - 'num_workers': 16, - 'batch_size': 100, - 'checkpoint_dir': './checkpoint', - 'save_freq': 10, - 'log_freq': 10, - } - - train_sampler = paddle.io.BatchSampler( + ds_class = dynamic_import(data_conf['dataset']) + train_ds = ds_class( + data_dir=data_conf['data_dir'], mode='train', **feat_conf) + dev_ds = ds_class(data_dir=data_conf['data_dir'], mode='dev', **feat_conf) + + train_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=training_conf['batch_size'], shuffle=True, @@ -95,16 +65,11 @@ if __name__ == '__main__': collate_fn=collate_features, ) # Model - backbone = MDTC( - stack_num=3, - stack_size=4, - in_channels=80, - res_channels=32, - kernel_size=5, - causal=True, ) - model = KWSModel(backbone=backbone, num_keywords=1) + backbone_class = dynamic_import(model_conf['backbone']) + backbone = backbone_class(**model_conf['config']) + model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) model = paddle.DataParallel(model) - clip = paddle.nn.ClipGradByGlobalNorm(5.0) + clip = paddle.nn.ClipGradByGlobalNorm(training_conf['grad_clip']) optimizer = paddle.optimizer.Adam( learning_rate=training_conf['learning_rate'], weight_decay=training_conf['weight_decay'], @@ -122,9 +87,7 @@ if __name__ == '__main__': avg_loss = 0 num_corrects = 0 num_samples = 0 - batch_start = time.time() for batch_idx, batch in enumerate(train_loader): - # print('Fetch one batch: {:.4f}'.format(time.time()-batch_start)) keys, feats, labels, lengths = batch logits = model(feats) loss, corrects, acc = criterion(logits, labels, lengths) @@ -144,7 +107,8 @@ if __name__ == '__main__': timer.count() - if (batch_idx + 1) % training_conf['log_freq'] == 0: + if (batch_idx + 1 + ) % training_conf['log_freq'] == 0 and local_rank == 0: lr = optimizer.get_lr() avg_loss /= training_conf['log_freq'] avg_acc = num_corrects / num_samples @@ -161,10 +125,9 @@ if __name__ == '__main__': avg_loss = 0 num_corrects = 0 num_samples = 0 - batch_start = time.time() if epoch % training_conf[ - 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch: + 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: dev_sampler = paddle.io.BatchSampler( dev_ds, batch_size=training_conf['batch_size'], @@ -197,7 +160,7 @@ if __name__ == '__main__': # Save model save_dir = os.path.join(training_conf['checkpoint_dir'], - 'epoch_{}_{:.4f}'.format(epoch, eval_acc)) + 'epoch_{}'.format(epoch)) logger.info('Saving model checkpoint to {}'.format(save_dir)) paddle.save(model.state_dict(), os.path.join(save_dir, 'model.pdparams')) diff --git a/paddlespeech/kws/models/__init__.py b/paddlespeech/kws/models/__init__.py index 97043fd7..125a0d7a 100644 --- a/paddlespeech/kws/models/__init__.py +++ b/paddlespeech/kws/models/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .mdtc import KWSModel +from .mdtc import MDTC diff --git a/paddlespeech/kws/models/loss.py b/paddlespeech/kws/models/loss.py new file mode 100644 index 00000000..8a2e9e74 --- /dev/null +++ b/paddlespeech/kws/models/loss.py @@ -0,0 +1,80 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wekws(https://github.com/wenet-e2e/wekws) +import paddle + + +def fill_mask_elements(condition, value, x): + assert condition.shape == x.shape + values = paddle.ones_like(x, dtype=x.dtype) * value + return paddle.where(condition, values, x) + + +def max_pooling_loss(logits: paddle.Tensor, + target: paddle.Tensor, + lengths: paddle.Tensor, + min_duration: int=0): + + mask = padding_mask(lengths) + num_utts = logits.shape[0] + num_keywords = logits.shape[2] + + loss = 0.0 + for i in range(num_utts): + for j in range(num_keywords): + # Add entropy loss CE = -(t * log(p) + (1 - t) * log(1 - p)) + if target[i] == j: + # For the keyword, do max-polling + prob = logits[i, :, j] + m = mask[i] + if min_duration > 0: + m[:min_duration] = True + prob = fill_mask_elements(m, 0.0, prob) + prob = paddle.clip(prob, 1e-8, 1.0) + max_prob = prob.max() + loss += -paddle.log(max_prob) + else: + # For other keywords or filler, do min-polling + prob = 1 - logits[i, :, j] + prob = fill_mask_elements(mask[i], 1.0, prob) + prob = paddle.clip(prob, 1e-8, 1.0) + min_prob = prob.min() + loss += -paddle.log(min_prob) + loss = loss / num_utts + + # Compute accuracy of current batch + mask = mask.unsqueeze(-1) + logits = fill_mask_elements(mask, 0.0, logits) + max_logits = logits.max(1) + num_correct = 0 + for i in range(num_utts): + max_p = max_logits[i].max(0).item() + idx = max_logits[i].argmax(0).item() + # Predict correct as the i'th keyword + if max_p > 0.5 and idx == target[i].item(): + num_correct += 1 + # Predict correct as the filler, filler id < 0 + if max_p < 0.5 and target[i].item() < 0: + num_correct += 1 + acc = num_correct / num_utts + # acc = 0.0 + return loss, num_correct, acc + + +def padding_mask(lengths: paddle.Tensor) -> paddle.Tensor: + batch_size = lengths.shape[0] + max_len = int(lengths.max().item()) + seq = paddle.arange(max_len, dtype=paddle.int64) + seq = seq.expand((batch_size, max_len)) + return seq >= lengths.unsqueeze(1) diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py index 2cb14305..5d2e5de6 100644 --- a/paddlespeech/kws/models/mdtc.py +++ b/paddlespeech/kws/models/mdtc.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# Modified from wekws(https://github.com/wenet-e2e/wekws) import paddle import paddle.nn as nn import paddle.nn.functional as F @@ -163,7 +164,7 @@ class MDTC(nn.Layer): in_channels: int, res_channels: int, kernel_size: int, - causal: bool, ): + causal: bool=True, ): super(MDTC, self).__init__() assert kernel_size % 2 == 1 self.kernel_size = kernel_size @@ -230,17 +231,3 @@ class KWSModel(nn.Layer): outputs = self.backbone(x) outputs = self.linear(outputs) return self.activation(outputs) - - -if __name__ == '__main__': - paddle.set_device('cpu') - from paddleaudio.features import LogMelSpectrogram - mdtc = MDTC(3, 4, 80, 32, 5, causal=True) - - x = paddle.randn(shape=(32, 16000 * 5)) - feature_extractor = LogMelSpectrogram(sr=16000, n_fft=512, n_mels=80) - feats = feature_extractor(x).transpose([0, 2, 1]) - print(feats.shape) - - res, _ = mdtc(feats) - print(res.shape) From 43659b9882dfde6aa11ece88189a26a86479cc04 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Tue, 19 Apr 2022 18:31:05 +0800 Subject: [PATCH 5/6] Add KWS example. --- examples/hey_snips/README.md | 24 +++++------------------- examples/hey_snips/RESULTS.md | 8 -------- examples/hey_snips/kws0/README.md | 22 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 27 deletions(-) delete mode 100644 examples/hey_snips/RESULTS.md create mode 100644 examples/hey_snips/kws0/README.md diff --git a/examples/hey_snips/README.md b/examples/hey_snips/README.md index be8d142b..ba263906 100644 --- a/examples/hey_snips/README.md +++ b/examples/hey_snips/README.md @@ -1,22 +1,8 @@ -# MDTC Keyword Spotting with HeySnips Dataset -## Dataset +## Metrics -Before running scripts, you **MUST** follow this instruction to download the dataset: https://github.com/sonos/keyword-spotting-research-datasets +We mesure FRRs with fixing false alarms in one hour: -After you download and decompress the dataset archive, you should **REPLACE** the value of `data_dir` in `conf/*.yaml` to complete dataset config. - -## Get Started - -In this section, we will train the [MDTC](https://arxiv.org/pdf/2102.13552.pdf) model and evaluate on "Hey Snips" dataset. - -```sh -CUDA_VISIBLE_DEVICES=0,1 ./run.sh conf/mdtc.yaml -``` - -This script contains training and scoring steps. You can just set the `CUDA_VISIBLE_DEVICES` environment var to run on single gpu or multi-gpus. - -The vars `stage` and `stop_stage` in `./run.sh` controls the running steps: -- stage 1: Training from scratch. -- stage 2: Evaluating model on test dataset and computing detection error tradeoff(DET) of all trigger thresholds. -- stage 3: Plotting the DET cruve for visualizaiton. +|Model|False Alarm| False Reject Rate| +|--|--|--| +|MDTC| 1| 0.003559 | diff --git a/examples/hey_snips/RESULTS.md b/examples/hey_snips/RESULTS.md deleted file mode 100644 index ba263906..00000000 --- a/examples/hey_snips/RESULTS.md +++ /dev/null @@ -1,8 +0,0 @@ - -## Metrics - -We mesure FRRs with fixing false alarms in one hour: - -|Model|False Alarm| False Reject Rate| -|--|--|--| -|MDTC| 1| 0.003559 | diff --git a/examples/hey_snips/kws0/README.md b/examples/hey_snips/kws0/README.md new file mode 100644 index 00000000..be8d142b --- /dev/null +++ b/examples/hey_snips/kws0/README.md @@ -0,0 +1,22 @@ +# MDTC Keyword Spotting with HeySnips Dataset + +## Dataset + +Before running scripts, you **MUST** follow this instruction to download the dataset: https://github.com/sonos/keyword-spotting-research-datasets + +After you download and decompress the dataset archive, you should **REPLACE** the value of `data_dir` in `conf/*.yaml` to complete dataset config. + +## Get Started + +In this section, we will train the [MDTC](https://arxiv.org/pdf/2102.13552.pdf) model and evaluate on "Hey Snips" dataset. + +```sh +CUDA_VISIBLE_DEVICES=0,1 ./run.sh conf/mdtc.yaml +``` + +This script contains training and scoring steps. You can just set the `CUDA_VISIBLE_DEVICES` environment var to run on single gpu or multi-gpus. + +The vars `stage` and `stop_stage` in `./run.sh` controls the running steps: +- stage 1: Training from scratch. +- stage 2: Evaluating model on test dataset and computing detection error tradeoff(DET) of all trigger thresholds. +- stage 3: Plotting the DET cruve for visualizaiton. From caa8eb4d0df409d1be316aa92cfe8533dd1800e7 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Sun, 24 Apr 2022 23:50:44 +0800 Subject: [PATCH 6/6] Add KWS example. --- examples/hey_snips/kws0/local/train.sh | 1 + examples/hey_snips/kws0/run.sh | 5 +++++ paddlespeech/kws/exps/mdtc/compute_det.py | 21 ++++++++++++--------- paddlespeech/kws/models/loss.py | 19 ++++++++++--------- 4 files changed, 28 insertions(+), 18 deletions(-) diff --git a/examples/hey_snips/kws0/local/train.sh b/examples/hey_snips/kws0/local/train.sh index cab547b8..8d0181b8 100755 --- a/examples/hey_snips/kws0/local/train.sh +++ b/examples/hey_snips/kws0/local/train.sh @@ -7,6 +7,7 @@ if [ ${ngpu} -gt 0 ]; then python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ --cfg_path ${cfg_path} else + echo "set CUDA_VISIBLE_DEVICES to enable multi-gpus trainning." python3 ${BIN_DIR}/train.py \ --cfg_path ${cfg_path} fi diff --git a/examples/hey_snips/kws0/run.sh b/examples/hey_snips/kws0/run.sh index d6d1d878..2cc09a4f 100755 --- a/examples/hey_snips/kws0/run.sh +++ b/examples/hey_snips/kws0/run.sh @@ -18,6 +18,11 @@ source path.sh ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +if [ $# != 1 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path" + exit -1 +fi + stage=1 stop_stage=3 diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py index 91b02ff6..817846b8 100644 --- a/paddlespeech/kws/exps/mdtc/compute_det.py +++ b/paddlespeech/kws/exps/mdtc/compute_det.py @@ -15,6 +15,7 @@ import argparse import os +import paddle import yaml from tqdm import tqdm @@ -23,32 +24,34 @@ from paddlespeech.s2t.utils.dynamic_import import dynamic_import # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--cfg_path", type=str, required=True) -parser.add_argument('--keyword', type=int, default=0, help='keyword label') -parser.add_argument('--step', type=float, default=0.01, help='threshold step') +parser.add_argument('--keyword_index', type=int, default=0, help='keyword index') +parser.add_argument('--step', type=float, default=0.01, help='threshold step of trigger score') parser.add_argument('--window_shift', type=int, default=50, help='window_shift is used to skip the frames after triggered') args = parser.parse_args() # yapf: enable -def load_label_and_score(keyword, ds, score_file): - score_table = {} +def load_label_and_score(keyword_index: int, + ds: paddle.io.Dataset, + score_file: os.PathLike): + score_table = {} # {utt_id: scores_over_frames} with open(score_file, 'r', encoding='utf8') as fin: for line in fin: arr = line.strip().split() key = arr[0] current_keyword = arr[1] str_list = arr[2:] - if int(current_keyword) == keyword: + if int(current_keyword) == keyword_index: scores = list(map(float, str_list)) if key not in score_table: score_table.update({key: scores}) - keyword_table = {} - filler_table = {} + keyword_table = {} # scores of keyword utt_id + filler_table = {} # scores of non-keyword utt_id filler_duration = 0.0 for key, index, duration in zip(ds.keys, ds.labels, ds.durations): assert key in score_table - if index == keyword: + if index == keyword_index: keyword_table[key] = score_table[key] else: filler_table[key] = score_table[key] @@ -78,7 +81,7 @@ if __name__ == '__main__': print('Filler total duration Hours: {}'.format(filler_duration / 3600.0)) pbar = tqdm(total=int(1.0 / args.step)) with open(stats_file, 'w', encoding='utf8') as fout: - keyword_index = args.keyword + keyword_index = args.keyword_index threshold = 0.0 while threshold <= 1.0: num_false_reject = 0 diff --git a/paddlespeech/kws/models/loss.py b/paddlespeech/kws/models/loss.py index 8a2e9e74..64c9a32c 100644 --- a/paddlespeech/kws/models/loss.py +++ b/paddlespeech/kws/models/loss.py @@ -15,7 +15,16 @@ import paddle -def fill_mask_elements(condition, value, x): +def padding_mask(lengths: paddle.Tensor) -> paddle.Tensor: + batch_size = lengths.shape[0] + max_len = int(lengths.max().item()) + seq = paddle.arange(max_len, dtype=paddle.int64) + seq = seq.expand((batch_size, max_len)) + return seq >= lengths.unsqueeze(1) + + +def fill_mask_elements(condition: paddle.Tensor, value: float, + x: paddle.Tensor) -> paddle.Tensor: assert condition.shape == x.shape values = paddle.ones_like(x, dtype=x.dtype) * value return paddle.where(condition, values, x) @@ -70,11 +79,3 @@ def max_pooling_loss(logits: paddle.Tensor, acc = num_correct / num_utts # acc = 0.0 return loss, num_correct, acc - - -def padding_mask(lengths: paddle.Tensor) -> paddle.Tensor: - batch_size = lengths.shape[0] - max_len = int(lengths.max().item()) - seq = paddle.arange(max_len, dtype=paddle.int64) - seq = seq.expand((batch_size, max_len)) - return seq >= lengths.unsqueeze(1)