From 7ae36b219d737c2cd20acb9cae766632234384ea Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Thu, 7 Jul 2022 11:21:03 +0800 Subject: [PATCH] add kaldi feature pitch --- CMakeLists.txt | 2 +- paddlespeech/audio/kaldi/__init__.py | 15 +++ paddlespeech/audio/kaldi/kaldi.py | 103 +++++++++++++++ .../audio/src/pybind/kaldi/kaldi_feature.cc | 6 +- paddlespeech/audio/src/pybind/pybind.cpp | 4 +- .../audio/third_party/kaldi/CMakeLists.txt | 3 +- speechx/speechx/kaldi/feat/feature-plp.h | 2 +- .../speechx/kaldi/feat/online-feature-itf.h | 125 ++++++++++++++++++ speechx/speechx/kaldi/feat/online-feature.h | 2 +- speechx/speechx/kaldi/feat/pitch-functions.h | 2 +- 10 files changed, 254 insertions(+), 10 deletions(-) create mode 100644 paddlespeech/audio/kaldi/__init__.py create mode 100644 paddlespeech/audio/kaldi/kaldi.py create mode 100644 speechx/speechx/kaldi/feat/online-feature-itf.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 764277d1b..0d260dd3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,4 +65,4 @@ add_subdirectory(paddlespeech/audio) # Summary include(cmake/summary.cmake) -onnx_print_configuration_summary() \ No newline at end of file +onnx_print_configuration_summary() diff --git a/paddlespeech/audio/kaldi/__init__.py b/paddlespeech/audio/kaldi/__init__.py new file mode 100644 index 000000000..2b52ad23d --- /dev/null +++ b/paddlespeech/audio/kaldi/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from .kaldi import fbank  # function defined in kaldi.py, not a submodule +from .kaldi import pitch  # function defined in kaldi.py, not a submodule diff --git a/paddlespeech/audio/kaldi/kaldi.py b/paddlespeech/audio/kaldi/kaldi.py new file mode 100644 index 000000000..53b82670c --- /dev/null +++ b/paddlespeech/audio/kaldi/kaldi.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddlespeech.audio._internal import module_utils +from paddlespeech.audio.ops.paddleaudio import ComputeFbank  # a function, so it cannot be the last component of a dotted "import ... as" +from paddlespeech.audio.ops.paddleaudio import ComputeKaldiPitch  # NOTE(review): confirm this module path exposes the _paddleaudio bindings + +__all__ = [ + 'fbank', + 'pitch', +] + +@module_utils.requires_kaldi() +def fbank(wav, + samp_freq: int=16000, + frame_shift_ms: float=10.0, + frame_length_ms: float=25.0, + dither: float=0.0, + preemph_coeff: float=0.97, + remove_dc_offset: bool=True, + window_type: str='povey', + round_to_power_of_two: bool=True, + blackman_coeff: float=0.42, + snip_edges: bool=True, + allow_downsample: bool=False, + allow_upsample: bool=False, + max_feature_vectors: int=-1, + num_bins: int=23, + low_freq: float=20, + high_freq: float=0, + vtln_low: float=100, + vtln_high: float=-500, + debug_mel: bool=False, + htk_mode: bool=False, + use_energy: bool=False, # fbank opts + energy_floor: float=0.0, + raw_energy: bool=True, + htk_compat: bool=False, + use_log_fbank: bool=True, + use_power: bool=True): + feat = ComputeFbank(
+ samp_freq, frame_shift_ms, frame_length_ms, + dither, preemph_coeff, remove_dc_offset, + window_type, round_to_power_of_two, blackman_coeff, + snip_edges, allow_downsample, allow_upsample, + max_feature_vectors, num_bins, low_freq, + high_freq, vtln_low, vtln_high, debug_mel, + htk_mode, use_energy, energy_floor, + raw_energy, htk_compat, use_log_fbank, use_power, wav) + return feat + +@module_utils.requires_kaldi() +def pitch(wav, + samp_freq: int=16000, + frame_shift_ms: float=10.0, + frame_length_ms: float=25.0, + preemph_coeff: float=0.0, + min_f0: int=50, + max_f0: int=400, + soft_min_f0: float=10.0, + penalty_factor: float=0.1, + lowpass_cutoff: int=1000, + resample_freq: int=4000, + delta_pitch: float=0.005, + nccf_ballast: int=7000, + lowpass_filter_width: int=1, + upsample_filter_width: int=5, + max_frames_latency: int=0, + frames_per_chunk: int=0, + simulate_first_pass_online: bool=False, + recompute_frame: int=500, + nccf_ballast_online: bool=False, + snip_edges: bool=True): + feat = ComputeKaldiPitch(samp_freq, frame_shift_ms, + frame_length_ms, + preemph_coeff, + min_f0, + max_f0, + soft_min_f0, + penalty_factor, + lowpass_cutoff, + resample_freq, + delta_pitch, + nccf_ballast, + lowpass_filter_width, + upsample_filter_width, + max_frames_latency, + frames_per_chunk, + simulate_first_pass_online, + recompute_frame, + nccf_ballast_online, + snip_edges, wav) + return feat diff --git a/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc b/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc index 58a498477..4ac9a93ce 100644 --- a/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc +++ b/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc @@ -191,15 +191,15 @@ opts.preemph_coeff = preemph_coeff; opts.snip_edges = snip_edges; py::buffer_info info = wav.request(); - kaldi::Vector<::kaldi::BaseFloat> input_wav(info.size); + ::kaldi::Vector<::kaldi::BaseFloat> input_wav(info.size); double* wav_ptr = (double*)info.ptr; for
(int idx = 0; idx < info.size; ++idx) { input_wav(idx) = *wav_ptr; wav_ptr++; } - kaldi::Matrix features; - kaldi::ComputeKaldiPitch(opts, input_wav, &features); + ::kaldi::Matrix<::kaldi::BaseFloat> features; + ::kaldi::ComputeKaldiPitch(opts, input_wav, &features); auto result = py::array_t({features.NumRows(), features.NumCols()}); for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) { for (int col_idx = 0; col_idx < features.NumCols(); ++col_idx) { diff --git a/paddlespeech/audio/src/pybind/pybind.cpp b/paddlespeech/audio/src/pybind/pybind.cpp index e7f307ca3..d434cbda6 100644 --- a/paddlespeech/audio/src/pybind/pybind.cpp +++ b/paddlespeech/audio/src/pybind/pybind.cpp @@ -4,15 +4,15 @@ #include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h" #include "paddlespeech/audio/src/pybind/sox/io.h" -// Sox PYBIND11_MODULE(_paddleaudio, m) { +// Sox m.def("get_info_file", &paddleaudio::sox_io::get_info_file, "Get metadata of audio file."); m.def("get_info_fileobj", &paddleaudio::sox_io::get_info_fileobj, "Get metadata of audio in file object."); - +// kaldi feat m.def("InitFbank", &paddleaudio::kaldi::InitFbank, "init fbank"); m.def("ResetFbank", &paddleaudio::kaldi::ResetFbank, "reset fbank"); m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank"); diff --git a/paddlespeech/audio/third_party/kaldi/CMakeLists.txt b/paddlespeech/audio/third_party/kaldi/CMakeLists.txt index 6934016ee..2304c59e6 100644 --- a/paddlespeech/audio/third_party/kaldi/CMakeLists.txt +++ b/paddlespeech/audio/third_party/kaldi/CMakeLists.txt @@ -70,6 +70,7 @@ add_library(kaldi-feat-common STATIC feat/feature-functions.cc feat/feature-window.cc feat/resample.cc + feat/pitch-functions.cc feat/mel-computations.cc feat/cmvn.cc ) @@ -113,4 +114,4 @@ target_link_libraries(libkaldi INTERFACE gfortran -Wl,--no-whole-archive -Wl,--end-group ) -target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST") \ No newline at end of file 
+target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST") diff --git a/speechx/speechx/kaldi/feat/feature-plp.h b/speechx/speechx/kaldi/feat/feature-plp.h index 4f156ca1e..cce6ee1c4 100644 --- a/speechx/speechx/kaldi/feat/feature-plp.h +++ b/speechx/speechx/kaldi/feat/feature-plp.h @@ -27,7 +27,7 @@ #include "feat/feature-functions.h" #include "feat/feature-window.h" #include "feat/mel-computations.h" -#include "itf/options-itf.h" +#include "util/options-itf.h" namespace kaldi { /// @addtogroup feat FeatureExtraction diff --git a/speechx/speechx/kaldi/feat/online-feature-itf.h b/speechx/speechx/kaldi/feat/online-feature-itf.h new file mode 100644 index 000000000..3d139b461 --- /dev/null +++ b/speechx/speechx/kaldi/feat/online-feature-itf.h @@ -0,0 +1,125 @@ +// itf/online-feature-itf.h + +// Copyright 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_ITF_ONLINE_FEATURE_ITF_H_ +#define KALDI_ITF_ONLINE_FEATURE_ITF_H_ 1 +#include "base/kaldi-common.h" +#include "matrix/matrix-lib.h" + +namespace kaldi { +/// @ingroup Interfaces +/// @{ + +/** + OnlineFeatureInterface is an interface for online feature processing (it is + also usable in the offline setting, but currently we're not using it for + that). 
This is for use in the online2/ directory, and it supersedes the + interface in ../online/online-feat-input.h. We have a slightly different + model that puts more control in the hands of the calling thread, and won't + involve waiting on semaphores in the decoding thread. + + This interface only specifies how the object *outputs* the features. + How it obtains the features, e.g. from a previous object or objects of type + OnlineFeatureInterface, is not specified in the interface and you will + likely define new constructors or methods in the derived type to do that. + + You should appreciate that this interface is designed to allow random + access to features, as long as they are ready. That is, the user + can call GetFrame for any frame less than NumFramesReady(), and when + implementing a child class you must not make assumptions about the + order in which the user makes these calls. +*/ + +class OnlineFeatureInterface { + public: + virtual int32 Dim() const = 0; /// returns the feature dimension. + + /// Returns the total number of frames, since the start of the utterance, that + /// are now available. In an online-decoding context, this will likely + /// increase with time as more data becomes available. + virtual int32 NumFramesReady() const = 0; + + /// Returns true if this is the last frame. Frame indices are zero-based, so the + /// first frame is zero. IsLastFrame(-1) will return false, unless the file + /// is empty (which is a case that I'm not sure all the code will handle, so + /// be careful). This function may return false for some frame if + /// we haven't yet decided to terminate decoding, but later true if we decide + /// to terminate decoding. This function exists mainly to correctly handle + /// end effects in feature extraction, and is not a mechanism to determine how + /// many frames are in the decodable object (as it used to be, and for backward + /// compatibility, still is, in the Decodable interface). 
+ virtual bool IsLastFrame(int32 frame) const = 0; + + /// Gets the feature vector for this frame. Before calling this for a given + /// frame, it is assumed that you called NumFramesReady() and it returned a + /// number greater than "frame". Otherwise this call will likely crash with + /// an assert failure. This function is not declared const, in case there is + /// some kind of caching going on, but most of the time it shouldn't modify + /// the class. + virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) = 0; + + + /// This is like GetFrame() but for a collection of frames. There is a + /// default implementation that just gets the frames one by one, but it + /// may be overridden for efficiency by child classes (since sometimes + /// it's more efficient to do things in a batch). + virtual void GetFrames(const std::vector<int32> &frames, + MatrixBase<BaseFloat> *feats) { + KALDI_ASSERT(static_cast<int32>(frames.size()) == feats->NumRows()); + for (size_t i = 0; i < frames.size(); i++) { + SubVector<BaseFloat> feat(*feats, i); + GetFrame(frames[i], &feat); + } + } + + + // Returns frame shift in seconds. Helps to estimate duration from frame + // counts. + virtual BaseFloat FrameShiftInSeconds() const = 0; + + /// Virtual destructor. Note: constructors that take another member of + /// type OnlineFeatureInterface are not expected to take ownership of + /// that pointer; the caller needs to keep track of that manually. + virtual ~OnlineFeatureInterface() { } + +}; + + +/// Add a virtual class for "source" features such as MFCC or PLP or pitch +/// features. +class OnlineBaseFeature: public OnlineFeatureInterface { + public: + /// This would be called from the application, when you get more wave data. + /// Note: the sampling_rate is typically only provided so the code can assert + /// that it matches the sampling rate expected in the options.
+ virtual void AcceptWaveform(BaseFloat sampling_rate, + const VectorBase<BaseFloat> &waveform) = 0; + + /// InputFinished() tells the class you won't be providing any + /// more waveform. This will help flush out the last few frames + /// of delta or LDA features (it will typically affect the return value + /// of IsLastFrame). + virtual void InputFinished() = 0; +}; + + +/// @} +} // namespace kaldi + +#endif // KALDI_ITF_ONLINE_FEATURE_ITF_H_ diff --git a/speechx/speechx/kaldi/feat/online-feature.h b/speechx/speechx/kaldi/feat/online-feature.h index f2ebe45bf..f9b26ecc0 100644 --- a/speechx/speechx/kaldi/feat/online-feature.h +++ b/speechx/speechx/kaldi/feat/online-feature.h @@ -34,7 +34,7 @@ #include "feat/feature-mfcc.h" #include "feat/feature-plp.h" #include "feat/feature-fbank.h" -#include "itf/online-feature-itf.h" +#include "feat/online-feature-itf.h" namespace kaldi { /// @addtogroup onlinefeat OnlineFeatureExtraction diff --git a/speechx/speechx/kaldi/feat/pitch-functions.h b/speechx/speechx/kaldi/feat/pitch-functions.h index 70e85380b..9edf6c9ff 100644 --- a/speechx/speechx/kaldi/feat/pitch-functions.h +++ b/speechx/speechx/kaldi/feat/pitch-functions.h @@ -31,7 +31,7 @@ #include "base/kaldi-error.h" #include "feat/mel-computations.h" -#include "itf/online-feature-itf.h" +#include "feat/online-feature-itf.h" #include "matrix/matrix-lib.h" #include "util/common-utils.h"