add kaldi feature pitch

pull/2119/head
Yang Zhou 3 years ago
parent a16a044fef
commit 7ae36b219d

@ -65,4 +65,4 @@ add_subdirectory(paddlespeech/audio)
# Summary
include(cmake/summary.cmake)
onnx_print_configuration_summary()
onnx_print_configuration_summary()

@ -0,0 +1,15 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import fbank
from . import pitch

@ -0,0 +1,103 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.audio._internal import module_utils
import paddlespeech.audio.ops.paddleaudio.ComputeFbank as ComputeFbank
import paddlespeech.audio.ops.paddleaudio.ComputeKaldiPitch as ComputeKaldiPitch
# Names exported by `from <this module> import *`: the two wrapper functions
# defined below.
__all__ = [
'fbank',
'pitch',
]
@module_utils.requires_kaldi()
def fbank(wav,
          samp_freq: int=16000,
          frame_shift_ms: float=10.0,
          frame_length_ms: float=25.0,
          dither: float=0.0,
          preemph_coeff: float=0.97,
          remove_dc_offset: bool=True,
          window_type: str='povey',
          round_to_power_of_two: bool=True,
          blackman_coeff: float=0.42,
          snip_edges: bool=True,
          allow_downsample: bool=False,
          allow_upsample: bool=False,
          max_feature_vectors: int=-1,
          num_bins: int=23,
          low_freq: float=20,
          high_freq: float=0,
          vtln_low: float=100,
          vtln_high: float=-500,
          debug_mel: bool=False,
          htk_mode: bool=False,
          use_energy: bool=False,  # fbank opts
          energy_floor: float=0.0,
          raw_energy: bool=True,
          htk_compat: bool=False,
          use_log_fbank: bool=True,
          use_power: bool=True):
    """Thin wrapper around ``ComputeFbank``.

    Every option is forwarded positionally, in the order it is declared in
    this signature, and ``wav`` is appended as the final argument.

    Args:
        wav: Input handed to ``ComputeFbank`` as its last positional argument.
        samp_freq ... use_power: Options forwarded unchanged to
            ``ComputeFbank``; see the signature for their defaults.

    Returns:
        Whatever ``ComputeFbank`` returns.
    """
    # The binding is called positionally, so the order of this tuple must
    # stay in lock-step with the parameter order ComputeFbank expects.
    options = (
        samp_freq,
        frame_shift_ms,
        frame_length_ms,
        dither,
        preemph_coeff,
        remove_dc_offset,
        window_type,
        round_to_power_of_two,
        blackman_coeff,
        snip_edges,
        allow_downsample,
        allow_upsample,
        max_feature_vectors,
        num_bins,
        low_freq,
        high_freq,
        vtln_low,
        vtln_high,
        debug_mel,
        htk_mode,
        use_energy,
        energy_floor,
        raw_energy,
        htk_compat,
        use_log_fbank,
        use_power,
    )
    return ComputeFbank(*options, wav)
@module_utils.requires_kaldi()
def pitch(wav,
          samp_freq: int=16000,
          frame_shift_ms: float=10.0,
          frame_length_ms: float=25.0,
          preemph_coeff: float=0.0,
          min_f0: int=50,
          max_f0: int=400,
          soft_min_f0: float=10.0,
          penalty_factor: float=0.1,
          lowpass_cutoff: int=1000,
          resample_freq: int=4000,
          delta_pitch: float=0.005,
          nccf_ballast: int=7000,
          lowpass_filter_width: int=1,
          upsample_filter_width: int=5,
          max_frames_latency: int=0,
          frames_per_chunk: int=0,
          simulate_first_pass_online: bool=False,
          recompute_frame: int=500,
          nccf_ballast_online: bool=False,
          snip_edges: bool=True):
    """Thin wrapper around ``ComputeKaldiPitch``.

    Every option is forwarded positionally, in the order it is declared in
    this signature, and ``wav`` is appended as the final argument.

    Args:
        wav: Input handed to ``ComputeKaldiPitch`` as its last positional
            argument.
        samp_freq ... snip_edges: Options forwarded unchanged to
            ``ComputeKaldiPitch``; see the signature for their defaults.

    Returns:
        Whatever ``ComputeKaldiPitch`` returns.
    """
    # The binding is called positionally, so the order of this tuple must
    # stay in lock-step with the parameter order ComputeKaldiPitch expects.
    options = (
        samp_freq,
        frame_shift_ms,
        frame_length_ms,
        preemph_coeff,
        min_f0,
        max_f0,
        soft_min_f0,
        penalty_factor,
        lowpass_cutoff,
        resample_freq,
        delta_pitch,
        nccf_ballast,
        lowpass_filter_width,
        upsample_filter_width,
        max_frames_latency,
        frames_per_chunk,
        simulate_first_pass_online,
        recompute_frame,
        nccf_ballast_online,
        snip_edges,
    )
    return ComputeKaldiPitch(*options, wav)

@ -191,15 +191,15 @@ opts.preemph_coeff = preemph_coeff;
opts.snip_edges = snip_edges;
py::buffer_info info = wav.request();
kaldi::Vector<::kaldi::BaseFloat> input_wav(info.size);
::kaldi::Vector<::kaldi::BaseFloat> input_wav(info.size);
double* wav_ptr = (double*)info.ptr;
for (int idx = 0; idx < info.size; ++idx) {
input_wav(idx) = *wav_ptr;
wav_ptr++;
}
kaldi::Matrix<kaldi::BaseFloat> features;
kaldi::ComputeKaldiPitch(opts, input_wav, &features);
::kaldi::Matrix<::kaldi::BaseFloat> features;
::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
auto result = py::array_t<double>({features.NumRows(), features.NumCols()});
for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
for (int col_idx = 0; col_idx < features.NumCols(); ++col_idx) {

@ -4,15 +4,15 @@
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
#include "paddlespeech/audio/src/pybind/sox/io.h"
// Sox
PYBIND11_MODULE(_paddleaudio, m) {
// Sox
m.def("get_info_file",
&paddleaudio::sox_io::get_info_file,
"Get metadata of audio file.");
m.def("get_info_fileobj",
&paddleaudio::sox_io::get_info_fileobj,
"Get metadata of audio in file object.");
// kaldi feat
m.def("InitFbank", &paddleaudio::kaldi::InitFbank, "init fbank");
m.def("ResetFbank", &paddleaudio::kaldi::ResetFbank, "reset fbank");
m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");

@ -70,6 +70,7 @@ add_library(kaldi-feat-common STATIC
feat/feature-functions.cc
feat/feature-window.cc
feat/resample.cc
feat/pitch-functions.cc
feat/mel-computations.cc
feat/cmvn.cc
)
@ -113,4 +114,4 @@ target_link_libraries(libkaldi INTERFACE
gfortran
-Wl,--no-whole-archive -Wl,--end-group
)
target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")

@ -27,7 +27,7 @@
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
#include "itf/options-itf.h"
#include "util/options-itf.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction

@ -0,0 +1,125 @@
// itf/online-feature-itf.h
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_ITF_ONLINE_FEATURE_ITF_H_
#define KALDI_ITF_ONLINE_FEATURE_ITF_H_ 1
#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
namespace kaldi {
/// @ingroup Interfaces
/// @{
/**
OnlineFeatureInterface is an interface for online feature processing (it is
also usable in the offline setting, but currently we're not using it for
that). This is for use in the online2/ directory, and it supersedes the
interface in ../online/online-feat-input.h. We have a slightly different
model that puts more control in the hands of the calling thread, and won't
involve waiting on semaphores in the decoding thread.
This interface only specifies how the object *outputs* the features.
How it obtains the features, e.g. from a previous object or objects of type
OnlineFeatureInterface, is not specified in the interface and you will
likely define new constructors or methods in the derived type to do that.
You should appreciate that this interface is designed to allow random
access to features, as long as they are ready. That is, the user
can call GetFrame for any frame less than NumFramesReady(), and when
implementing a child class you must not make assumptions about the
order in which the user makes these calls.
*/
class OnlineFeatureInterface {
public:
virtual int32 Dim() const = 0; /// returns the feature dimension.
/// Returns the total number of frames, since the start of the utterance, that
/// are now available. In an online-decoding context, this will likely
/// increase with time as more data becomes available.
virtual int32 NumFramesReady() const = 0;
/// Returns true if this is the last frame. Frame indices are zero-based, so the
/// first frame is zero. IsLastFrame(-1) will return false, unless the file
/// is empty (which is a case that I'm not sure all the code will handle, so
/// be careful). This function may return false for some frame if
/// we haven't yet decided to terminate decoding, but later true if we decide
/// to terminate decoding. This function exists mainly to correctly handle
/// end effects in feature extraction, and is not a mechanism to determine how
/// many frames are in the decodable object (as it used to be, and for backward
/// compatibility, still is, in the Decodable interface).
virtual bool IsLastFrame(int32 frame) const = 0;
/// Gets the feature vector for this frame. Before calling this for a given
/// frame, it is assumed that you called NumFramesReady() and it returned a
/// number greater than "frame". Otherwise this call will likely crash with
/// an assert failure. This function is not declared const, in case there is
/// some kind of caching going on, but most of the time it shouldn't modify
/// the class.
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) = 0;
/// This is like GetFrame() but for a collection of frames. There is a
/// default implementation that just gets the frames one by one, but it
/// may be overridden for efficiency by child classes (since sometimes
/// it's more efficient to do things in a batch).
virtual void GetFrames(const std::vector<int32> &frames,
MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(static_cast<int32>(frames.size()) == feats->NumRows());
for (size_t i = 0; i < frames.size(); i++) {
SubVector<BaseFloat> feat(*feats, i);
GetFrame(frames[i], &feat);
}
}
// Returns frame shift in seconds. Helps to estimate duration from frame
// counts.
virtual BaseFloat FrameShiftInSeconds() const = 0;
/// Virtual destructor. Note: constructors that take another member of
/// type OnlineFeatureInterface are not expected to take ownership of
/// that pointer; the caller needs to keep track of that manually.
virtual ~OnlineFeatureInterface() { }
};
/// Add a virtual class for "source" features such as MFCC or PLP or pitch
/// features.
class OnlineBaseFeature : public OnlineFeatureInterface {
 public:
  /// Called by the application whenever more wave data is available.
  /// Note: sampling_rate is typically only supplied so that the code can
  /// assert that it matches the sampling rate expected in the options.
  virtual void AcceptWaveform(BaseFloat sampling_rate,
                              const VectorBase<BaseFloat> &waveform) = 0;

  /// Tells the object that no more waveform will be provided.  This helps
  /// flush out the last few frames of delta or LDA features (it will
  /// typically affect the return value of IsLastFrame()).
  virtual void InputFinished() = 0;
};
/// @}
} // namespace kaldi
#endif // KALDI_ITF_ONLINE_FEATURE_ITF_H_

@ -34,7 +34,7 @@
#include "feat/feature-mfcc.h"
#include "feat/feature-plp.h"
#include "feat/feature-fbank.h"
#include "itf/online-feature-itf.h"
#include "feat/online-feature-itf.h"
namespace kaldi {
/// @addtogroup onlinefeat OnlineFeatureExtraction

@ -31,7 +31,7 @@
#include "base/kaldi-error.h"
#include "feat/mel-computations.h"
#include "itf/online-feature-itf.h"
#include "feat/online-feature-itf.h"
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"

Loading…
Cancel
Save