add kaldi fbank to paddleaudio

pull/2051/head
Yang Zhou 3 years ago
parent 8641608f08
commit ee4d7bb402

@ -0,0 +1,37 @@
# Top-level build script for paddleaudio: configures the toolchain, pulls in
# the external dependencies (OpenBLAS, pybind11) and builds the bundled Kaldi
# sources together with the kaldi_frontend Python binding.
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(paddleaudio VERSION 0.1)

# Directory holding this project's helper CMake modules.
set(paddleaudio_cmake_dir ${PROJECT_SOURCE_DIR}/cmake)

# Let include(<name>) resolve modules from cmake/ and cmake/external/.
list(APPEND CMAKE_MODULE_PATH ${paddleaudio_cmake_dir}/external)
list(APPEND CMAKE_MODULE_PATH ${paddleaudio_cmake_dir})

include(FetchContent)
include(ExternalProject)

# FetchContent downloads go to <project>/fc_patch; progress output stays on.
# PROJECT_SOURCE_DIR (not CMAKE_SOURCE_DIR) keeps the location stable when
# paddleaudio is itself added as a subdirectory of a larger build.
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${PROJECT_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})

# Language standard and PIC are expressed through CMake's own switches so the
# right compiler-specific flags are generated.
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# NOTE(review): -O0 -g forces an unoptimised debug build for every
# configuration that does not override the optimisation level; kept as in the
# original, confirm this is intended.
string(APPEND CMAKE_CXX_FLAGS " -pthread -O0 -Wall -g")

# Compile Kaldi without OpenFST; see third_party/kaldi/base/kaldi-types.h.
add_compile_definitions(COMPILE_WITHOUT_OPENFST)

include(openblas)
include(pybind)

# Directory containing Python.h. The default matches the previously
# hard-coded location; override with -DPADDLEAUDIO_PYTHON_INCLUDE_DIR=<dir>
# on systems with a different Python installation.
set(PADDLEAUDIO_PYTHON_INCLUDE_DIR "/usr/include/python3.7m"
    CACHE PATH "Directory containing Python.h")

include_directories(
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}/third_party/kaldi
)
include_directories(${PADDLEAUDIO_PYTHON_INCLUDE_DIR})

add_subdirectory(third_party)
add_subdirectory(kaldi_frontend)

@ -0,0 +1,20 @@
# Builds the Kaldi fbank front end in three flavours: a C++ library
# (kaldi_feature), a Python extension module (kaldi_featurepy) and a small
# command-line driver (kaldi_feature_main).
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

# Python runtime library linked into the stand-alone executable. The default
# matches the previously hard-coded name; override with
# -DPADDLEAUDIO_PYTHON_LIBRARY=<name-or-path> for other Python versions.
set(PADDLEAUDIO_PYTHON_LIBRARY "python3.7m"
    CACHE STRING "Python library linked into kaldi_feature_main")

add_library(kaldi_feature
  kaldi_feature.cc
  kaldi_feature_wrapper.cc
)
# PUBLIC so that everything linking kaldi_feature also sees these headers.
target_include_directories(kaldi_feature PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(kaldi_feature PUBLIC kaldi-fbank)

# The module lists the sources itself (in addition to linking the library) so
# that the module init function defined in kaldi_feature.cc is always part of
# the extension.
pybind11_add_module(kaldi_featurepy kaldi_feature.cc kaldi_feature_wrapper.cc)
target_link_libraries(kaldi_featurepy PRIVATE kaldi_feature)

set(bin_name kaldi_feature_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
# Nothing links against an executable, so its dependencies are PRIVATE.
target_link_libraries(${bin_name}
  PRIVATE
    kaldi_feature
    ${PADDLEAUDIO_PYTHON_LIBRARY}
)

@ -0,0 +1,52 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "feat/feature-window.h"
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace paddleaudio {

// Alias visible to every header that declares pybind11 types inside
// namespace paddleaudio.
namespace py = pybind11;

// Streaming feature extractor built on top of a Kaldi feature computer `F`.
//
// `F` must provide an `Options` type, be constructible from it, and expose
// Dim(), GetFrameOptions(), NeedRawLogEnergy() and Compute() as used by the
// member definitions in feature_common_inl.h. Samples that do not fill a
// whole frame shift are kept between calls so that audio can be fed in chunks.
template <class F>
class StreamingFeatureTpl {
  public:
    using Options = typename F::Options;

    // Creates the computer and the window function from `opts`.
    StreamingFeatureTpl(const Options& opts);

    // Appends `wav` to the samples cached by the previous call and extracts
    // features from the combined signal. The result is written to `*feats`
    // as one flat vector holding num_frames * Dim() values, frame after
    // frame. Returns false if `wav` is empty.
    bool ComputeFeature(const kaldi::VectorBase<kaldi::BaseFloat>& wav,
                        kaldi::Vector<kaldi::BaseFloat>* feats);

    // Drops the cached samples so the next call starts a new utterance.
    void Reset() { remained_wav_.Resize(0); }

    // Number of values produced per frame.
    int Dim() const { return computer_.Dim(); }

  private:
    // Extracts features for every frame of `waves` into `*feats`; returns
    // false when `waves` is shorter than one analysis window.
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                 kaldi::Vector<kaldi::BaseFloat>* feats);

    // Members are initialised in exactly this order.
    Options opts_;
    kaldi::FeatureWindowFunction window_function_;
    // Samples left over from the previous ComputeFeature() call.
    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
    F computer_;
};

}  // namespace paddleaudio
#include "kaldi_frontend/feature_common_inl.h"

@ -0,0 +1,92 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
namespace paddleaudio {
// Builds the window function and the feature computer from `opts`.
//
// The initialiser list follows the member declaration order (opts_,
// window_function_, computer_), which is the order C++ actually uses.
// Because window_function_ is therefore constructed before computer_, it
// must take its frame options from `opts` directly; reading them through
// computer_.GetFrameOptions() would access a not-yet-constructed object.
template <class F>
StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
    : opts_(opts),
      window_function_(opts.frame_opts),
      computer_(opts) {}
// Streaming entry point: prepends the samples cached by the previous call to
// `wav`, caches the tail that does not belong to a completed frame shift for
// the next call, and computes features for the combined signal.
//
// Returns false when `wav` is empty or when the combined signal is still too
// short to produce a frame (the samples stay cached in that case); returns
// true when features were written to `*feats`.
template <class F>
bool StreamingFeatureTpl<F>::ComputeFeature(
    const kaldi::VectorBase<kaldi::BaseFloat>& wav,
    kaldi::Vector<kaldi::BaseFloat>* feats) {
    const kaldi::int32 wav_len = wav.Dim();
    if (wav_len == 0) return false;

    // Concatenate the remaining samples of the previous call with the new ones.
    const kaldi::int32 left_len = remained_wav_.Dim();
    kaldi::Vector<kaldi::BaseFloat> waves(left_len + wav_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, wav_len).CopyFromVec(wav);

    // Work out how many samples the frames of this call consume.
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const kaldi::int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    const kaldi::int32 frame_shift = frame_opts.WindowShift();
    kaldi::int32 consumed = frame_shift * num_frames;
    // The frame count may be rounded up (e.g. with snip_edges == false), so
    // the product can exceed the signal length; clamp to keep the range valid.
    if (consumed > waves.Dim()) consumed = waves.Dim();

    // Cache the unconsumed tail for the next call.
    const kaldi::int32 left_samples = waves.Dim() - consumed;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));

    // Compute the features and report whether any frame was produced.
    return Compute(waves, feats);
}
// Computes the features of every frame contained in `waves`.
//
// On success `*feats` holds num_frames * Dim() values, one frame after the
// other, and true is returned. When `waves` is shorter than one analysis
// window, `*feats` is emptied and false is returned.
template <class F>
bool StreamingFeatureTpl<F>::Compute(
    const kaldi::Vector<kaldi::BaseFloat>& waves,
    kaldi::Vector<kaldi::BaseFloat>* feats) {
    const kaldi::BaseFloat vtln_warp = 1.0;
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();

    const kaldi::int32 num_samples = waves.Dim();
    if (num_samples < frame_opts.WindowSize()) {
        // Leave the output in a defined state rather than with stale data.
        feats->Resize(0);
        return false;
    }

    const kaldi::int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
    const kaldi::int32 feat_dim = Dim();
    feats->Resize(num_frames * feat_dim);

    kaldi::Vector<kaldi::BaseFloat> window;
    const bool need_raw_log_energy = computer_.NeedRawLogEnergy();
    for (kaldi::int32 frame = 0; frame < num_frames; frame++) {
        kaldi::BaseFloat raw_log_energy = 0.0;
        kaldi::ExtractWindow(0,
                             waves,
                             frame,
                             frame_opts,
                             window_function_,
                             &window,
                             need_raw_log_energy ? &raw_log_energy : nullptr);

        // Let the computer write straight into this frame's slice of the
        // output instead of filling a temporary vector and copying it.
        kaldi::SubVector<kaldi::BaseFloat> output_row(
            feats->Data() + frame * feat_dim, feat_dim);
        computer_.Compute(raw_log_energy, vtln_warp, &window, &output_row);
    }
    return true;
}
} // namespace paddleaudio

@ -0,0 +1,143 @@
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "kaldi_feature_wrapper.h"
namespace py=pybind11;
bool InitFbank(
float samp_freq, // frame opts
float frame_shift_ms,
float frame_length_ms,
float dither,
float preemph_coeff,
bool remove_dc_offset,
std::string window_type, // e.g. Hamming window
bool round_to_power_of_two,
float blackman_coeff,
bool snip_edges,
bool allow_downsample,
bool allow_upsample,
int max_feature_vectors,
int num_bins, // mel opts
float low_freq,
float high_freq,
float vtln_low,
float vtln_high,
bool debug_mel,
bool htk_mode,
bool use_energy, // fbank opts
float energy_floor,
bool raw_energy,
bool htk_compat,
bool use_log_fbank,
bool use_power) {
kaldi::FbankOptions opts;
opts.frame_opts.samp_freq = samp_freq; // frame opts
opts.frame_opts.frame_shift_ms = frame_shift_ms;
opts.frame_opts.frame_length_ms = frame_length_ms;
opts.frame_opts.dither = dither;
opts.frame_opts.preemph_coeff = preemph_coeff;
opts.frame_opts.remove_dc_offset = remove_dc_offset;
opts.frame_opts.window_type = window_type;
opts.frame_opts.round_to_power_of_two = round_to_power_of_two;
opts.frame_opts.blackman_coeff = blackman_coeff;
opts.frame_opts.snip_edges = snip_edges;
opts.frame_opts.allow_downsample = allow_downsample;
opts.frame_opts.allow_upsample = allow_upsample;
opts.frame_opts.max_feature_vectors = max_feature_vectors;
opts.mel_opts.num_bins = num_bins; // mel opts
opts.mel_opts.low_freq = low_freq;
opts.mel_opts.high_freq = high_freq;
opts.mel_opts.vtln_low = vtln_low;
opts.mel_opts.vtln_high = vtln_high;
opts.mel_opts.debug_mel = debug_mel;
opts.mel_opts.htk_mode = htk_mode;
opts.use_energy = use_energy; // fbank opts
opts.energy_floor = energy_floor;
opts.raw_energy = raw_energy;
opts.htk_compat = htk_compat;
opts.use_log_fbank = use_log_fbank;
opts.use_power = use_power;
paddleaudio::KaldiFeatureWrapper::GetInstance()->InitFbank(opts);
return true;
}
// Feeds one chunk of audio to the extractor held by the KaldiFeatureWrapper
// singleton and returns whatever that call produces. The extractor has to be
// set up with InitFbank() beforehand.
py::array_t<double> ComputeFbankStreaming(const py::array_t<double>& wav) {
    paddleaudio::KaldiFeatureWrapper* wrapper =
        paddleaudio::KaldiFeatureWrapper::GetInstance();
    return wrapper->ComputeFbank(wav);
}
// One-shot fbank computation: (re)initialises the shared extractor with the
// given options, runs it on `wav`, then resets the extractor's streaming
// state before returning the features.
py::array_t<double> ComputeFbank(
    float samp_freq,  // frame opts
    float frame_shift_ms,
    float frame_length_ms,
    float dither,
    float preemph_coeff,
    bool remove_dc_offset,
    std::string window_type,  // e.g. Hamming window
    bool round_to_power_of_two,
    float blackman_coeff,
    bool snip_edges,
    bool allow_downsample,
    bool allow_upsample,
    int max_feature_vectors,
    int num_bins,  // mel opts
    float low_freq,
    float high_freq,
    float vtln_low,
    float vtln_high,
    bool debug_mel,
    bool htk_mode,
    bool use_energy,  // fbank opts
    float energy_floor,
    bool raw_energy,
    bool htk_compat,
    bool use_log_fbank,
    bool use_power,
    const py::array_t<double>& wav) {
    // Step 1: configure the shared extractor.
    InitFbank(samp_freq, frame_shift_ms, frame_length_ms, dither,
              preemph_coeff, remove_dc_offset, window_type,
              round_to_power_of_two, blackman_coeff, snip_edges,
              allow_downsample, allow_upsample, max_feature_vectors,
              num_bins, low_freq, high_freq, vtln_low, vtln_high, debug_mel,
              htk_mode,
              use_energy, energy_floor, raw_energy, htk_compat, use_log_fbank,
              use_power);

    // Step 2: extract the features, then clear the cached samples.
    paddleaudio::KaldiFeatureWrapper* wrapper =
        paddleaudio::KaldiFeatureWrapper::GetInstance();
    py::array_t<double> feats = wrapper->ComputeFbank(wav);
    wrapper->ResetFbank();
    return feats;
}
void ResetFbank() {
paddleaudio::KaldiFeatureWrapper::GetInstance()->ResetFbank();
}
// Python module definition: exposes the four free functions of this file
// under the module name kaldi_featurepy.
PYBIND11_MODULE(kaldi_featurepy, m) {
    m.doc() = "kaldi_feature example";

    // Extractor life cycle.
    m.def("InitFbank",
          &InitFbank,
          "init fbank");
    m.def("ResetFbank",
          &ResetFbank,
          "reset fbank");

    // Feature computation: one-shot and chunk-wise.
    m.def("ComputeFbank",
          &ComputeFbank,
          "compute fbank");
    m.def("ComputeFbankStreaming",
          &ComputeFbankStreaming,
          "compute fbank streaming");
}

@ -0,0 +1,71 @@
// Declarations of the free functions that are exported to Python by the
// kaldi_featurepy module.
#pragma once

#include <string>

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "kaldi_feature_wrapper.h"

namespace py=pybind11;

// Configures the shared fbank extractor from a flat list of Kaldi frame,
// mel and fbank options.
bool InitFbank(
    float samp_freq,  // frame opts
    float frame_shift_ms,
    float frame_length_ms,
    float dither,
    float preemph_coeff,
    bool remove_dc_offset,
    std::string window_type,  // e.g. Hamming window
    bool round_to_power_of_two,
    float blackman_coeff,
    bool snip_edges,
    bool allow_downsample,
    bool allow_upsample,
    int max_feature_vectors,
    int num_bins,  // mel opts
    float low_freq,
    float high_freq,
    float vtln_low,
    float vtln_high,
    bool debug_mel,
    bool htk_mode,
    bool use_energy,  // fbank opts
    float energy_floor,
    bool raw_energy,
    bool htk_compat,
    bool use_log_fbank,
    bool use_power);

// One-shot variant: takes the same options as InitFbank() followed by the
// waveform to process.
py::array_t<double> ComputeFbank(
    float samp_freq,  // frame opts
    float frame_shift_ms,
    float frame_length_ms,
    float dither,
    float preemph_coeff,
    bool remove_dc_offset,
    std::string window_type,  // e.g. Hamming window
    bool round_to_power_of_two,
    float blackman_coeff,  // same type as kaldi::BaseFloat (float)
    bool snip_edges,
    bool allow_downsample,
    bool allow_upsample,
    int max_feature_vectors,
    int num_bins,  // mel opts
    float low_freq,
    float high_freq,
    float vtln_low,
    float vtln_high,
    bool debug_mel,
    bool htk_mode,
    bool use_energy,  // fbank opts
    float energy_floor,
    bool raw_energy,
    bool htk_compat,
    bool use_log_fbank,
    bool use_power,
    const py::array_t<double>& wav);

// Processes one chunk of audio with the extractor set up by InitFbank().
py::array_t<double> ComputeFbankStreaming(const py::array_t<double>& wav);

// Resets the shared extractor.
void ResetFbank();

// NOTE(review): no definition of TestFun is visible next to the other
// functions of this interface; confirm one exists before calling it.
py::array_t<double> TestFun(const py::array_t<double>& wav);

@ -0,0 +1,57 @@
#include "kaldi_feature_wrapper.h"
namespace paddleaudio {
// Returns the address of the single wrapper object, which lives in a
// function-local static and is created the first time this is called.
KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
    static KaldiFeatureWrapper singleton;
    KaldiFeatureWrapper* const handle = &singleton;
    return handle;
}
// Replaces the current fbank extractor (if any) with a new one created from
// `opts`. Always returns true.
bool KaldiFeatureWrapper::InitFbank(kaldi::FbankOptions opts) {
    std::unique_ptr<Fbank> fresh(new Fbank(opts));
    fbank_ = std::move(fresh);
    return true;
}
// Runs the fbank extractor on `wav` and returns the features as a
// two-dimensional array with one row per frame and Dim() columns.
//
// An empty array is returned when the extractor has not been initialised
// with InitFbank(), when the extractor reports failure, or when it produced
// no features.
py::array_t<double> KaldiFeatureWrapper::ComputeFbank(const py::array_t<double> wav) {
    // Without an extractor there is nothing to compute; use the same "empty
    // result" convention as the other failure paths.
    if (!fbank_) return py::array_t<double>();

    // Copy the samples into a Kaldi vector, narrowing double -> BaseFloat.
    // NOTE(review): the elements are read linearly from wav.data(), which
    // assumes a densely packed buffer - confirm callers never pass a strided
    // view.
    const kaldi::int32 num_samples = static_cast<kaldi::int32>(wav.size());
    kaldi::Vector<kaldi::BaseFloat> input_wav(num_samples);
    const double* src = wav.data();
    for (kaldi::int32 idx = 0; idx < num_samples; ++idx) {
        input_wav(idx) = static_cast<kaldi::BaseFloat>(src[idx]);
    }

    kaldi::Vector<kaldi::BaseFloat> feats;
    const bool ok = fbank_->ComputeFeature(input_wav, &feats);
    if (!ok || feats.Dim() == 0) return py::array_t<double>();

    // Copy the flat feature vector into a fresh numpy array, widening
    // BaseFloat -> double.
    py::array_t<double> result(feats.Dim());
    double* dst = result.mutable_data();
    for (kaldi::int32 idx = 0; idx < feats.Dim(); ++idx) {
        dst[idx] = feats(idx);
    }

    // Present the flat buffer as (num_frames, feature_dim).
    const int feat_dim = Dim();
    return result.reshape({feats.Dim() / feat_dim, feat_dim});
}
} // namespace paddleaudio

@ -0,0 +1,28 @@
#pragma once

#include <memory>

#include "base/kaldi-common.h"
#include "kaldi_frontend/feature_common.h"
#include "feat/feature-fbank.h"

namespace paddleaudio {

// Streaming fbank extractor built from Kaldi's FbankComputer.
typedef StreamingFeatureTpl<kaldi::FbankComputer> Fbank;

// Owner of the single fbank extractor that backs the Python bindings.
// Obtain the shared object with GetInstance() and call InitFbank() before
// computing features.
class KaldiFeatureWrapper {
  public:
    // Returns the shared wrapper object.
    static KaldiFeatureWrapper* GetInstance();

    // Creates (or replaces) the extractor using `opts`.
    bool InitFbank(kaldi::FbankOptions opts);

    // Computes features for `wav`; `py` is the pybind11 alias declared in
    // namespace paddleaudio by kaldi_frontend/feature_common.h.
    py::array_t<double> ComputeFbank(const py::array_t<double> wav);

    // Values per frame of the current extractor, or 0 when InitFbank() has
    // not been called yet.
    int Dim() {
        return fbank_ ? fbank_->Dim() : 0;
    }

    // Resets the extractor; does nothing when InitFbank() has not been
    // called yet.
    void ResetFbank() {
        if (fbank_) fbank_->Reset();
    }

  private:
    // Null until InitFbank() is called.
    std::unique_ptr<paddleaudio::Fbank> fbank_;
};

}  // namespace paddleaudio

@ -0,0 +1,8 @@
# Build script for the bundled third-party sources.
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

# Header search roots for everything built below this directory: the
# third_party folder itself and the Kaldi source tree inside it.
set(third_party_root "${CMAKE_CURRENT_SOURCE_DIR}")
include_directories("${third_party_root}" "${third_party_root}/kaldi")

# Kaldi libraries (base, matrix, util, feature extraction).
add_subdirectory(kaldi)

@ -0,0 +1,66 @@
# Builds the subset of Kaldi needed for feature extraction. Targets are
# listed bottom-up, so every library is defined before anything links to it.
project(kaldi)

# Core utilities: error handling, math helpers, timers, basic I/O.
add_library(kaldi-base
  base/io-funcs.cc
  base/kaldi-error.cc
  base/kaldi-math.cc
  base/kaldi-utils.cc
  base/timer.cc
)

# Vector/matrix classes and FFT.
add_library(kaldi-matrix
  matrix/compressed-matrix.cc
  matrix/kaldi-matrix.cc
  matrix/kaldi-vector.cc
  matrix/matrix-functions.cc
  matrix/optimization.cc
  matrix/packed-matrix.cc
  matrix/qr.cc
  matrix/sparse-matrix.cc
  matrix/sp-matrix.cc
  matrix/srfft.cc
  matrix/tp-matrix.cc
)
# Explicit PUBLIC keeps the transitive linking of the keyword-less form and
# matches every other target in this file.
target_link_libraries(kaldi-matrix
  PUBLIC
    gfortran
    kaldi-base
    libopenblas.a
)

# Table/stream I/O and option parsing.
add_library(kaldi-util
  util/kaldi-holder.cc
  util/kaldi-io.cc
  util/kaldi-semaphore.cc
  util/kaldi-table.cc
  util/kaldi-thread.cc
  util/parse-options.cc
  util/simple-io-funcs.cc
  util/simple-options.cc
  util/text-utils.cc
)
target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)

# Code shared by all feature types: windowing, mel banks, resampling, CMVN.
add_library(kaldi-feat-common
  feat/wave-reader.cc
  feat/signal.cc
  feat/feature-functions.cc
  feat/feature-window.cc
  feat/resample.cc
  feat/mel-computations.cc
  feat/cmvn.cc
)
target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)

# MFCC features.
add_library(kaldi-mfcc
  feat/feature-mfcc.cc
)
target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)

# Filter bank (fbank) features.
add_library(kaldi-fbank
  feat/feature-fbank.cc
)
target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)

@ -42,6 +42,8 @@ typedef float BaseFloat;
// for discussion on what to do if you need to compile kaldi
// without OpenFST, see the bottom of this file
#ifndef COMPILE_WITHOUT_OPENFST
#include <fst/types.h>
namespace kaldi {
@ -55,9 +57,10 @@ namespace kaldi {
typedef double double64;
} // end namespace kaldi
#else
// In a theoretical case you decide compile Kaldi without the OpenFST
// comment the previous namespace statement and uncomment the following
/*
namespace kaldi {
typedef int8_t int8;
typedef int16_t int16;
@ -71,6 +74,6 @@ namespace kaldi {
typedef float float32;
typedef double double64;
} // end namespace kaldi
*/
#endif
#endif // KALDI_BASE_KALDI_TYPES_H_

Loading…
Cancel
Save