diff --git a/audio/CMakeLists.txt b/audio/CMakeLists.txt index 8d3c2f2e0..c2d64adec 100644 --- a/audio/CMakeLists.txt +++ b/audio/CMakeLists.txt @@ -1,16 +1,13 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) -project(paddlespeech) +project(paddleaudio VERSION 0.1) -# check and set CMAKE_CXX_STANDARD string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard) set(CMAKE_CXX_STANDARD 14) set(CMAKE_C_STANDARD 11) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) - -# Options option(BUILD_SOX "Build libsox statically" ON) add_subdirectory(third_party) diff --git a/audio/third_party/CMakeLists.txt b/audio/third_party/CMakeLists.txt index 3eeb1a972..c89a5c3df 100644 --- a/audio/third_party/CMakeLists.txt +++ b/audio/third_party/CMakeLists.txt @@ -1,3 +1,4 @@ + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") add_library(libsox INTERFACE) diff --git a/paddlespeech/audio/.gitignore b/paddlespeech/audio/.gitignore new file mode 100644 index 000000000..24343e79b --- /dev/null +++ b/paddlespeech/audio/.gitignore @@ -0,0 +1 @@ +fc_patch/ diff --git a/paddlespeech/audio/CMakeLists.txt b/paddlespeech/audio/CMakeLists.txt new file mode 100644 index 000000000..9515ad0fe --- /dev/null +++ b/paddlespeech/audio/CMakeLists.txt @@ -0,0 +1,39 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +project(paddleaudio VERSION 0.1) + +string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard) + +# cmake dir +set(paddleaudio_cmake_dir ${PROJECT_SOURCE_DIR}/cmake) + +# Modules +list(APPEND CMAKE_MODULE_PATH ${paddleaudio_cmake_dir}/external) +list(APPEND CMAKE_MODULE_PATH ${paddleaudio_cmake_dir}) +include(FetchContent) +include(ExternalProject) + +# fc_patch dir +set(FETCHCONTENT_QUIET off) +get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") +set(FETCHCONTENT_BASE_DIR ${fc_patch}) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -fPIC -O0 -Wall -g") +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_C_STANDARD 11) 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +option(BUILD_SOX "Build libsox statically" ON) + + +# checkout the thirdparty/kaldi/base/kaldi-types.h +# compile kaldi without openfst +add_definitions("-DCOMPILE_WITHOUT_OPENFST") + +include(openblas) +include(pybind) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/third_party/kaldi) +include_directories(/usr/include/python3.7m) +add_subdirectory(third_party) +add_subdirectory(csrc) diff --git a/paddlespeech/audio/cmake/FindGFortranLibs.cmake b/paddlespeech/audio/cmake/FindGFortranLibs.cmake new file mode 100644 index 000000000..763f78833 --- /dev/null +++ b/paddlespeech/audio/cmake/FindGFortranLibs.cmake @@ -0,0 +1,145 @@ +#.rst: +# FindGFortranLibs +# -------- +# https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake +# https://enccs.github.io/cmake-workshop/cxx-fortran/ +# +# Find gcc Fortran compiler & library paths +# +# The module defines the following variables: +# +# :: +# +# +# GFORTRANLIBS_FOUND - true if system has gfortran +# LIBGFORTRAN_LIBRARIES - path to libgfortran +# LIBQUADMATH_LIBRARIES - path to libquadmath +# GFORTRAN_LIBRARIES_DIR - directory containing libgfortran, libquadmath +# GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers +# LIBGOMP_LIBRARIES - path to libgomp +# LIBGOMP_INCLUDE_DIR - directory containing omp.h header +# GFORTRAN_VERSION_STRING - version of gfortran found +# +set(CMAKE_REQUIRED_QUIET ${LIBIOMP_FIND_QUIETLY}) + +if(NOT CMAKE_REQUIRED_QUIET) + message(STATUS "Looking for gfortran related libraries...") +endif() + +enable_language(Fortran) +if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU") + + # Basically, call "gfortran -v" to dump compiler info to the string + # GFORTRAN_VERBOSE_STR, which will be used to get necessary paths + message(STATUS "Extracting library and header information by calling 'gfortran -v'...") + execute_process(COMMAND 
"${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE + GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG) + + # For debugging + message(STATUS "'gfortran -v' returned:") + message(STATUS "${GFORTRAN_VERBOSE_STR}") + + # Detect gfortran version + string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}") + string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}") + message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}") + unset(GFORTRAN_VER_STR) + + set(MATCH_REGEX "[^\t\n ]+[\t\n ]+") + set(REPLACE_REGEX "([^\t\n ]+)") + + # Find architecture for compiler + string(REGEX MATCH "Target: [^\t\n ]+" + GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}") + message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}") + string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1" + GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}") + message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}") + unset(GFORTRAN_ARCH_STR) + + # Find install prefix, if it exists; if not, use default + string(REGEX MATCH "--prefix=[^\t\n ]+[\t\n ]+" + GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}") + if(NOT GFORTRAN_PREFIX_STR) + message(STATUS "Detected default gfortran prefix") + set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install + else() + string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1" + GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}") + endif() + message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}") + unset(GFORTRAN_PREFIX_STR) + + # Find install exec-prefix, if it exists; if not, use default + string(REGEX MATCH "--exec-prefix=[^\t\n ]+[\t\n ]+" "\\1" + GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}") + if(NOT GFORTRAN_EXEC_PREFIX_STR) + message(STATUS "Detected default gfortran exec-prefix") + set(GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_PREFIX_DIR}") + else() + string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1" + GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}") + endif() + message(STATUS "Detected 
gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}") + UNSET(GFORTRAN_EXEC_PREFIX_STR) + + # Find library directory and include directory, if library directory specified + string(REGEX MATCH "--libdir=[^\t\n ]+" + GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}") + if(NOT GFORTRAN_LIB_DIR_STR) + message(STATUS "Found --libdir flag -- not found") + message(STATUS "Using default gfortran library & include directory paths") + set(GFORTRAN_LIBRARIES_DIR + "${GFORTRAN_EXEC_PREFIX_DIR}/lib/gcc/${GFORTRAN_ARCH}/${GFORTRAN_VERSION_STRING}") + string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/include") + else() + message(STATUS "Found --libdir flag -- yes") + string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1" + GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}") + string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include") + endif() + message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}") + message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}") + unset(GFORTRAN_LIB_DIR_STR) + + # There are lots of other build options for gcc & gfortran. For now, the + # options implemented above should cover a lot of common use cases. 
+ + # Clean up be deleting the output string from "gfortran -v" + unset(GFORTRAN_VERBOSE_STR) + + # Find paths for libgfortran, libquadmath, libgomp + # libgomp needed for OpenMP support without Clang + find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran + HINTS ${GFORTRAN_LIBRARIES_DIR}) + find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath + HINTS ${GFORTRAN_LIBRARIES_DIR}) + find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp + HINTS ${GFORTRAN_LIBRARIES_DIR}) + + # Find OpenMP headers + find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR}) + +else() + message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!") +endif() + +include(FindPackageHandleStandardArgs) + +# Required: libgfortran, libquadmath, path for gfortran libraries +# Optional: libgomp, path for OpenMP headers, path for gcc/gfortran headers +find_package_handle_standard_args(GFortranLibs + REQUIRED_VARS LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES GFORTRAN_LIBRARIES_DIR + VERSION_VAR GFORTRAN_VERSION_STRING) + +if(GFORTRANLIBS_FOUND) + message(STATUS "Looking for gfortran libraries -- found") + message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}") +else() + message(STATUS "Looking for gfortran libraries -- not found") +endif() + +mark_as_advanced(LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES + LIBGOMP_LIBRARIES LIBGOMP_INCLUDE_DIR + GFORTRAN_LIBRARIES_DIR GFORTRAN_INCLUDE_DIR) +# FindGFortranLIBS.cmake ends here \ No newline at end of file diff --git a/paddlespeech/audio/cmake/external/openblas.cmake b/paddlespeech/audio/cmake/external/openblas.cmake new file mode 100644 index 000000000..0c204f3a2 --- /dev/null +++ b/paddlespeech/audio/cmake/external/openblas.cmake @@ -0,0 +1,58 @@ +include(FetchContent) + +set(OpenBLAS_SOURCE_DIR ${fc_patch}/OpenBLAS-src) +set(OpenBLAS_PREFIX ${fc_patch}/OpenBLAS-prefix) + +# ###################################################################################################################### +# OPENBLAS 
https://github.com/lattice/quda/blob/develop/CMakeLists.txt#L575 +# ###################################################################################################################### +enable_language(Fortran) + +include(FortranCInterface) + +# # Clang doesn't have a Fortran compiler in its suite (yet), +# # so detect libraries for gfortran; we need equivalents to +# # libgfortran and libquadmath, which are implicitly +# # linked by flags in CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES +# include(FindGFortranLibs REQUIRED) +# # Add directory containing libgfortran and libquadmath to +# # linker. Should also contain libgomp, if not using +# # Intel OpenMP runtime +# link_directories(${GFORTRAN_LIBRARIES_DIR}) +# # gfortan dir in the docker. +# link_directories(/usr/local/gcc-8.2/lib64) +# # if you are working with C and Fortran +# FortranCInterface_VERIFY() + +# # if you are working with C++ and Fortran +# FortranCInterface_VERIFY(CXX) + + +#TODO: switch to CPM +include(GNUInstallDirs) +ExternalProject_Add( + OPENBLAS + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG v0.3.10 + GIT_SHALLOW YES + PREFIX ${OpenBLAS_PREFIX} + SOURCE_DIR ${OpenBLAS_SOURCE_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + CMAKE_GENERATOR "Unix Makefiles") + + +# https://cmake.org/cmake/help/latest/module/ExternalProject.html?highlight=externalproject_get_property#external-project-definition +ExternalProject_Get_Property(OPENBLAS INSTALL_DIR) +set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR}) +add_library(openblas STATIC IMPORTED) +add_dependencies(openblas OPENBLAS) +set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran) +# ${CMAKE_INSTALL_LIBDIR} lib +set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libopenblas.a) + + +# https://cmake.org/cmake/help/latest/command/install.html?highlight=cmake_install_libdir#installing-targets +# ${CMAKE_INSTALL_LIBDIR} lib +# ${CMAKE_INSTALL_INCLUDEDIR} include 
+link_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) +include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/openblas) diff --git a/paddlespeech/audio/cmake/external/pybind.cmake b/paddlespeech/audio/cmake/external/pybind.cmake new file mode 100644 index 000000000..2d74e61eb --- /dev/null +++ b/paddlespeech/audio/cmake/external/pybind.cmake @@ -0,0 +1,9 @@ +include(FetchContent) +FetchContent_Declare( + pybind + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.0.zip + URL_HASH SHA256=1c6e0141f7092867c5bf388bc3acdb2689ed49f59c3977651394c6c87ae88232 +) +FetchContent_MakeAvailable(pybind) +include_directories(${pybind_SOURCE_DIR}/include) + diff --git a/paddlespeech/audio/csrc/CMakeLists.txt b/paddlespeech/audio/csrc/CMakeLists.txt index aa77302ab..cbced3d02 100644 --- a/paddlespeech/audio/csrc/CMakeLists.txt +++ b/paddlespeech/audio/csrc/CMakeLists.txt @@ -32,3 +32,5 @@ if(BUILD_SOX) "${LINK_LIBRARIES}" ) endif() + +add_subdirectory(pybind/kaldi_frontend) diff --git a/paddlespeech/audio/csrc/pybind/kaldi_frontend/CMakeLists.txt b/paddlespeech/audio/csrc/pybind/kaldi_frontend/CMakeLists.txt new file mode 100644 index 000000000..7cb240aab --- /dev/null +++ b/paddlespeech/audio/csrc/pybind/kaldi_frontend/CMakeLists.txt @@ -0,0 +1,15 @@ + +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +include_directories( +${CMAKE_CURRENT_SOURCE_DIR} +) + +add_library(kaldi_feature + kaldi_feature.cc + kaldi_feature_wrapper.cc +) +target_link_libraries(kaldi_feature kaldi-fbank) + +pybind11_add_module(kaldi_frontend kaldi_feature.cc kaldi_feature_wrapper.cc) +target_link_libraries(kaldi_frontend PRIVATE kaldi_feature) diff --git a/paddlespeech/audio/csrc/pybind/kaldi_frontend/feature_common.h b/paddlespeech/audio/csrc/pybind/kaldi_frontend/feature_common.h new file mode 100644 index 000000000..41850cbe0 --- /dev/null +++ b/paddlespeech/audio/csrc/pybind/kaldi_frontend/feature_common.h @@ -0,0 +1,52 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "feat/feature-window.h" +#include +#include + +namespace paddleaudio { + +namespace py = pybind11; + +template +class StreamingFeatureTpl { + public: + typedef typename F::Options Options; + StreamingFeatureTpl(const Options& opts); + bool ComputeFeature(const kaldi::VectorBase& wav, + kaldi::Vector* feats); + void Reset() { + remained_wav_.Resize(0); + } + + int Dim() { + return computer_.Dim(); + } + + private: + bool Compute(const kaldi::Vector& waves, + kaldi::Vector* feats); + Options opts_; + kaldi::FeatureWindowFunction window_function_; + kaldi::Vector remained_wav_; + F computer_; +}; + +} // namespace paddleaudio + +#include "feature_common_inl.h" + diff --git a/paddlespeech/audio/csrc/pybind/kaldi_frontend/feature_common_inl.h b/paddlespeech/audio/csrc/pybind/kaldi_frontend/feature_common_inl.h new file mode 100644 index 000000000..db45b26a2 --- /dev/null +++ b/paddlespeech/audio/csrc/pybind/kaldi_frontend/feature_common_inl.h @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" + +namespace paddleaudio { + +template +StreamingFeatureTpl::StreamingFeatureTpl( + const Options& opts) + : opts_(opts), computer_(opts), + window_function_(opts.frame_opts) { + //window_function_(computer_.GetFrameOptions()) { the opt set to zero +} + +template +bool StreamingFeatureTpl::ComputeFeature(const kaldi::VectorBase& wav, + kaldi::Vector* feats) { + // append remaining waves + kaldi::int32 wav_len = wav.Dim(); + if (wav_len == 0) return false; + kaldi::int32 left_len = remained_wav_.Dim(); + kaldi::Vector waves(left_len + wav_len); + waves.Range(0, left_len).CopyFromVec(remained_wav_); + waves.Range(left_len, wav_len).CopyFromVec(wav); + + // cache remaining waves + kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions(); + kaldi::int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts); + kaldi::int32 frame_shift = frame_opts.WindowShift(); + kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames; + remained_wav_.Resize(left_samples); + remained_wav_.CopyFromVec( + waves.Range(frame_shift * num_frames, left_samples)); + + // compute speech feature + Compute(waves, feats); + return true; +} + +// Compute feat +template +bool StreamingFeatureTpl::Compute( + const kaldi::Vector& waves, + kaldi::Vector* feats) { + kaldi::BaseFloat vtln_warp = 1.0; + const kaldi::FrameExtractionOptions& frame_opts = + computer_.GetFrameOptions(); + kaldi::int32 num_samples = waves.Dim(); + kaldi::int32 frame_length = frame_opts.WindowSize(); + kaldi::int32 sample_rate = frame_opts.samp_freq; + if 
(num_samples < frame_length) { + return false; + } + + kaldi::int32 num_frames = kaldi::NumFrames(num_samples, frame_opts); + feats->Resize(num_frames * Dim()); + + kaldi::Vector window; + bool need_raw_log_energy = computer_.NeedRawLogEnergy(); + for (kaldi::int32 frame = 0; frame < num_frames; frame++) { + kaldi::BaseFloat raw_log_energy = 0.0; + kaldi::ExtractWindow(0, + waves, + frame, + frame_opts, + window_function_, + &window, + need_raw_log_energy ? &raw_log_energy : NULL); + + kaldi::Vector this_feature(computer_.Dim(), + kaldi::kUndefined); + computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature); + kaldi::SubVector output_row( + feats->Data() + frame * Dim(), Dim()); + output_row.CopyFromVec(this_feature); + } + return true; +} + +} // namespace paddleaudio diff --git a/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature.cc b/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature.cc new file mode 100644 index 000000000..21e71d8d0 --- /dev/null +++ b/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature.cc @@ -0,0 +1,143 @@ + +#include +#include + +#include "kaldi_feature_wrapper.h" + +namespace py=pybind11; + +bool InitFbank( + float samp_freq, // frame opts + float frame_shift_ms, + float frame_length_ms, + float dither, + float preemph_coeff, + bool remove_dc_offset, + std::string window_type, // e.g. 
Hamming window + bool round_to_power_of_two, + float blackman_coeff, + bool snip_edges, + bool allow_downsample, + bool allow_upsample, + int max_feature_vectors, + int num_bins, // mel opts + float low_freq, + float high_freq, + float vtln_low, + float vtln_high, + bool debug_mel, + bool htk_mode, + bool use_energy, // fbank opts + float energy_floor, + bool raw_energy, + bool htk_compat, + bool use_log_fbank, + bool use_power) { + kaldi::FbankOptions opts; + opts.frame_opts.samp_freq = samp_freq; // frame opts + opts.frame_opts.frame_shift_ms = frame_shift_ms; + opts.frame_opts.frame_length_ms = frame_length_ms; + opts.frame_opts.dither = dither; + opts.frame_opts.preemph_coeff = preemph_coeff; + opts.frame_opts.remove_dc_offset = remove_dc_offset; + opts.frame_opts.window_type = window_type; + opts.frame_opts.round_to_power_of_two = round_to_power_of_two; + opts.frame_opts.blackman_coeff = blackman_coeff; + opts.frame_opts.snip_edges = snip_edges; + opts.frame_opts.allow_downsample = allow_downsample; + opts.frame_opts.allow_upsample = allow_upsample; + opts.frame_opts.max_feature_vectors = max_feature_vectors; + + opts.mel_opts.num_bins = num_bins; // mel opts + opts.mel_opts.low_freq = low_freq; + opts.mel_opts.high_freq = high_freq; + opts.mel_opts.vtln_low = vtln_low; + opts.mel_opts.vtln_high = vtln_high; + opts.mel_opts.debug_mel = debug_mel; + opts.mel_opts.htk_mode = htk_mode; + + opts.use_energy = use_energy; // fbank opts + opts.energy_floor = energy_floor; + opts.raw_energy = raw_energy; + opts.htk_compat = htk_compat; + opts.use_log_fbank = use_log_fbank; + opts.use_power = use_power; + paddleaudio::KaldiFeatureWrapper::GetInstance()->InitFbank(opts); + return true; +} + +py::array_t ComputeFbankStreaming(const py::array_t& wav) { + return paddleaudio::KaldiFeatureWrapper::GetInstance()->ComputeFbank(wav); +} + +py::array_t ComputeFbank( + float samp_freq, // frame opts + float frame_shift_ms, + float frame_length_ms, + float dither, + float 
preemph_coeff, + bool remove_dc_offset, + std::string window_type, // e.g. Hamming window + bool round_to_power_of_two, + float blackman_coeff, + bool snip_edges, + bool allow_downsample, + bool allow_upsample, + int max_feature_vectors, + int num_bins, // mel opts + float low_freq, + float high_freq, + float vtln_low, + float vtln_high, + bool debug_mel, + bool htk_mode, + bool use_energy, // fbank opts + float energy_floor, + bool raw_energy, + bool htk_compat, + bool use_log_fbank, + bool use_power, + const py::array_t& wav) { + InitFbank(samp_freq, // frame opts + frame_shift_ms, + frame_length_ms, + dither, + preemph_coeff, + remove_dc_offset, + window_type, // e.g. Hamming window + round_to_power_of_two, + blackman_coeff, + snip_edges, + allow_downsample, + allow_upsample, + max_feature_vectors, + num_bins, // mel opts + low_freq, + high_freq, + vtln_low, + vtln_high, + debug_mel, + htk_mode, + use_energy, // fbank opts + energy_floor, + raw_energy, + htk_compat, + use_log_fbank, + use_power); + py::array_t result = ComputeFbankStreaming(wav); + paddleaudio::KaldiFeatureWrapper::GetInstance()->ResetFbank(); + return result; +} + + +void ResetFbank() { + paddleaudio::KaldiFeatureWrapper::GetInstance()->ResetFbank(); +} + +PYBIND11_MODULE(kaldi_featurepy, m) { + m.doc() = "kaldi_feature example"; + m.def("InitFbank", &InitFbank, "init fbank"); + m.def("ResetFbank", &ResetFbank, "reset fbank"); + m.def("ComputeFbank", &ComputeFbank, "compute fbank"); + m.def("ComputeFbankStreaming", &ComputeFbankStreaming, "compute fbank streaming"); +} diff --git a/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature.h b/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature.h new file mode 100644 index 000000000..b24416b18 --- /dev/null +++ b/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature.h @@ -0,0 +1,71 @@ +#include +#include + +#include "kaldi_feature_wrapper.h" + +namespace py=pybind11; + +bool InitFbank( + float samp_freq, // frame opts + float 
frame_shift_ms, + float frame_length_ms, + float dither, + float preemph_coeff, + bool remove_dc_offset, + std::string window_type, // e.g. Hamming window + bool round_to_power_of_two, + float blackman_coeff, + bool snip_edges, + bool allow_downsample, + bool allow_upsample, + int max_feature_vectors, + int num_bins, // mel opts + float low_freq, + float high_freq, + float vtln_low, + float vtln_high, + bool debug_mel, + bool htk_mode, + bool use_energy, // fbank opts + float energy_floor, + bool raw_energy, + bool htk_compat, + bool use_log_fbank, + bool use_power); + +py::array_t ComputeFbank( + float samp_freq, // frame opts + float frame_shift_ms, + float frame_length_ms, + float dither, + float preemph_coeff, + bool remove_dc_offset, + std::string window_type, // e.g. Hamming window + bool round_to_power_of_two, + kaldi::BaseFloat blackman_coeff, + bool snip_edges, + bool allow_downsample, + bool allow_upsample, + int max_feature_vectors, + int num_bins, // mel opts + float low_freq, + float high_freq, + float vtln_low, + float vtln_high, + bool debug_mel, + bool htk_mode, + bool use_energy, // fbank opts + float energy_floor, + bool raw_energy, + bool htk_compat, + bool use_log_fbank, + bool use_power, + const py::array_t& wav); + +py::array_t ComputeFbankStreaming(const py::array_t& wav); + +void ResetFbank(); + +py::array_t ComputeFbankStreaming(const py::array_t& wav); + +py::array_t TestFun(const py::array_t& wav); diff --git a/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature_wrapper.cc b/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature_wrapper.cc new file mode 100644 index 000000000..a965fd6a2 --- /dev/null +++ b/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature_wrapper.cc @@ -0,0 +1,57 @@ +#include "kaldi_feature_wrapper.h" + +namespace paddleaudio { + +KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() { + static KaldiFeatureWrapper instance; + return &instance; +} + +bool 
KaldiFeatureWrapper::InitFbank(kaldi::FbankOptions opts) { + fbank_.reset(new Fbank(opts)); + return true; +} + +py::array_t KaldiFeatureWrapper::ComputeFbank(const py::array_t wav) { + py::buffer_info info = wav.request(); + kaldi::Vector input_wav(info.size); + double* wav_ptr = (double*)info.ptr; + for (int idx = 0; idx < info.size; ++idx) { + input_wav(idx) = *wav_ptr; + wav_ptr++; + } + + + kaldi::Vector feats; + bool flag = fbank_->ComputeFeature(input_wav, &feats); + if (flag == false || feats.Dim() == 0) return py::array_t(); + auto result = py::array_t(feats.Dim()); + py::buffer_info xs = result.request(); + for (int idx = 0; idx < 10; ++idx) { + float val = feats(idx); + std::cout << val << " "; + } + std::cout << std::endl; + double* res_ptr = (double*)xs.ptr; + for (int idx = 0; idx < feats.Dim(); ++idx) { + *res_ptr = feats(idx); + res_ptr++; + } + + return result.reshape({ feats.Dim() / Dim(), Dim()}); +/* + py::buffer_info info = wav.request(); + std::cout << info.size << std::endl; + auto result = py::array_t(info.size); + //kaldi::Vector input_wav(info.size); + kaldi::Vector input_wav(info.size); + py::buffer_info info_re = result.request(); + + memcpy(input_wav.Data(), (double*)info.ptr, wav.nbytes()); + memcpy((double*)info_re.ptr, input_wav.Data(), input_wav.Dim()* sizeof(double)); + return result; +*/ +} + + +} // namespace paddleaudio diff --git a/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature_wrapper.h b/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature_wrapper.h new file mode 100644 index 000000000..06e49093d --- /dev/null +++ b/paddlespeech/audio/csrc/pybind/kaldi_frontend/kaldi_feature_wrapper.h @@ -0,0 +1,28 @@ + +#include "base/kaldi-common.h" +#include "feature_common.h" +#include "feat/feature-fbank.h" + +#pragma once + +namespace paddleaudio { + +typedef StreamingFeatureTpl Fbank; + +class KaldiFeatureWrapper { + public: + static KaldiFeatureWrapper* GetInstance(); + bool InitFbank(kaldi::FbankOptions opts); + 
py::array_t ComputeFbank(const py::array_t wav); + int Dim() { + return fbank_->Dim(); + } + void ResetFbank() { + fbank_->Reset(); + } + + private: + std::unique_ptr fbank_; +}; + +} // namespace paddleaudio diff --git a/paddlespeech/audio/third_party/CMakeLists.txt b/paddlespeech/audio/third_party/CMakeLists.txt index 3eeb1a972..eac8c31b8 100644 --- a/paddlespeech/audio/third_party/CMakeLists.txt +++ b/paddlespeech/audio/third_party/CMakeLists.txt @@ -6,3 +6,9 @@ if (BUILD_SOX) target_include_directories(libsox INTERFACE ${SOX_INCLUDE_DIR}) target_link_libraries(libsox INTERFACE ${SOX_LIBRARIES}) endif() + +include_directories( +${CMAKE_CURRENT_SOURCE_DIR} +${CMAKE_CURRENT_SOURCE_DIR}/kaldi +) +add_subdirectory(kaldi) diff --git a/paddlespeech/audio/third_party/kaldi/CMakeLists.txt b/paddlespeech/audio/third_party/kaldi/CMakeLists.txt new file mode 100644 index 000000000..d68540cb4 --- /dev/null +++ b/paddlespeech/audio/third_party/kaldi/CMakeLists.txt @@ -0,0 +1,60 @@ +project(kaldi) + + +add_library(kaldi-base + base/io-funcs.cc + base/kaldi-error.cc + base/kaldi-math.cc + base/kaldi-utils.cc + base/timer.cc +) + +add_library(kaldi-util + util/kaldi-holder.cc + util/kaldi-io.cc + util/kaldi-semaphore.cc + util/kaldi-table.cc + util/kaldi-thread.cc + util/parse-options.cc + util/simple-io-funcs.cc + util/simple-options.cc + util/text-utils.cc +) +target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix) + +add_library(kaldi-mfcc + feat/feature-mfcc.cc +) +target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common) + +add_library(kaldi-fbank + feat/feature-fbank.cc +) +target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common) + +add_library(kaldi-feat-common + feat/wave-reader.cc + feat/signal.cc + feat/feature-functions.cc + feat/feature-window.cc + feat/resample.cc + feat/mel-computations.cc + feat/cmvn.cc +) +target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util) + +add_library(kaldi-matrix + matrix/compressed-matrix.cc + 
matrix/kaldi-matrix.cc + matrix/kaldi-vector.cc + matrix/matrix-functions.cc + matrix/optimization.cc + matrix/packed-matrix.cc + matrix/qr.cc + matrix/sparse-matrix.cc + matrix/sp-matrix.cc + matrix/srfft.cc + matrix/tp-matrix.cc +) +target_link_libraries(kaldi-matrix gfortran kaldi-base libopenblas.a) + diff --git a/paddlespeech/audio/third_party/kaldi/base b/paddlespeech/audio/third_party/kaldi/base new file mode 120000 index 000000000..cf286c165 --- /dev/null +++ b/paddlespeech/audio/third_party/kaldi/base @@ -0,0 +1 @@ +../../../../speechx/speechx/kaldi/base \ No newline at end of file diff --git a/paddlespeech/audio/third_party/kaldi/feat b/paddlespeech/audio/third_party/kaldi/feat new file mode 120000 index 000000000..796991243 --- /dev/null +++ b/paddlespeech/audio/third_party/kaldi/feat @@ -0,0 +1 @@ +../../../../speechx/speechx/kaldi/feat \ No newline at end of file diff --git a/paddlespeech/audio/third_party/kaldi/matrix b/paddlespeech/audio/third_party/kaldi/matrix new file mode 120000 index 000000000..184fa3233 --- /dev/null +++ b/paddlespeech/audio/third_party/kaldi/matrix @@ -0,0 +1 @@ +../../../../speechx/speechx/kaldi/matrix \ No newline at end of file diff --git a/paddlespeech/audio/third_party/kaldi/util b/paddlespeech/audio/third_party/kaldi/util new file mode 120000 index 000000000..f3017b602 --- /dev/null +++ b/paddlespeech/audio/third_party/kaldi/util @@ -0,0 +1 @@ +../../../../speechx/speechx/kaldi/util \ No newline at end of file diff --git a/speechx/speechx/kaldi/base/kaldi-types.h b/speechx/speechx/kaldi/base/kaldi-types.h index 4fa8f224b..23ef4c4c5 100644 --- a/speechx/speechx/kaldi/base/kaldi-types.h +++ b/speechx/speechx/kaldi/base/kaldi-types.h @@ -42,6 +42,8 @@ typedef float BaseFloat; // for discussion on what to do if you need compile kaldi // without OpenFST, see the bottom of this this file +#ifndef COMPILE_WITHOUT_OPENFST + #include namespace kaldi { @@ -55,9 +57,10 @@ namespace kaldi { typedef double double64; } // end 
namespace kaldi +#else // In a theoretical case you decide compile Kaldi without the OpenFST // comment the previous namespace statement and uncomment the following -/* + namespace kaldi { typedef int8_t int8; typedef int16_t int16; @@ -71,6 +74,6 @@ namespace kaldi { typedef float float32; typedef double double64; } // end namespace kaldi -*/ +#endif #endif // KALDI_BASE_KALDI_TYPES_H_