[audio] mv paddlespeech/audio to paddleaudio (#2706)
* split paddlespeech/audio to paddleaudio. * add sox io ,sox effect, kaldi native fbank to paddleaudio.pull/2733/head
parent
0cc54bb785
commit
42ff946007
@ -0,0 +1,70 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)

# CMP0010: treat bad variable reference syntax as an error.
cmake_policy(SET CMP0010 NEW)

# Use compiler ID "AppleClang" instead of "Clang" for XCode.
# Not setting this sometimes makes XCode C compiler gets detected as "Clang",
# even when the C++ one is detected as "AppleClang".
cmake_policy(SET CMP0025 NEW)

# Suppress warning flags in default MSVC configuration.  It's not
# mandatory that we do this (and we don't if cmake is old), but it's
# nice when it's possible, and it's possible on our Windows configs.
if(NOT CMAKE_VERSION VERSION_LESS 3.15.0)
  cmake_policy(SET CMP0092 NEW)
endif()

project(paddleaudio)

# check and set CMAKE_CXX_STANDARD
string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
if(env_cxx_standard GREATER -1)
  # message() joins its arguments without a separator, so the first string
  # carries the trailing space that keeps the two sentences apart.
  message(
    WARNING
      "C++ standard version definition detected in environment variable. "
      "paddleaudio requires -std=c++14. Please remove -std=c++ settings in your environment.")
endif()

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_C_STANDARD 11)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_VERBOSE_MAKEFILE ON)

# Options
option(BUILD_SOX "Build libsox statically" ON)
option(BUILD_MAD "Enable libmad" ON)
option(BUILD_KALDI "Build kaldi statically" ON)
option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON)

# cmake: make the project's own modules (Find*.cmake, openblas.cmake, ...)
# visible to include() and find_package().
list(APPEND CMAKE_MODULE_PATH
     "${PROJECT_SOURCE_DIR}/cmake"
     "${PROJECT_SOURCE_DIR}/cmake/external")

if(NOT MSVC)
  # find_package() runs FindGFortranLibs.cmake in this scope, so its result
  # variables (GFORTRAN_LIBRARIES_DIR, LIBGFORTRAN_LIBRARIES, ...) are already
  # available here; the module does not need to be include()d a second time.
  find_package(GFortranLibs REQUIRED)
  include(FortranCInterface)
endif()

# fc_patch dir: shared download/build location for third-party sources.
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})
set(THIRD_PARTY_PATH ${fc_patch})

include(openblas)

set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
include(cmake/pybind.cmake)
# NOTE(review): PYTHON_INCLUDE_DIR is not set anywhere in this file; it is
# presumably passed on the command line by the Python build driver - confirm.
include_directories(${PYTHON_INCLUDE_DIR})

# packages
find_package(Python3 COMPONENTS Interpreter Development)

# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g")
add_subdirectory(paddleaudio)

# Summary
include(cmake/summary.cmake)
onnx_print_configuration_summary()
|
@ -0,0 +1,153 @@
|
|||||||
|
#.rst:
|
||||||
|
# FindGFortranLibs
|
||||||
|
# --------
|
||||||
|
# https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake
|
||||||
|
# https://enccs.github.io/cmake-workshop/cxx-fortran/
|
||||||
|
#
|
||||||
|
# Find gcc Fortran compiler & library paths
|
||||||
|
#
|
||||||
|
# The module defines the following variables:
|
||||||
|
#
|
||||||
|
# ::
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# GFORTRANLIBS_FOUND - true if system has gfortran
|
||||||
|
# LIBGFORTRAN_LIBRARIES - path to libgfortran
|
||||||
|
# LIBQUADMATH_LIBRARIES - path to libquadmath
|
||||||
|
# GFORTRAN_LIBRARIES_DIR - directory containing libgfortran, libquadmath
|
||||||
|
# GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers
|
||||||
|
# LIBGOMP_LIBRARIES - path to libgomp
|
||||||
|
# LIBGOMP_INCLUDE_DIR - directory containing omp.h header
|
||||||
|
# GFORTRAN_VERSION_STRING - version of gfortran found
|
||||||
|
#
|
||||||
|
# Honour find_package(GFortranLibs QUIET): find_package() publishes the QUIET
# request as <PackageName>_FIND_QUIETLY, i.e. GFortranLibs_FIND_QUIETLY here.
set(CMAKE_REQUIRED_QUIET ${GFortranLibs_FIND_QUIETLY})

if(NOT CMAKE_REQUIRED_QUIET)
  message(STATUS "Looking for gfortran related libraries...")
endif()
|
||||||
|
|
||||||
|
enable_language(Fortran)
|
||||||
|
if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
|
||||||
|
|
||||||
|
# Basically, call "gfortran -v" to dump compiler info to the string
|
||||||
|
# GFORTRAN_VERBOSE_STR, which will be used to get necessary paths
|
||||||
|
message(STATUS "Extracting library and header information by calling 'gfortran -v'...")
|
||||||
|
execute_process(COMMAND "${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE
|
||||||
|
GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG)
|
||||||
|
|
||||||
|
# For debugging
|
||||||
|
message(STATUS "'gfortran -v' returned:")
|
||||||
|
message(STATUS "${GFORTRAN_VERBOSE_STR}")
|
||||||
|
|
||||||
|
# Detect gfortran version
|
||||||
|
string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}")
|
||||||
|
string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}")
|
||||||
|
message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}")
|
||||||
|
unset(GFORTRAN_VER_STR)
|
||||||
|
|
||||||
|
set(MATCH_REGEX "[^\t\n ]+[\t\n ]+")
|
||||||
|
set(REPLACE_REGEX "([^\t\n ]+)")
|
||||||
|
|
||||||
|
# Find architecture for compiler
|
||||||
|
string(REGEX MATCH "Target: [^\t\n ]+"
|
||||||
|
GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}")
|
||||||
|
message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}")
|
||||||
|
string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1"
|
||||||
|
GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}")
|
||||||
|
message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}")
|
||||||
|
unset(GFORTRAN_ARCH_STR)
|
||||||
|
|
||||||
|
# Find install prefix, if it exists; if not, use default
|
||||||
|
string(REGEX MATCH "--prefix=[^\t\n ]+[\t\n ]+"
|
||||||
|
GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
|
||||||
|
if(NOT GFORTRAN_PREFIX_STR)
|
||||||
|
message(STATUS "Detected default gfortran prefix")
|
||||||
|
set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install
|
||||||
|
else()
|
||||||
|
string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1"
|
||||||
|
GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}")
|
||||||
|
endif()
|
||||||
|
message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}")
|
||||||
|
unset(GFORTRAN_PREFIX_STR)
|
||||||
|
|
||||||
|
# Find install exec-prefix, if it exists; if not, use default.
# string(REGEX MATCH <regex> <out-var> <input>...) takes the output variable
# directly after the regex.  The match deliberately excludes trailing
# whitespace so the extracted directory needs no further stripping.
string(REGEX MATCH "--exec-prefix=[^\t\n ]+"
       GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
if(NOT GFORTRAN_EXEC_PREFIX_STR)
  message(STATUS "Detected default gfortran exec-prefix")
  # Without an explicit --exec-prefix, fall back to the install prefix.
  set(GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_PREFIX_DIR}")
else()
  string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1"
         GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}")
endif()
message(STATUS "Detected gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}")
unset(GFORTRAN_EXEC_PREFIX_STR)
|
||||||
|
|
||||||
|
# Find library directory and include directory, if library directory specified
|
||||||
|
string(REGEX MATCH "--libdir=[^\t\n ]+"
|
||||||
|
GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}")
|
||||||
|
if(NOT GFORTRAN_LIB_DIR_STR)
|
||||||
|
message(STATUS "Found --libdir flag -- not found")
|
||||||
|
message(STATUS "Using default gfortran library & include directory paths")
|
||||||
|
string(STRIP ${GFORTRAN_PREFIX_DIR} TMPLIBDIR)
|
||||||
|
set(GFORTRAN_LIBRARIES_DIR "${TMPLIBDIR}/lib64")
|
||||||
|
set(GFORTRAN_INCLUDE_DIR "${TMPLIBDIR}/include")
|
||||||
|
else()
|
||||||
|
message(STATUS "Found --libdir flag -- yes")
|
||||||
|
string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1"
|
||||||
|
GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}")
|
||||||
|
string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include")
|
||||||
|
endif()
|
||||||
|
message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}")
|
||||||
|
message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}")
|
||||||
|
unset(GFORTRAN_LIB_DIR_STR)
|
||||||
|
|
||||||
|
# There are lots of other build options for gcc & gfortran. For now, the
|
||||||
|
# options implemented above should cover a lot of common use cases.
|
||||||
|
|
||||||
|
# Clean up be deleting the output string from "gfortran -v"
|
||||||
|
unset(GFORTRAN_VERBOSE_STR)
|
||||||
|
|
||||||
|
# Find paths for libgfortran, libquadmath, libgomp
|
||||||
|
# libgomp needed for OpenMP support without Clang
|
||||||
|
find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran
|
||||||
|
HINTS ${GFORTRAN_LIBRARIES_DIR})
|
||||||
|
find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath
|
||||||
|
HINTS ${GFORTRAN_LIBRARIES_DIR})
|
||||||
|
find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp
|
||||||
|
HINTS ${GFORTRAN_LIBRARIES_DIR})
|
||||||
|
|
||||||
|
# Find OpenMP headers
|
||||||
|
find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR})
|
||||||
|
|
||||||
|
else()
|
||||||
|
message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
|
||||||
|
# Required: libgfortran, libquadmath, path for gfortran libraries
|
||||||
|
# Optional: libgomp, path for OpenMP headers, path for gcc/gfortran headers
|
||||||
|
find_package_handle_standard_args(GFortranLibs
|
||||||
|
REQUIRED_VARS LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES GFORTRAN_LIBRARIES_DIR
|
||||||
|
VERSION_VAR GFORTRAN_VERSION_STRING)
|
||||||
|
|
||||||
|
if(GFORTRANLIBS_FOUND)
|
||||||
|
message(STATUS "Looking for gfortran libraries -- found")
|
||||||
|
message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}")
|
||||||
|
else()
|
||||||
|
message(STATUS "Looking for gfortran libraries -- not found")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
mark_as_advanced(LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES
|
||||||
|
LIBGOMP_LIBRARIES LIBGOMP_INCLUDE_DIR
|
||||||
|
GFORTRAN_LIBRARIES_DIR GFORTRAN_INCLUDE_DIR)
|
||||||
|
# FindGFortranLIBS.cmake ends here
|
||||||
|
|
||||||
|
|
||||||
|
message(STATUS LIBGFORTRAN_LIBRARIES= ${LIBGFORTRAN_LIBRARIES})
|
||||||
|
message(STATUS LIBQUADMATH_LIBRARIES= ${LIBQUADMATH_LIBRARIES})
|
||||||
|
message(STATUS LIBGOMP_LIBRARIES= ${LIBGOMP_LIBRARIES})
|
||||||
|
message(STATUS LIBGOMP_INCLUDE_DIR= ${LIBGOMP_INCLUDE_DIR})
|
||||||
|
message(STATUS GFORTRAN_LIBRARIES_DIR= ${GFORTRAN_LIBRARIES_DIR})
|
||||||
|
message(STATUS GFORTRAN_INCLUDE_DIR= ${GFORTRAN_INCLUDE_DIR})
|
@ -0,0 +1,119 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
include(ExternalProject)
|
||||||
|
|
||||||
|
set(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas)
|
||||||
|
set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
|
||||||
|
set(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
|
||||||
|
set(CBLAS_TAG v0.3.10)
|
||||||
|
|
||||||
|
if(NOT WIN32)
|
||||||
|
set(CBLAS_LIBRARIES
|
||||||
|
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
|
||||||
|
CACHE FILEPATH "openblas library." FORCE)
|
||||||
|
set(CBLAS_INC_DIR
|
||||||
|
"${CBLAS_INSTALL_DIR}/include"
|
||||||
|
CACHE PATH "openblas include directory." FORCE)
|
||||||
|
set(OPENBLAS_CC
|
||||||
|
"${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
|
||||||
|
|
||||||
|
if(APPLE)
|
||||||
|
set(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
|
||||||
|
endif()
|
||||||
|
set(OPTIONAL_ARGS "")
|
||||||
|
set(COMMON_ARGS "")
|
||||||
|
|
||||||
|
if(APPLE)
|
||||||
|
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
|
||||||
|
set(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
|
||||||
|
endif()
|
||||||
|
set(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Number of parallel make jobs.  A caller-provided NPROC wins; otherwise ask
# CMake for the processor count so that we never run a bare "make -j", which
# spawns an unlimited number of jobs.
if(NPROC)
  set(OPENBLAS_BUILD_JOBS ${NPROC})
else()
  include(ProcessorCount)
  ProcessorCount(OPENBLAS_BUILD_JOBS)
  if(OPENBLAS_BUILD_JOBS EQUAL 0)
    # ProcessorCount reports 0 when it cannot determine the count.
    set(OPENBLAS_BUILD_JOBS 1)
  endif()
endif()

# Download the OpenBLAS source archive and build it in-tree with plain make;
# the static library is installed under CBLAS_INSTALL_DIR.
ExternalProject_Add(
  OPENBLAS
  URL "https://paddleaudio.bj.bcebos.com/build/OpenBLAS-0.3.10.zip"
  DOWNLOAD_DIR ${CBLAS_PREFIX_DIR}
  SOURCE_DIR ${CBLAS_PREFIX_DIR}
  INSTALL_DIR ${CBLAS_INSTALL_DIR}
  BUILD_IN_SOURCE 1
  BUILD_COMMAND make -j${OPENBLAS_BUILD_JOBS} ${COMMON_ARGS} ${OPTIONAL_ARGS}
  INSTALL_COMMAND make install PREFIX=<INSTALL_DIR>
  UPDATE_COMMAND ""
  CONFIGURE_COMMAND ""
  # Declared so generators such as Ninja know which step produces the library.
  BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
|
||||||
|
|
||||||
|
ExternalProject_Get_Property(OPENBLAS INSTALL_DIR)
|
||||||
|
set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR})
|
||||||
|
add_library(openblas STATIC IMPORTED)
|
||||||
|
add_dependencies(openblas OPENBLAS)
|
||||||
|
set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran)
|
||||||
|
set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a)
|
||||||
|
|
||||||
|
link_directories(${OpenBLAS_INSTALL_PREFIX}/lib)
|
||||||
|
include_directories(${OpenBLAS_INSTALL_PREFIX}/include)
|
||||||
|
|
||||||
|
set(OPENBLAS_LIBRARIES
|
||||||
|
${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a
|
||||||
|
)
|
||||||
|
|
||||||
|
add_library(libopenblas INTERFACE)
|
||||||
|
add_dependencies(libopenblas openblas)
|
||||||
|
target_include_directories(libopenblas INTERFACE ${OpenBLAS_INSTALL_PREFIX}/include/openblas)
|
||||||
|
target_link_libraries(libopenblas INTERFACE ${OPENBLAS_LIBRARIES})
|
||||||
|
else()
|
||||||
|
set(CBLAS_LIBRARIES
|
||||||
|
"${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
|
||||||
|
CACHE FILEPATH "openblas library." FORCE)
|
||||||
|
set(CBLAS_INC_DIR
|
||||||
|
"${CBLAS_INSTALL_DIR}/include/openblas"
|
||||||
|
CACHE PATH "openblas include directory." FORCE)
|
||||||
|
ExternalProject_Add(
|
||||||
|
extern_openblas
|
||||||
|
${EXTERNAL_PROJECT_LOG_ARGS}
|
||||||
|
GIT_REPOSITORY ${CBLAS_REPOSITORY}
|
||||||
|
GIT_TAG ${CBLAS_TAG}
|
||||||
|
PREFIX ${CBLAS_PREFIX_DIR}
|
||||||
|
INSTALL_DIR ${CBLAS_INSTALL_DIR}
|
||||||
|
BUILD_IN_SOURCE 0
|
||||||
|
UPDATE_COMMAND ""
|
||||||
|
CMAKE_ARGS -DCMAKE_C_COMPILER=clang-cl
|
||||||
|
-DCMAKE_CXX_COMPILER=clang-cl
|
||||||
|
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
|
||||||
|
-DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
|
||||||
|
-DCMAKE_BUILD_TYPE=Release #${THIRD_PARTY_BUILD_TYPE}
|
||||||
|
-DCMAKE_MT=mt
|
||||||
|
-DUSE_THREAD=OFF
|
||||||
|
-DBUILD_WITHOUT_LAPACK=NO
|
||||||
|
-DCMAKE_Fortran_COMPILER=flang
|
||||||
|
-DNOFORTRAN=0
|
||||||
|
-DDYNAMIC_ARCH=ON
|
||||||
|
#${EXTERNAL_OPTIONAL_ARGS}
|
||||||
|
CMAKE_CACHE_ARGS
|
||||||
|
-DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
|
||||||
|
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
|
||||||
|
-DCMAKE_BUILD_TYPE:STRING=Release #${THIRD_PARTY_BUILD_TYPE}
|
||||||
|
# ninja need to know where openblas.lib comes from
|
||||||
|
BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
|
||||||
|
set(OPENBLAS_SHARED_LIB
|
||||||
|
${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
|
||||||
|
|
||||||
|
add_library(openblas INTERFACE)
|
||||||
|
add_dependencies(openblas extern_openblas)
|
||||||
|
include_directories(${CBLAS_INC_DIR})
|
||||||
|
link_libraries(${CBLAS_LIBRARIES})
|
||||||
|
endif()
|
||||||
|
|
@ -0,0 +1,42 @@
|
|||||||
|
#the pybind11 is from:https://github.com/pybind/pybind11
|
||||||
|
# Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
|
||||||
|
|
||||||
|
# Fetch pybind11 as a zip archive into FETCHCONTENT_BASE_DIR, unpack it to
# ${FETCHCONTENT_BASE_DIR}/pybind11 and expose its headers.
set(PYBIND_ZIP "v2.10.0.zip")
set(LOCAL_PYBIND_ZIP ${FETCHCONTENT_BASE_DIR}/${PYBIND_ZIP})
set(PYBIND_SRC ${FETCHCONTENT_BASE_DIR}/pybind11)
set(DOWNLOAD_URL "https://paddleaudio.bj.bcebos.com/build/v2.10.0.zip")
set(PYBIND_TIMEOUT 600 CACHE STRING "Timeout in seconds when downloading pybind.")

if(NOT EXISTS "${LOCAL_PYBIND_ZIP}")
  file(DOWNLOAD ${DOWNLOAD_URL}
       "${LOCAL_PYBIND_ZIP}"
       TIMEOUT ${PYBIND_TIMEOUT}
       STATUS ERR
       SHOW_PROGRESS)

  # STATUS is a two-element list: numeric code, then a human-readable message.
  list(GET ERR 0 pybind_download_code)
  list(GET ERR 1 pybind_download_message)

  if(pybind_download_code EQUAL 0)
    message(STATUS "download pybind success")
  else()
    # Drop the partial/empty file, otherwise the EXISTS check above would skip
    # the download on the next configure run and the unpack step would fail.
    file(REMOVE "${LOCAL_PYBIND_ZIP}")
    message(FATAL_ERROR "download pybind fail: ${pybind_download_message}")
  endif()
endif()

if(NOT EXISTS "${PYBIND_SRC}")
  execute_process(
    COMMAND ${CMAKE_COMMAND} -E tar xfz "${LOCAL_PYBIND_ZIP}"
    WORKING_DIRECTORY "${FETCHCONTENT_BASE_DIR}"
    RESULT_VARIABLE tar_result)

  # Check the extraction result before touching the extracted tree, so a
  # failed unpack is reported as such instead of as a rename error.
  if(NOT "${tar_result}" STREQUAL "0")
    message(FATAL_ERROR "unzip pybind fail")
  endif()

  # The archive unpacks to pybind11-2.10.0; give it a version-independent name.
  file(RENAME "${FETCHCONTENT_BASE_DIR}/pybind11-2.10.0" "${PYBIND_SRC}")
  message(STATUS "unzip pybind success")
endif()

include_directories(${PYBIND_SRC}/include)
|
@ -0,0 +1,45 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
# Prints accumulated ONNX configuration summary
|
||||||
|
# Prints a configuration summary at the end of the configure step.
# Takes no arguments; reads the variables visible at the call site.
# The ONNX/Protobuf entries are inherited from the ONNX project this helper
# was taken from and print empty values unless those variables are defined.
function(onnx_print_configuration_summary)
  message(STATUS "")
  message(STATUS "******** Summary ********")
  message(STATUS "  CMake version             : ${CMAKE_VERSION}")
  message(STATUS "  CMake command             : ${CMAKE_COMMAND}")
  message(STATUS "  System                    : ${CMAKE_SYSTEM_NAME}")
  message(STATUS "  C++ compiler              : ${CMAKE_CXX_COMPILER}")
  message(STATUS "  C++ compiler version      : ${CMAKE_CXX_COMPILER_VERSION}")
  message(STATUS "  CXX flags                 : ${CMAKE_CXX_FLAGS}")
  message(STATUS "  Build type                : ${CMAKE_BUILD_TYPE}")
  get_directory_property(tmp DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)
  message(STATUS "  Compile definitions       : ${tmp}")
  message(STATUS "  CMAKE_PREFIX_PATH         : ${CMAKE_PREFIX_PATH}")
  message(STATUS "  CMAKE_INSTALL_PREFIX      : ${CMAKE_INSTALL_PREFIX}")
  message(STATUS "  CMAKE_MODULE_PATH         : ${CMAKE_MODULE_PATH}")
  message(STATUS "")
  message(STATUS "  ONNX version              : ${ONNX_VERSION}")
  message(STATUS "  ONNX NAMESPACE            : ${ONNX_NAMESPACE}")
  message(STATUS "  ONNX_USE_LITE_PROTO       : ${ONNX_USE_LITE_PROTO}")
  message(STATUS "  USE_PROTOBUF_SHARED_LIBS  : ${ONNX_USE_PROTOBUF_SHARED_LIBS}")
  message(STATUS "  Protobuf_USE_STATIC_LIBS  : ${Protobuf_USE_STATIC_LIBS}")
  message(STATUS "  ONNX_DISABLE_EXCEPTIONS   : ${ONNX_DISABLE_EXCEPTIONS}")
  message(STATUS "  ONNX_WERROR               : ${ONNX_WERROR}")
  message(STATUS "  ONNX_BUILD_TESTS          : ${ONNX_BUILD_TESTS}")
  message(STATUS "  ONNX_BUILD_BENCHMARKS     : ${ONNX_BUILD_BENCHMARKS}")
  message(STATUS "  ONNXIFI_DUMMY_BACKEND     : ${ONNXIFI_DUMMY_BACKEND}")
  message(STATUS "  ONNXIFI_ENABLE_EXT        : ${ONNXIFI_ENABLE_EXT}")
  message(STATUS "")
  message(STATUS "  Protobuf compiler         : ${PROTOBUF_PROTOC_EXECUTABLE}")
  message(STATUS "  Protobuf includes         : ${PROTOBUF_INCLUDE_DIRS}")
  message(STATUS "  Protobuf libraries        : ${PROTOBUF_LIBRARIES}")
  message(STATUS "  BUILD_ONNX_PYTHON         : ${BUILD_ONNX_PYTHON}")
  # The project locates Python through find_package(Python3 ...), whose result
  # variables carry the Python3_ prefix.
  message(STATUS "  Python version        : ${Python3_VERSION}")
  message(STATUS "  Python executable     : ${Python3_EXECUTABLE}")
  message(STATUS "  Python includes       : ${Python3_INCLUDE_DIRS}")
  message(STATUS "  Python libraries      : ${Python3_LIBRARIES}")
  message(STATUS "  PYBIND11              : ${pybind11_FOUND}")
  message(STATUS "  Pybind11 version      : ${pybind11_VERSION}")
  message(STATUS "  Pybind11 include      : ${pybind11_INCLUDE_DIR}")
  message(STATUS "  Pybind11 includes     : ${pybind11_INCLUDE_DIRS}")
  message(STATUS "  Pybind11 libraries    : ${pybind11_LIBRARIES}")
endfunction()
|
@ -0,0 +1,19 @@
|
|||||||
|
|
||||||
|
add_subdirectory(third_party)
add_subdirectory(src)

# Copy the GCC/gfortran runtime libraries next to the Python package (into
# <this dir>/lib) at configure time.
if(APPLE)
  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib
       DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib)
elseif(UNIX)
  # Non-Apple Unix: copy each runtime library together with the files its
  # symlink chain points at.
  foreach(gcc_runtime_lib IN ITEMS libgfortran.so.5 libquadmath.so.0 libgcc_s.so.1)
    file(COPY ${GFORTRAN_LIBRARIES_DIR}/${gcc_runtime_lib}
         DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
  endforeach()
endif()
|
@ -0,0 +1,167 @@
|
|||||||
|
import contextlib
|
||||||
|
import ctypes
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import types
|
||||||
|
import warnings
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from ._internal import module_utils as _mod_utils # noqa: F401
|
||||||
|
|
||||||
|
# Query `hasattr` only once.
_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(
    sys, 'setdlopenflags')


@contextlib.contextmanager
def dl_open_guard():
    """Temporarily add ``RTLD_GLOBAL`` to the process' dlopen flags.

    Context manager to set the RTLD_GLOBAL dynamic linker flag while we open a
    shared library to load custom operators.  On platforms where ``sys`` has
    no ``getdlopenflags``/``setdlopenflags`` it does nothing.

    The previous flags are restored when the ``with`` block exits, including
    when the block raises (e.g. ``ctypes.CDLL`` failing with ``OSError``).

    See https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html
    """
    if not _SET_GLOBAL_FLAGS:
        yield
        return

    old_flags = sys.getdlopenflags()
    sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
    try:
        yield
    finally:
        # Always undo the flag change so a failed load does not leave
        # RTLD_GLOBAL switched on for every later import.
        sys.setdlopenflags(old_flags)
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_library_path(path: str) -> str:
    """Return the canonical form of ``path`` with symbolic links resolved."""
    canonical = os.path.realpath(path)
    return canonical
|
||||||
|
|
||||||
|
|
||||||
|
class _Ops(types.ModuleType):
|
||||||
|
#__file__ = '_ops.py'
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super(_Ops, self).__init__('paddleaudio.ops')
|
||||||
|
self.loaded_libraries = set()
|
||||||
|
|
||||||
|
def load_library(self, path):
|
||||||
|
"""
|
||||||
|
Loads a shared library from the given path into the current process.
|
||||||
|
This allows dynamically loading custom operators. For this,
|
||||||
|
you should compile your operator and
|
||||||
|
the static registration code into a shared library object, and then
|
||||||
|
call ``paddleaudio.ops.load_library('path/to/libcustom.so')`` to load the
|
||||||
|
shared object.
|
||||||
|
After the library is loaded, it is added to the
|
||||||
|
``paddleaudio.ops.loaded_libraries`` attribute, a set that may be inspected
|
||||||
|
for the paths of all libraries loaded using this function.
|
||||||
|
Args:
|
||||||
|
path (str): A path to a shared library to load.
|
||||||
|
"""
|
||||||
|
path = resolve_library_path(path)
|
||||||
|
with dl_open_guard():
|
||||||
|
# https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
|
||||||
|
# Import the shared library into the process, thus running its
|
||||||
|
# static (global) initialization code in order to register custom
|
||||||
|
# operators with the JIT.
|
||||||
|
ctypes.CDLL(path)
|
||||||
|
self.loaded_libraries.add(path)
|
||||||
|
|
||||||
|
|
||||||
|
_LIB_DIR = Path(__file__).parent / "lib"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_lib_path(lib: str):
|
||||||
|
suffix = "pyd" if os.name == "nt" else "so"
|
||||||
|
path = _LIB_DIR / f"{lib}.{suffix}"
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def _load_lib(lib: str) -> bool:
    """Load extension module

    Note:
        In case `paddleaudio` is deployed with `pex` format, the library file
        is not in a standard location.
        In this case, we expect that `libpaddleaudio` is available somewhere
        in the search path of dynamic loading mechanism, so that importing
        `_paddleaudio` will have library loader find and load `libpaddleaudio`.
        This is the reason why the function should not raise an error when the
        library file is not found.

    Returns:
        bool:
            True if the library file is found AND the library loaded without failure.
            False if the library file is not found (like in the case where paddleaudio
            is deployed with pex format, thus the shared library file is
            in a non-standard location.).
            If the library file is found but there is an issue loading the library,
            (such as missing dependency) then this function raises the exception as-is.

    Raises:
        Exception:
            If the library file is found, but there is an issue loading the library file,
            (when underlying `ctypes.CDLL` throws an exception), this function will pass
            the exception as-is, instead of catching it and returning bool.
            The expected case is `OSError` thrown by `ctypes.CDLL` when a dynamic dependency
            is not found.
            This behavior was chosen because the expected failure case is not recoverable.
            If a dependency is missing, then users have to install it.
    """
    lib_path = _get_lib_path(lib)
    if lib_path.exists():
        ops.load_library(lib_path)
        return True

    # Missing file: warn and report failure instead of raising (see Note).
    warnings.warn("lib path is not exists:" + str(lib_path))
    return False
|
||||||
|
|
||||||
|
|
||||||
|
# Set to True once `_init_ffmpeg` has completed successfully.
_FFMPEG_INITIALIZED = False


def _init_ffmpeg():
    """Initialize the optional FFmpeg integration (at most once per process).

    Raises:
        RuntimeError: the C++ extension reports it was built without FFmpeg.
        ImportError: the FFmpeg helper library was found but could not be
            loaded (``OSError`` from the dynamic loader).
    """
    global _FFMPEG_INITIALIZED
    if _FFMPEG_INITIALIZED:
        return

    # Imported here rather than at module level: this module is executed while
    # the `paddleaudio` package itself is still being imported.
    # NOTE(review): whether `_paddleaudio` actually exposes the ffmpeg_*
    # functions is not visible from this file - confirm against the extension.
    from paddleaudio import _paddleaudio

    if not _paddleaudio.is_ffmpeg_available():
        raise RuntimeError(
            "paddleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddleaudio."
        )

    try:
        _load_lib("libpaddleaudio_ffmpeg")
    except OSError as err:
        raise ImportError(
            "FFmpeg libraries are not found. Please install FFmpeg.") from err

    import paddleaudio._paddleaudio_ffmpeg  # noqa

    _paddleaudio.ffmpeg_init()
    # Cap the FFmpeg log level at 8.
    if _paddleaudio.ffmpeg_get_log_level() > 8:
        _paddleaudio.ffmpeg_set_log_level(8)

    _FFMPEG_INITIALIZED = True
|
||||||
|
|
||||||
|
|
||||||
|
def _init_extension():
    """Load the paddleaudio C++ extension, warning (not failing) if it is absent."""
    unavailable_msg = "paddleaudio C++ extension is not available. sox_io, sox_effect, kaldi raw feature is not supported!!!"

    if not _mod_utils.is_module_available("paddleaudio._paddleaudio"):
        warnings.warn(unavailable_msg)
        return

    _load_lib("libpaddleaudio")
    # This import is for initializing the methods registered via PyBind11
    # This has to happen after the base library is loaded
    try:
        from paddleaudio import _paddleaudio  # noqa
    except Exception:
        warnings.warn(unavailable_msg)
        return

    # Because this part is executed as part of `import paddleaudio`, we ignore
    # the initialization failure.
    # If the FFmpeg integration is not properly initialized, then detailed error
    # will be raised when client code attempts to import the dedicated feature.
    with contextlib.suppress(Exception):
        _init_ffmpeg()
|
||||||
|
|
||||||
|
|
||||||
|
ops = _Ops()
|
||||||
|
|
||||||
|
_init_extension()
|
@ -0,0 +1,151 @@
|
|||||||
|
import importlib.util
|
||||||
|
import platform
|
||||||
|
import warnings
|
||||||
|
from functools import wraps
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py with modification.
|
||||||
|
|
||||||
|
|
||||||
|
def is_module_available(*modules: str) -> bool:
    r"""Return True if every named module can be located **without**
    importing it.

    This is generally safer than a try-catch block around ``import X``. It
    avoids third party libraries breaking assumptions of some of our tests,
    e.g., setting multiprocessing start method when imported
    (see librosa/#747, torchvision/#544).

    For a dotted name such as ``"pkg.sub"``, ``importlib.util.find_spec``
    imports the parent package ``pkg`` in order to search it.  If that parent
    cannot be imported the module is reported as unavailable instead of
    letting the exception escape.

    Args:
        *modules (str): Module names to look up.

    Returns:
        bool: True if all modules were found (also True for no arguments).
    """

    def _found(name: str) -> bool:
        try:
            return importlib.util.find_spec(name) is not None
        except (ImportError, ValueError):
            # ImportError: the parent package of a dotted name is missing or
            # failed to import.  ValueError: the module is in sys.modules
            # with ``__spec__`` set to None.
            return False

    return all(_found(m) for m in modules)
|
||||||
|
|
||||||
|
|
||||||
|
def requires_module(*modules: str):
    """Decorate function to give error message if invoked without required optional modules.

    This decorator is to give better error message to users rather
    than raising ``NameError:  name 'module' is not defined`` at random places.

    Availability is checked once, when ``requires_module(...)`` is called.
    """
    unavailable = [name for name in modules if not is_module_available(name)]

    if unavailable:
        if len(unavailable) == 1:
            req = f"module: {unavailable[0]}"
        else:
            req = f"modules: {unavailable}"

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires {req}")

            return wrapped
    else:
        # Everything is available: hand the function back untouched.
        def decorator(func):
            return func

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def deprecated(direction: str, version: Optional[str]=None):
    """Decorator to add deprecation message

    Args:
        direction (str): Migration steps to be given to users.
        version (str, optional): The version when the object will be removed.
            When None, the message says "future" instead.
    """
    release = "future" if version is None else version

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            parts = [
                f"{func.__module__}.{func.__name__} has been deprecated ",
                f"and will be removed from {release} release. ",
                f"{direction}",
            ]
            # stacklevel=2 attributes the warning to the caller of `func`.
            warnings.warn("".join(parts), stacklevel=2)
            return func(*args, **kwargs)

        return wrapped

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def is_kaldi_available():
    """Return True if the ``paddleaudio._paddleaudio`` module can be located."""
    extension_name = "paddleaudio._paddleaudio"
    return is_module_available(extension_name)


def requires_kaldi():
    """Decorator factory: the decorated function raises ``RuntimeError`` on
    call when ``is_kaldi_available()`` was False at decoration time."""
    if not is_kaldi_available():

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires libpaddleaudio build with kaldi")

            return wrapped
    else:

        def decorator(func):
            return func

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def _check_soundfile_importable():
    """Check that the ``soundfile`` package is both present and importable.

    Returns:
        bool: ``False`` when ``is_module_available("soundfile")`` is false or
        when importing it raises (a warning is emitted in that case);
        ``True`` when the import succeeds.
    """
    if is_module_available("soundfile"):
        try:
            import soundfile  # noqa: F401
        except Exception:
            warnings.warn(
                "Failed to import soundfile. 'soundfile' backend is not available.")
            return False
        return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
# Evaluated once at import time; is_soundfile_available() returns this value.
_is_soundfile_importable = _check_soundfile_importable()
|
||||||
|
|
||||||
|
|
||||||
|
def is_soundfile_available():
    """Return the module-level ``_is_soundfile_importable`` flag."""
    soundfile_ok = _is_soundfile_importable
    return soundfile_ok
|
||||||
|
|
||||||
|
|
||||||
|
def requires_soundfile():
    """Return a decorator that guards functions needing ``soundfile``.

    If ``is_soundfile_available()`` is true the decorator hands the function
    back unchanged. Otherwise the function is replaced by a stub that raises
    ``RuntimeError`` when called.
    """
    if not is_soundfile_available():

        def decorator(func):
            def _unavailable(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires soundfile")

            return wraps(func)(_unavailable)

        return decorator

    def decorator(func):
        return func

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def is_sox_available():
    """Report whether the sox-enabled extension module can be used.

    Returns:
        ``False`` on Windows (sox is not supported there); otherwise the
        result of ``is_module_available("paddleaudio._paddleaudio")``.
    """
    on_windows = platform.system() == "Windows"
    if on_windows:
        return False
    return is_module_available("paddleaudio._paddleaudio")
|
||||||
|
|
||||||
|
|
||||||
|
def requires_sox():
    """Return a decorator that guards functions needing the sox build.

    If ``is_sox_available()`` is true the decorator hands the function back
    unchanged. Otherwise the function is replaced by a stub that raises
    ``RuntimeError`` when called.
    """
    if not is_sox_available():

        def decorator(func):
            def _unavailable(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires libpaddleaudio build with sox"
                )

            return wraps(func)(_unavailable)

        return decorator

    def decorator(func):
        return func

    return decorator
|
@ -0,0 +1,55 @@
|
|||||||
|
# Taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification.
|
||||||
|
|
||||||
|
class AudioInfo:
    """Return value of the ``info`` function.

    This class is used by :ref:`"sox_io" backend<sox_io_backend>` and
    :ref:`"soundfile" backend with the new interface<soundfile_backend>`.

    :ivar int sample_rate: Sample rate
    :ivar int num_frames: The number of frames
    :ivar int num_channels: The number of channels
    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
        or when it cannot be accurately inferred.
    :ivar str encoding: Audio encoding
        The values encoding can take are one of the following:

            * ``PCM_S``: Signed integer linear PCM
            * ``PCM_U``: Unsigned integer linear PCM
            * ``PCM_F``: Floating point linear PCM
            * ``FLAC``: Flac, Free Lossless Audio Codec
            * ``ULAW``: Mu-law
            * ``ALAW``: A-law
            * ``MP3`` : MP3, MPEG-1 Audio Layer III
            * ``VORBIS``: OGG Vorbis
            * ``AMR_WB``: Adaptive Multi-Rate Wideband
            * ``AMR_NB``: Adaptive Multi-Rate (narrowband)
            * ``OPUS``: Opus
            * ``HTK``: Single channel 16-bit PCM
            * ``UNKNOWN`` : None of above
    """

    def __init__(
            self,
            sample_rate: int,
            num_frames: int,
            num_channels: int,
            bits_per_sample: int,
            encoding: str, ):
        self.sample_rate = sample_rate
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.bits_per_sample = bits_per_sample
        self.encoding = encoding

    def __str__(self):
        # The prefix previously read "AudioMetaData", which is not the name
        # of this class; print the actual class name instead.
        return (f"AudioInfo("
                f"sample_rate={self.sample_rate}, "
                f"num_frames={self.num_frames}, "
                f"num_channels={self.num_channels}, "
                f"bits_per_sample={self.bits_per_sample}, "
                f"encoding={self.encoding}"
                f")")
|
@ -0,0 +1,32 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from paddle import Tensor
|
||||||
|
|
||||||
|
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
|
||||||
|
|
||||||
|
|
||||||
|
def load(
        filepath: Union[str, Path],
        out: Optional[Tensor]=None,
        normalization: Union[bool, float, Callable]=True,
        channels_first: bool=True,
        num_frames: int=0,
        offset: int=0,
        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Placeholder ``load`` for the case where no audio I/O backend exists.

    Every argument is accepted and ignored.

    Raises:
        RuntimeError: Always.
    """
    reason = "No audio I/O backend is available."
    raise RuntimeError(reason)
|
||||||
|
|
||||||
|
|
||||||
|
def save(filepath: str,
         src: Tensor,
         sample_rate: int,
         precision: int=16,
         channels_first: bool=True) -> None:
    """Placeholder ``save`` for the case where no audio I/O backend exists.

    Every argument is accepted and ignored.

    Raises:
        RuntimeError: Always.
    """
    reason = "No audio I/O backend is available."
    raise RuntimeError(reason)
|
||||||
|
|
||||||
|
|
||||||
|
def info(filepath: str) -> None:
    """Placeholder ``info`` for the case where no audio I/O backend exists.

    Raises:
        RuntimeError: Always; ``filepath`` is ignored.
    """
    reason = "No audio I/O backend is available."
    raise RuntimeError(reason)
|
@ -0,0 +1,677 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import os
|
||||||
|
import warnings
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
import resampy
|
||||||
|
import soundfile
|
||||||
|
from scipy.io import wavfile
|
||||||
|
|
||||||
|
from ..utils import depth_convert
|
||||||
|
from ..utils import ParameterError
|
||||||
|
from .common import AudioInfo
|
||||||
|
|
||||||
|
# Names exported by `from <this module> import *`.
__all__ = [
    'resample',
    'to_mono',
    'normalize',
    'save',
    'soundfile_save',
    'load',
    'soundfile_load',
    'info',
]
# Normalization types; quoted in the error raised by `normalize`.
NORMALMIZE_TYPES = ['linear', 'gaussian']
# Channel-merge strategies accepted by `to_mono`.
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
# Filter names accepted by `resample`.
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
# Small positive constant used by `normalize` to avoid dividing by zero.
EPS = 1e-8
|
||||||
|
|
||||||
|
|
||||||
|
def resample(y: np.ndarray,
             src_sr: int,
             target_sr: int,
             mode: str='kaiser_fast') -> np.ndarray:
    """Audio resampling.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        src_sr (int): Source sample rate.
        target_sr (int): Target sample rate.
        mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.

    Returns:
        np.ndarray: `y` resampled to `target_sr`

    Raises:
        ParameterError: If `y` is not a ``np.ndarray`` or `mode` is not one
            of ``RESAMPLE_MODES``.
    """
    if mode == 'kaiser_best':
        # Adjacent literals instead of a backslash continuation, so the
        # source indentation can never leak into the message text.
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, '
            'we recommend the mode kaiser_fast in large scale audio trainning')

    if not isinstance(y, np.ndarray):
        # The f-prefix was missing, so "{type(y)}" was printed literally
        # instead of the offending type.
        raise ParameterError(
            f'Only support numpy np.ndarray, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    return resampy.resample(y, src_sr, target_sr, filter=mode)
|
||||||
|
|
||||||
|
|
||||||
|
def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
    """Collapse a two-channel waveform into one channel.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D. A 1D input is
            returned unchanged. For 2D input the channels are ``y[0]`` and
            ``y[1]``.
        merge_type (str, optional): One of ``MERGE_TYPES``. 'ch0' / 'ch1'
            pick a channel, 'random' picks one of the first two at random,
            'average' takes their mean. Defaults to 'average'.

    Returns:
        np.ndarray: `y` with mono channel.

    Raises:
        ParameterError: On an unknown `merge_type`, on ``y.ndim > 2``, or
            when averaging a dtype other than float32 / int16 / int8.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:
        # Already mono.
        return y

    if merge_type == 'ch0':
        return y[0]
    elif merge_type == 'ch1':
        return y[1]
    elif merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # merge_type == 'average': how the mean is taken depends on the dtype.
    if y.dtype == 'float32':
        return (y[0] + y[1]) * 0.5

    # Integer input is widened before summing so the addition cannot overflow.
    if y.dtype == 'int16':
        wide_dtype = 'int32'
    elif y.dtype == 'int8':
        wide_dtype = 'int16'
    else:
        raise ParameterError(f'Unsupported dtype: {y.dtype}')

    limits = np.iinfo(y.dtype)
    widened = y.astype(wide_dtype)
    mean = (widened[0] + widened[1]) // 2
    return np.clip(mean, limits.min, limits.max).astype(y.dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def soundfile_load_(file: os.PathLike,
                    offset: Optional[float]=None,
                    dtype: str='int16',
                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
    """Read a waveform with the ``soundfile`` library.

    Args:
        file (os.PathLike): File of waveform.
        offset (Optional[float], optional): Start position; it is multiplied
            by the file's sample rate to get the frame to seek to. Defaults
            to None (start of file).
        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
        duration (Optional[int], optional): Length to read; it is multiplied
            by the sample rate to get a frame count. ``None`` reads to the
            end. Defaults to None.

    Returns:
        Tuple[np.ndarray, int]: The transposed array returned by
        ``SoundFile.read`` and the file's sample rate.
    """
    with soundfile.SoundFile(file) as sf_desc:
        sr_native = sf_desc.samplerate
        if offset:
            sf_desc.seek(int(offset * sr_native))
        # -1 asks soundfile for every remaining frame.
        n_frames = -1 if duration is None else int(duration * sr_native)
        y = sf_desc.read(frames=n_frames, dtype=dtype, always_2d=False).T

    return y, sr_native
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(y: np.ndarray, norm_type: str='linear',
              mul_factor: float=1.0) -> np.ndarray:
    """Scale a waveform, then apply an extra multiplier.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        norm_type (str, optional): 'linear' divides by the peak absolute
            value; 'gaussian' subtracts the mean and divides by the standard
            deviation. Defaults to 'linear'.
        mul_factor (float, optional): Scaling factor. Defaults to 1.0.

    Returns:
        np.ndarray: `y` after normalization.

    Raises:
        NotImplementedError: If `norm_type` is neither 'linear' nor 'gaussian'.
    """
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        # EPS keeps the division finite for an all-zero signal.
        return y * (1.0 / (peak + EPS)) * mul_factor

    if norm_type == 'gaussian':
        centered = y - np.mean(y)
        spread = max(np.std(y), EPS)
        return mul_factor * centered / spread

    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
|
||||||
|
|
||||||
|
|
||||||
|
def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
    """Save audio file to disk.

    The waveform is written with ``scipy.io.wavfile``. Input that is not
    already int16 or int8 is converted to int16 first (with a warning).

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        sr (int): Sample rate.
        file (os.PathLike): Path of audio file to save; a ``str`` or any
            ``os.PathLike`` such as ``pathlib.Path``. Must end in ``.wav``.

    Raises:
        ParameterError: If the path does not end in ``.wav`` or ``sr <= 0``.
    """
    # Normalize to a plain path string first: pathlib.Path has no
    # ``endswith``, so the extension check used to raise AttributeError for
    # the very type the annotation advertises.
    path = os.fspath(file)
    if not path.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        raise ParameterError(
            f'Sample rate should be larger than 0, recieved sr = {sr}')

    if y.dtype not in ['int16', 'int8']:
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    wavfile.write(path, sr, y_out)
|
||||||
|
|
||||||
|
|
||||||
|
def soundfile_load(
        file: os.PathLike,
        sr: Optional[int]=None,
        mono: bool=True,
        merge_type: str='average',  # ch0,ch1,random,average
        normal: bool=True,
        norm_type: str='linear',
        norm_mul_factor: float=1.0,
        offset: float=0.0,
        duration: Optional[int]=None,
        dtype: str='float32',
        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
    """Load an audio file and optionally downmix, resample and normalize it.

    The steps run in this order: read via ``soundfile_load_``, reject empty
    audio, ``to_mono`` (if `mono`), ``resample`` (if `sr` differs from the
    file's rate), ``normalize``, then ``depth_convert`` to `dtype`.

    Args:
        file (os.PathLike): Path of audio file to load.
        sr (Optional[int], optional): Sample rate of loaded waveform. ``None``
            keeps the file's rate. Defaults to None.
        mono (bool, optional): Return waveform with mono channel. Defaults to True.
        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
        normal (bool, optional): Waveform normalization. Defaults to True.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.

    Returns:
        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.

    Raises:
        ParameterError: If the loaded array has no samples.
    """
    waveform, rate = soundfile_load_(
        file, offset=offset, dtype=dtype, duration=duration)

    has_samples = (waveform.ndim == 1 and len(waveform) > 0) or (
        waveform.ndim == 2 and len(waveform[0]) > 0)
    if not has_samples:
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        waveform = to_mono(waveform, merge_type)

    if sr is not None and sr != rate:
        waveform = resample(waveform, rate, sr, mode=resample_mode)
        rate = sr

    if normal:
        waveform = normalize(waveform, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # still need to do normalization, before depth conversion
        waveform = normalize(waveform, 'linear', 1.0)

    return depth_convert(waveform, dtype), rate
|
||||||
|
|
||||||
|
|
||||||
|
# The code below is taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py with modification.
|
||||||
|
|
||||||
|
|
||||||
|
def _get_subtype_for_wav(dtype: paddle.dtype,
                         encoding: str,
                         bits_per_sample: int):
    """Pick the soundfile subtype string for writing a WAV file.

    Args:
        dtype (paddle.dtype): Tensor dtype; consulted only when both
            `encoding` and `bits_per_sample` are falsy.
        encoding (str): Requested encoding ("PCM_S", "PCM_U", "PCM_F",
            "ULAW", "ALAW") or a falsy value.
        bits_per_sample (int): Requested bit depth or a falsy value.

    Returns:
        str: Subtype name such as "PCM_16", "PCM_U8", "FLOAT" or "ULAW".

    Raises:
        ValueError: If the combination is not supported for WAV.
    """
    if not encoding:
        if bits_per_sample:
            return "PCM_U8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
        # Neither encoding nor bit depth given: infer from the tensor dtype.
        inferred = {
            paddle.uint8: "PCM_U8",
            paddle.int16: "PCM_16",
            paddle.int32: "PCM_32",
            paddle.float32: "FLOAT",
            paddle.float64: "DOUBLE",
        }.get(dtype)
        if not inferred:
            raise ValueError(f"Unsupported dtype for wav: {dtype}")
        return inferred

    if encoding == "PCM_S":
        if not bits_per_sample:
            return "PCM_32"
        if bits_per_sample == 8:
            raise ValueError("wav does not support 8-bit signed PCM encoding.")
        return f"PCM_{bits_per_sample}"

    # Encodings that only exist with 8 bits per sample.
    eight_bit_only = (
        ("PCM_U", "PCM_U8", "wav only supports 8-bit unsigned PCM encoding."),
        ("ULAW", "ULAW", "wav only supports 8-bit mu-law encoding."),
        ("ALAW", "ALAW", "wav only supports 8-bit a-law encoding."), )
    for name, subtype, error in eight_bit_only:
        if encoding == name:
            if bits_per_sample in (None, 8):
                return subtype
            raise ValueError(error)

    if encoding == "PCM_F":
        if bits_per_sample in (None, 32):
            return "FLOAT"
        if bits_per_sample == 64:
            return "DOUBLE"
        raise ValueError("wav only supports 32/64-bit float PCM encoding.")

    raise ValueError(f"wav does not support {encoding}.")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
|
||||||
|
if encoding in (None, "PCM_S"):
|
||||||
|
return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
|
||||||
|
if encoding in ("PCM_U", "PCM_F"):
|
||||||
|
raise ValueError(f"sph does not support {encoding} encoding.")
|
||||||
|
if encoding == "ULAW":
|
||||||
|
if bits_per_sample in (None, 8):
|
||||||
|
return "ULAW"
|
||||||
|
raise ValueError("sph only supports 8-bit for mu-law encoding.")
|
||||||
|
if encoding == "ALAW":
|
||||||
|
return "ALAW"
|
||||||
|
raise ValueError(f"sph does not support {encoding}.")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_subtype(dtype: paddle.dtype,
                 format: str,
                 encoding: str,
                 bits_per_sample: int):
    """Resolve the soundfile subtype for a given container format.

    Args:
        dtype (paddle.dtype): Tensor dtype, forwarded to the WAV resolver.
        format (str): Lower-case format / extension: "wav", "flac", "ogg",
            "vorbis", "sph", "nis" or "nist".
        encoding (str): Requested encoding or a falsy value.
        bits_per_sample (int): Requested bit depth or a falsy value.

    Returns:
        str: Subtype name to pass to ``soundfile.write``.

    Raises:
        ValueError: For an unknown format or an option the format rejects.
    """
    if format == "wav":
        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
    elif format == "flac":
        if encoding:
            raise ValueError("flac does not support encoding.")
        if not bits_per_sample:
            return "PCM_16"
        if bits_per_sample > 24:
            raise ValueError("flac does not support bits_per_sample > 24.")
        if bits_per_sample == 8:
            return "PCM_S8"
        return f"PCM_{bits_per_sample}"
    elif format in ("ogg", "vorbis"):
        if not (encoding or bits_per_sample):
            return "VORBIS"
        raise ValueError(
            "ogg/vorbis does not support encoding/bits_per_sample.")
    elif format == "sph":
        return _get_subtype_for_sphere(encoding, bits_per_sample)
    elif format in ("nis", "nist"):
        return "PCM_16"
    raise ValueError(f"Unsupported format: {format}")
|
||||||
|
|
||||||
|
|
||||||
|
def save(
        filepath: str,
        src: paddle.Tensor,
        sample_rate: int,
        channels_first: bool=True,
        compression: Optional[float]=None,
        format: Optional[str]=None,
        encoding: Optional[str]=None,
        bits_per_sample: Optional[int]=None, ):
    """Save audio data to file.

    Note:
        The formats this function can handle depend on the soundfile
        installation. It is tested on WAV (32-bit floating-point, 32-bit
        signed integer, 16-bit signed integer, 8-bit unsigned integer),
        FLAC, OGG/VORBIS and SPHERE.

    Note:
        ``filepath`` is annotated as ``str`` only, for consistency with the
        ``"sox_io"`` backend, although ``pathlib.Path`` objects and objects
        with a ``write`` attribute are handled as well.

    Args:
        filepath (str or pathlib.Path): Path to audio file.
        src (paddle.Tensor): Audio data to save. Must be a 2D tensor.
        sample_rate (int): Sampling rate.
        channels_first (bool, optional): If ``True``, the given tensor is
            interpreted as `[channel, time]`, otherwise `[time, channel]`.
        compression (float or None, optional): Not used; a warning is issued
            when it is given. Present only for interface compatibility with
            the "sox_io" backend.
        format (str or None, optional): Override the audio format. For a
            path, the format is otherwise taken from the file extension. For
            a file-like object this argument is required. Valid values are
            ``"wav"``, ``"ogg"``, ``"vorbis"``, ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for formats
            that support it, such as ``"wav"`` and ``"sph"``. Valid values:

            - ``"PCM_S"`` (signed integer Linear PCM)
            - ``"PCM_U"`` (unsigned integer Linear PCM)
            - ``"PCM_F"`` (floating point PCM)
            - ``"ULAW"`` (mu-law)
            - ``"ALAW"`` (a-law)

        bits_per_sample (int or None, optional): Changes the bit depth when
            ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``. Valid
            values are ``8``, ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depth/compression are:

    ``"wav"``
        - 32-bit floating-point PCM
        - 32-bit signed integer PCM
        - 24-bit signed integer PCM
        - 16-bit signed integer PCM
        - 8-bit unsigned integer PCM
        - 8-bit mu-law
        - 8-bit a-law

        Note:
            Default encoding/bit depth is determined by the dtype of
            the input Tensor.

    ``"flac"``
        - 8-bit
        - 16-bit (default)
        - 24-bit

    ``"ogg"``, ``"vorbis"``
        - Doesn't accept changing configuration.

    ``"sph"``
        - 8-bit signed integer PCM
        - 16-bit signed integer PCM
        - 24-bit signed integer PCM
        - 32-bit signed integer PCM (default)
        - 8-bit mu-law
        - 8-bit a-law
        - 16-bit a-law
        - 24-bit a-law
        - 32-bit a-law

    Raises:
        ValueError: If `src` is not 2D, `bits_per_sample` is invalid, or the
            format/encoding/bit-depth combination is unsupported.
        RuntimeError: If `filepath` is file-like and `format` is ``None``.
    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
    if compression is not None:
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored.")

    # Work out the extension that selects the subtype.
    is_file_object = hasattr(filepath, "write")
    if not is_file_object:
        ext = str(filepath).rsplit(".", 1)[-1].lower()
    elif format is None:
        raise RuntimeError("`format` is required when saving to file object.")
    else:
        ext = format.lower()

    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this.")
    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)

    # sph is an extension used in TED-LIUM but soundfile does not recognize
    # it as NIST format, so the format is set explicitly here.
    if format is None and ext in ["nis", "nist", "sph"]:
        format = "NIST"

    # soundfile.write is handed data laid out as [time, channel].
    data = src.t() if channels_first else src

    soundfile.write(
        file=filepath,
        data=data,
        samplerate=sample_rate,
        subtype=subtype,
        format=format)
|
||||||
|
|
||||||
|
|
||||||
|
# Maps a soundfile subtype to the dtype string that `load` passes to
# `SoundFile.read` when reading a WAV file with normalize=False.
_SUBTYPE2DTYPE = {
    "PCM_S8": "int8",
    "PCM_U8": "uint8",
    "PCM_16": "int16",
    "PCM_32": "int32",
    "FLOAT": "float32",
    "DOUBLE": "float64",
}
|
||||||
|
|
||||||
|
|
||||||
|
def load(
        filepath: str,
        frame_offset: int=0,
        num_frames: int=-1,
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
    """Load audio data from file.

    Note:
        The formats this function can handle depend on the soundfile
        installation. It is tested on WAV (32-bit floating-point, 32-bit
        signed integer, 16-bit signed integer, 8-bit unsigned integer),
        FLAC, OGG/VORBIS and SPHERE.

    By default (``normalize=True``, ``channels_first=True``) the data is read
    as ``float32`` and returned with shape `[channel, time]`.

    For a WAV file with ``normalize=False`` the data is read with the dtype
    that ``_SUBTYPE2DTYPE`` lists for the file's subtype (for example
    ``int16`` for "PCM_16", ``uint8`` for "PCM_U8"); subtypes missing from
    that table, such as 24-bit PCM, are rejected. For every other format
    ``normalize`` has no effect and the data is read as ``float32``.

    Note:
        ``filepath`` is annotated as ``str`` only, for consistency with the
        ``"sox_io"`` backend, although ``pathlib.Path`` is accepted as well.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before start reading data.
        num_frames (int, optional):
            Maximum number of frames to read. ``-1`` reads all the remaining
            samples, starting from ``frame_offset``. Fewer frames may be
            returned if the file does not contain enough.
        normalize (bool, optional):
            When ``True`` the data is always read as ``float32``. When
            ``False`` and the file is WAV, the file's native dtype is used.
        channels_first (bool, optional):
            When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        (paddle.Tensor, int): Resulting Tensor and sample rate.

    Raises:
        ValueError: If ``normalize=False`` and the WAV subtype is not in
            ``_SUBTYPE2DTYPE``.
    """
    with soundfile.SoundFile(filepath, "r") as reader:
        keep_native_dtype = reader.format == "WAV" and not normalize
        if not keep_native_dtype:
            dtype = "float32"
        else:
            if reader.subtype not in _SUBTYPE2DTYPE:
                raise ValueError(f"Unsupported subtype: {reader.subtype}")
            dtype = _SUBTYPE2DTYPE[reader.subtype]

        # NOTE(review): `_prepare_read` is a private soundfile method;
        # presumably it seeks to `frame_offset` and returns the number of
        # frames to read - confirm against the installed soundfile version.
        n_frames = reader._prepare_read(frame_offset, None, num_frames)
        samples = reader.read(n_frames, dtype, always_2d=True)
        sample_rate = reader.samplerate

    tensor = paddle.to_tensor(samples)
    if channels_first:
        # always_2d=True was requested, so swap the two axes.
        tensor = paddle.transpose(tensor, perm=[1, 0])
    return tensor, sample_rate
|
||||||
|
|
||||||
|
|
||||||
|
# Mapping from soundfile subtype to number of bits per sample.
|
||||||
|
# This is mostly heuristical and the value is set to 0 when it is irrelevant
|
||||||
|
# (lossy formats) or when it can't be inferred.
|
||||||
|
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
|
||||||
|
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
|
||||||
|
# the default seems to be 8 bits but it can be compressed further to 4 bits.
|
||||||
|
# The dict is inspired from
|
||||||
|
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
|
||||||
|
# Looked up by `_get_bit_depth`; subtypes missing from this table are
# reported there as 0 bits with a warning.
_SUBTYPE_TO_BITS_PER_SAMPLE = {
    "PCM_S8": 8,  # Signed 8 bit data
    "PCM_16": 16,  # Signed 16 bit data
    "PCM_24": 24,  # Signed 24 bit data
    "PCM_32": 32,  # Signed 32 bit data
    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
    "FLOAT": 32,  # 32 bit float data
    "DOUBLE": 64,  # 64 bit float data
    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "IMA_ADPCM": 0,  # IMA ADPCM.
    "MS_ADPCM": 0,  # Microsoft ADPCM.
    "GSM610":
    0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_bit_depth(subtype):
    """Look up the bits per sample for a soundfile subtype.

    Args:
        subtype: Subtype name as reported by soundfile.

    Returns:
        int: The entry from ``_SUBTYPE_TO_BITS_PER_SAMPLE``, or 0 (after
        emitting a warning) when the subtype is not in the table.
    """
    bits = _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype)
    if bits is not None:
        return bits

    warnings.warn(
        f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
        "attribute will be set to 0. If you are seeing this warning, please "
        "report by opening an issue on github (after checking for existing/closed ones). "
        "You may otherwise ignore this warning.")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Maps a soundfile subtype to the encoding name reported by `info`;
# `_get_encoding` falls back to "UNKNOWN" for subtypes not listed here.
_SUBTYPE_TO_ENCODING = {
    "PCM_S8": "PCM_S",
    "PCM_16": "PCM_S",
    "PCM_24": "PCM_S",
    "PCM_32": "PCM_S",
    "PCM_U8": "PCM_U",
    "FLOAT": "PCM_F",
    "DOUBLE": "PCM_F",
    "ULAW": "ULAW",
    "ALAW": "ALAW",
    "VORBIS": "VORBIS",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_encoding(format: str, subtype: str):
    """Map a (format, subtype) pair to an encoding name.

    Args:
        format: Container format name; ``"FLAC"`` short-circuits the lookup.
        subtype: Key to look up in ``_SUBTYPE_TO_ENCODING``.

    Returns:
        ``"FLAC"`` when ``format`` is ``"FLAC"``; otherwise the table entry
        for ``subtype``, or ``"UNKNOWN"`` when the subtype has no entry.
    """
    # FLAC is decided by the container format alone; everything else by subtype.
    return ("FLAC" if format == "FLAC" else
            _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN"))
|
||||||
|
|
||||||
|
|
||||||
|
def info(filepath: str, format: Optional[str]=None) -> AudioInfo:
|
||||||
|
"""Get signal information of an audio file.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
|
||||||
|
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filepath (path-like object or file-like object):
|
||||||
|
Source of audio data.
|
||||||
|
format (str or None, optional):
|
||||||
|
Not used. PySoundFile does not accept format hint.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
AudioInfo: meta data of the given audio.
|
||||||
|
|
||||||
|
"""
|
||||||
|
sinfo = soundfile.info(filepath)
|
||||||
|
return AudioInfo(
|
||||||
|
sinfo.samplerate,
|
||||||
|
sinfo.frames,
|
||||||
|
sinfo.channels,
|
||||||
|
bits_per_sample=_get_bit_depth(sinfo.subtype),
|
||||||
|
encoding=_get_encoding(sinfo.format, sinfo.subtype), )
|
@ -0,0 +1,106 @@
|
|||||||
|
import os
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddleaudio
|
||||||
|
from paddle import Tensor
|
||||||
|
from paddleaudio._internal import module_utils as _mod_utils
|
||||||
|
|
||||||
|
from .common import AudioInfo
|
||||||
|
|
||||||
|
#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
|
||||||
|
|
||||||
|
|
||||||
|
def _fail_info(filepath: str, format: Optional[str]) -> AudioInfo:
|
||||||
|
raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
|
||||||
|
|
||||||
|
|
||||||
|
def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioInfo:
|
||||||
|
raise RuntimeError("Failed to fetch metadata from {}".format(fileobj))
|
||||||
|
|
||||||
|
|
||||||
|
# Note: need to comply TorchScript syntax -- need annotation and no f-string
|
||||||
|
def _fail_load(
|
||||||
|
filepath: str,
|
||||||
|
frame_offset: int=0,
|
||||||
|
num_frames: int=-1,
|
||||||
|
normalize: bool=True,
|
||||||
|
channels_first: bool=True,
|
||||||
|
format: Optional[str]=None, ) -> Tuple[Tensor, int]:
|
||||||
|
raise RuntimeError("Failed to load audio from {}".format(filepath))
|
||||||
|
|
||||||
|
|
||||||
|
def _fail_load_fileobj(fileobj, *args, **kwargs):
|
||||||
|
raise RuntimeError(f"Failed to load audio from {fileobj}")
|
||||||
|
|
||||||
|
|
||||||
|
# Handlers that ``load`` / ``info`` call when the native binding returns ``None``.
# Each one raises ``RuntimeError`` naming the source that could not be read.
_fallback_info = _fail_info
_fallback_info_fileobj = _fail_info_fileobj
_fallback_load = _fail_load
# ``load`` looks this handler up as ``_fallback_load_fileobj``. It used to be
# bound only under the misspelled name below, so the file-object failure path
# raised NameError instead of the intended RuntimeError.
_fallback_load_fileobj = _fail_load_fileobj
# Misspelled alias kept so any existing reference to the old name still resolves.
_fallback_load_filebj = _fallback_load_fileobj
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
|
||||||
|
def load(
|
||||||
|
filepath: str,
|
||||||
|
frame_offset: int=0,
|
||||||
|
num_frames: int=-1,
|
||||||
|
normalize: bool=True,
|
||||||
|
channels_first: bool=True,
|
||||||
|
format: Optional[str]=None, ) -> Tuple[Tensor, int]:
|
||||||
|
if hasattr(filepath, "read"):
|
||||||
|
ret = paddleaudio._paddleaudio.load_audio_fileobj(
|
||||||
|
filepath, frame_offset, num_frames, normalize, channels_first,
|
||||||
|
format)
|
||||||
|
if ret is not None:
|
||||||
|
audio_tensor = paddle.to_tensor(ret[0])
|
||||||
|
return (audio_tensor, ret[1])
|
||||||
|
return _fallback_load_fileobj(filepath, frame_offset, num_frames,
|
||||||
|
normalize, channels_first, format)
|
||||||
|
filepath = os.fspath(filepath)
|
||||||
|
ret = paddleaudio._paddleaudio.sox_io_load_audio_file(
|
||||||
|
filepath, frame_offset, num_frames, normalize, channels_first, format)
|
||||||
|
if ret is not None:
|
||||||
|
audio_tensor = paddle.to_tensor(ret[0])
|
||||||
|
return (audio_tensor, ret[1])
|
||||||
|
return _fallback_load(filepath, frame_offset, num_frames, normalize,
|
||||||
|
channels_first, format)
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
|
||||||
|
def save(
|
||||||
|
filepath: str,
|
||||||
|
src: Tensor,
|
||||||
|
sample_rate: int,
|
||||||
|
channels_first: bool=True,
|
||||||
|
compression: Optional[float]=None,
|
||||||
|
format: Optional[str]=None,
|
||||||
|
encoding: Optional[str]=None,
|
||||||
|
bits_per_sample: Optional[int]=None, ):
|
||||||
|
src_arr = src.numpy()
|
||||||
|
if hasattr(filepath, "write"):
|
||||||
|
paddleaudio._paddleaudio.save_audio_fileobj(
|
||||||
|
filepath, src_arr, sample_rate, channels_first, compression, format,
|
||||||
|
encoding, bits_per_sample)
|
||||||
|
return
|
||||||
|
filepath = os.fspath(filepath)
|
||||||
|
paddleaudio._paddleaudio.sox_io_save_audio_file(
|
||||||
|
filepath, src_arr, sample_rate, channels_first, compression, format,
|
||||||
|
encoding, bits_per_sample)
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def info(
        filepath: str,
        format: Optional[str]=None, ) -> AudioInfo:
    """Fetch metadata for an audio source through the native sox bindings.

    Args:
        filepath: Either an object exposing ``read`` (treated as a file
            object) or anything accepted by ``os.fspath``.
        format: Forwarded unchanged to the native call.

    Returns:
        AudioInfo: Built by unpacking the value returned by the native call.

    Raises:
        RuntimeError: Via the fallback handler when the native call
            returns ``None``.
    """
    from_fileobj = hasattr(filepath, "read")
    if from_fileobj:
        raw = paddleaudio._paddleaudio.get_info_fileobj(filepath, format)
    else:
        # Normalise path-like objects before handing them to the binding.
        filepath = os.fspath(filepath)
        raw = paddleaudio._paddleaudio.get_info_file(filepath, format)

    if raw is not None:
        return AudioInfo(*raw)

    on_failure = _fallback_info_fileobj if from_fileobj else _fallback_info
    return on_failure(filepath, format)
|
@ -0,0 +1,83 @@
|
|||||||
|
"""Defines utilities for switching audio backends"""
|
||||||
|
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
|
||||||
|
import warnings
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import paddleaudio
|
||||||
|
from paddleaudio._internal import module_utils as _mod_utils
|
||||||
|
|
||||||
|
from . import no_backend
|
||||||
|
from . import soundfile_backend
|
||||||
|
from . import sox_io_backend
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"list_audio_backends",
|
||||||
|
"get_audio_backend",
|
||||||
|
"set_audio_backend",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def list_audio_backends() -> List[str]:
    """List available backends

    Returns:
        List[str]: The list of available backends, ``"soundfile"`` first and
        ``"sox_io"`` second when both are present.
    """
    # Both probes always run, in this order, exactly like two sequential checks.
    probes = (
        ("soundfile", _mod_utils.is_module_available("soundfile")),
        ("sox_io", _mod_utils.is_sox_available()), )
    return [name for name, present in probes if present]
|
||||||
|
|
||||||
|
|
||||||
|
def set_audio_backend(backend: Optional[str]):
|
||||||
|
"""Set the backend for I/O operation
|
||||||
|
|
||||||
|
Args:
|
||||||
|
backend (str or None): Name of the backend.
|
||||||
|
One of ``"sox_io"`` or ``"soundfile"`` based on availability
|
||||||
|
of the system. If ``None`` is provided the current backend is unassigned.
|
||||||
|
"""
|
||||||
|
if backend is not None and backend not in list_audio_backends():
|
||||||
|
raise RuntimeError(f'Backend "{backend}" is not one of '
|
||||||
|
f"available backends: {list_audio_backends()}.")
|
||||||
|
|
||||||
|
if backend is None:
|
||||||
|
module = no_backend
|
||||||
|
elif backend == "sox_io":
|
||||||
|
module = sox_io_backend
|
||||||
|
elif backend == "soundfile":
|
||||||
|
module = soundfile_backend
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f'Unexpected backend "{backend}"')
|
||||||
|
|
||||||
|
for func in ["save", "load", "info"]:
|
||||||
|
setattr(paddleaudio, func, getattr(module, func))
|
||||||
|
|
||||||
|
|
||||||
|
def _init_audio_backend():
    """Select the default backend: ``soundfile`` if available, else ``sox_io``.

    When neither is available a warning is emitted and the backend is
    unassigned by calling ``set_audio_backend(None)``.
    """
    available = list_audio_backends()
    # Preference order: soundfile first, sox_io second.
    for preferred in ("soundfile", "sox_io"):
        if preferred in available:
            set_audio_backend(preferred)
            return
    warnings.warn("No audio backend is available.")
    set_audio_backend(None)
|
||||||
|
|
||||||
|
|
||||||
|
def get_audio_backend() -> Optional[str]:
    """Get the name of the current backend

    The backend is identified by comparing ``paddleaudio.load`` with the
    ``load`` function of each backend module.

    Returns:
        Optional[str]: The name of the current backend or ``None`` if no backend is assigned.

    Raises:
        ValueError: If ``paddleaudio.load`` matches none of the known backends.
    """
    active_load = paddleaudio.load
    known = (
        (no_backend, None),
        (sox_io_backend, "sox_io"),
        (soundfile_backend, "soundfile"), )
    for module, label in known:
        if active_load == module.load:
            return label
    raise ValueError("Unknown backend.")
|
@ -0,0 +1,132 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import paddleaudio
|
||||||
|
from paddleaudio._internal import module_utils
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'fbank',
|
||||||
|
'pitch',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@module_utils.requires_kaldi()
def fbank(
        wav,
        samp_freq: int=16000,
        frame_shift_ms: float=10.0,
        frame_length_ms: float=25.0,
        dither: float=0.0,
        preemph_coeff: float=0.97,
        remove_dc_offset: bool=True,
        window_type: str='povey',
        round_to_power_of_two: bool=True,
        blackman_coeff: float=0.42,
        snip_edges: bool=True,
        allow_downsample: bool=False,
        allow_upsample: bool=False,
        max_feature_vectors: int=-1,
        num_bins: int=23,
        low_freq: float=20,
        high_freq: float=0,
        vtln_low: float=100,
        vtln_high: float=-500,
        debug_mel: bool=False,
        htk_mode: bool=False,
        use_energy: bool=False,  # fbank opts
        energy_floor: float=0.0,
        raw_energy: bool=True,
        htk_compat: bool=False,
        use_log_fbank: bool=True,
        use_power: bool=True):
    """Compute filter-bank features with the native ``ComputeFbank`` binding.

    Each keyword argument is copied onto the same-named attribute of one of
    three native option objects (frame extraction, mel banks, fbank), which
    are then passed to ``ComputeFbank`` together with ``wav``.

    Args:
        wav: Input forwarded unchanged to the native call
            (presumably a waveform array -- TODO confirm against the binding).
        samp_freq ... max_feature_vectors: Set on ``FrameExtractionOptions``.
        num_bins ... htk_mode: Set on ``MelBanksOptions``.
        use_energy ... use_power: Set on ``FbankOptions``.

    Returns:
        Whatever ``ComputeFbank`` returns.
    """
    native = paddleaudio._paddleaudio
    frame_opts = native.FrameExtractionOptions()
    mel_opts = native.MelBanksOptions()
    fbank_opts = native.FbankOptions()

    frame_settings = (
        ("samp_freq", samp_freq),
        ("frame_shift_ms", frame_shift_ms),
        ("frame_length_ms", frame_length_ms),
        ("dither", dither),
        ("preemph_coeff", preemph_coeff),
        ("remove_dc_offset", remove_dc_offset),
        ("window_type", window_type),
        ("round_to_power_of_two", round_to_power_of_two),
        ("blackman_coeff", blackman_coeff),
        ("snip_edges", snip_edges),
        ("allow_downsample", allow_downsample),
        ("allow_upsample", allow_upsample),
        ("max_feature_vectors", max_feature_vectors), )
    mel_settings = (
        ("num_bins", num_bins),
        ("low_freq", low_freq),
        ("high_freq", high_freq),
        ("vtln_low", vtln_low),
        ("vtln_high", vtln_high),
        ("debug_mel", debug_mel),
        ("htk_mode", htk_mode), )
    fbank_settings = (
        ("use_energy", use_energy),
        ("energy_floor", energy_floor),
        ("raw_energy", raw_energy),
        ("htk_compat", htk_compat),
        ("use_log_fbank", use_log_fbank),
        ("use_power", use_power), )

    # Apply the settings group by group, in the same order as the signature.
    for target, settings in ((frame_opts, frame_settings),
                             (mel_opts, mel_settings),
                             (fbank_opts, fbank_settings)):
        for attr, value in settings:
            setattr(target, attr, value)

    return native.ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
|
||||||
|
|
||||||
|
|
||||||
|
@module_utils.requires_kaldi()
def pitch(wav,
          samp_freq: int=16000,
          frame_shift_ms: float=10.0,
          frame_length_ms: float=25.0,
          preemph_coeff: float=0.0,
          min_f0: int=50,
          max_f0: int=400,
          soft_min_f0: float=10.0,
          penalty_factor: float=0.1,
          lowpass_cutoff: int=1000,
          resample_freq: int=4000,
          delta_pitch: float=0.005,
          nccf_ballast: int=7000,
          lowpass_filter_width: int=1,
          upsample_filter_width: int=5,
          max_frames_latency: int=0,
          frames_per_chunk: int=0,
          simulate_first_pass_online: bool=False,
          recompute_frame: int=500,
          nccf_ballast_online: bool=False,
          snip_edges: bool=True):
    """Compute pitch features with the native ``ComputeKaldiPitch`` binding.

    Every keyword argument is copied onto the same-named attribute of a
    ``PitchExtractionOptions`` object, which is passed to the native call
    together with ``wav``.

    Args:
        wav: Input forwarded unchanged to the native call
            (presumably a waveform array -- TODO confirm against the binding).
        samp_freq ... snip_edges: Set on ``PitchExtractionOptions``.

    Returns:
        Whatever ``ComputeKaldiPitch`` returns.
    """
    native = paddleaudio._paddleaudio
    pitch_opts = native.PitchExtractionOptions()

    settings = (
        ("samp_freq", samp_freq),
        ("frame_shift_ms", frame_shift_ms),
        ("frame_length_ms", frame_length_ms),
        ("preemph_coeff", preemph_coeff),
        ("min_f0", min_f0),
        ("max_f0", max_f0),
        ("soft_min_f0", soft_min_f0),
        ("penalty_factor", penalty_factor),
        ("lowpass_cutoff", lowpass_cutoff),
        ("resample_freq", resample_freq),
        ("delta_pitch", delta_pitch),
        ("nccf_ballast", nccf_ballast),
        ("lowpass_filter_width", lowpass_filter_width),
        ("upsample_filter_width", upsample_filter_width),
        ("max_frames_latency", max_frames_latency),
        ("frames_per_chunk", frames_per_chunk),
        ("simulate_first_pass_online", simulate_first_pass_online),
        ("recompute_frame", recompute_frame),
        ("nccf_ballast_online", nccf_ballast_online),
        ("snip_edges", snip_edges), )
    for attr, value in settings:
        setattr(pitch_opts, attr, value)

    return native.ComputeKaldiPitch(pitch_opts, wav)
|
@ -0,0 +1,21 @@
|
|||||||
|
from paddleaudio._internal import module_utils as _mod_utils
|
||||||
|
|
||||||
|
from .sox_effects import apply_effects_file
|
||||||
|
from .sox_effects import apply_effects_tensor
|
||||||
|
from .sox_effects import effect_names
|
||||||
|
from .sox_effects import init_sox_effects
|
||||||
|
from .sox_effects import shutdown_sox_effects
|
||||||
|
|
||||||
|
if _mod_utils.is_sox_available():
|
||||||
|
import atexit
|
||||||
|
|
||||||
|
init_sox_effects()
|
||||||
|
atexit.register(shutdown_sox_effects)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"init_sox_effects",
|
||||||
|
"shutdown_sox_effects",
|
||||||
|
"effect_names",
|
||||||
|
"apply_effects_tensor",
|
||||||
|
"apply_effects_file",
|
||||||
|
]
|
@ -0,0 +1,241 @@
|
|||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddleaudio
|
||||||
|
from paddleaudio._internal import module_utils as _mod_utils
|
||||||
|
from paddleaudio.utils.sox_utils import list_effects
|
||||||
|
|
||||||
|
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
|
||||||
|
def init_sox_effects():
|
||||||
|
"""Initialize resources required to use sox effects.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
You do not need to call this function manually. It is called automatically.
|
||||||
|
|
||||||
|
Once initialized, you do not need to call this function again across the multiple uses of
|
||||||
|
sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet.
|
||||||
|
Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing
|
||||||
|
again will result in error.
|
||||||
|
"""
|
||||||
|
paddleaudio._paddleaudio.sox_effects_initialize_sox_effects()
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
|
||||||
|
def shutdown_sox_effects():
|
||||||
|
"""Clean up resources required to use sox effects.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
You do not need to call this function manually. It is called automatically.
|
||||||
|
|
||||||
|
It is safe to call this function multiple times.
|
||||||
|
Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and
|
||||||
|
initializing again will result in error.
|
||||||
|
"""
|
||||||
|
paddleaudio._paddleaudio.sox_effects_shutdown_sox_effects()
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
|
||||||
|
def effect_names() -> List[str]:
|
||||||
|
"""Gets list of valid sox effect names
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[str]: list of available effect names.
|
||||||
|
|
||||||
|
Example
|
||||||
|
>>> paddleaudio.sox_effects.effect_names()
|
||||||
|
['allpass', 'band', 'bandpass', ... ]
|
||||||
|
"""
|
||||||
|
return list(list_effects().keys())
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
|
||||||
|
def apply_effects_tensor(
|
||||||
|
tensor: paddle.Tensor,
|
||||||
|
sample_rate: int,
|
||||||
|
effects: List[List[str]],
|
||||||
|
channels_first: bool=True, ) -> Tuple[paddle.Tensor, int]:
|
||||||
|
"""Apply sox effects to given Tensor
|
||||||
|
|
||||||
|
.. devices:: CPU
|
||||||
|
|
||||||
|
Note:
|
||||||
|
This function only works on CPU Tensors.
|
||||||
|
This function works in the way very similar to ``sox`` command, however there are slight
|
||||||
|
differences. For example, ``sox`` command adds certain effects automatically (such as
|
||||||
|
``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function
|
||||||
|
only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also
|
||||||
|
need to give ``rate`` effect with desired sampling rate.).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tensor (paddle.Tensor): Input 2D CPU Tensor.
|
||||||
|
sample_rate (int): Sample rate
|
||||||
|
effects (List[List[str]]): List of effects.
|
||||||
|
channels_first (bool, optional): Indicates if the input Tensor's dimension is
|
||||||
|
`[channels, time]` or `[time, channels]`
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(Tensor, int): Resulting Tensor and sample rate.
|
||||||
|
The resulting Tensor has the same ``dtype`` as the input Tensor, and
|
||||||
|
the same channels order. The shape of the Tensor can be different based on the
|
||||||
|
effects applied. Sample rate can also be different based on the effects applied.
|
||||||
|
|
||||||
|
Example - Basic usage
|
||||||
|
>>>
|
||||||
|
>>> # Defines the effects to apply
|
||||||
|
>>> effects = [
|
||||||
|
... ['gain', '-n'], # normalises to 0dB
|
||||||
|
... ['pitch', '5'], # 5 cent pitch shift
|
||||||
|
... ['rate', '8000'], # resample to 8000 Hz
|
||||||
|
... ]
|
||||||
|
>>>
|
||||||
|
>>> # Generate pseudo wave:
|
||||||
|
>>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
|
||||||
|
>>> sample_rate = 16000
|
||||||
|
>>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
|
||||||
|
>>> waveform.shape
|
||||||
|
paddle.Size([2, 16000])
|
||||||
|
>>> waveform
|
||||||
|
tensor([[ 0.3138, 0.7620, -0.9019, ..., -0.7495, -0.4935, 0.5442],
|
||||||
|
[-0.0832, 0.0061, 0.8233, ..., -0.5176, -0.9140, -0.2434]])
|
||||||
|
>>>
|
||||||
|
>>> # Apply effects
|
||||||
|
>>> waveform, sample_rate = apply_effects_tensor(
|
||||||
|
... waveform, sample_rate, effects, channels_first=True)
|
||||||
|
>>>
|
||||||
|
>>> # Check the result
|
||||||
|
>>> # The new waveform is sampling rate 8000, 1 second.
|
||||||
|
>>> # normalization and channel order are preserved
|
||||||
|
>>> waveform.shape
|
||||||
|
paddle.Size([2, 8000])
|
||||||
|
>>> waveform
|
||||||
|
tensor([[ 0.5054, -0.5518, -0.4800, ..., -0.0076, 0.0096, -0.0110],
|
||||||
|
[ 0.1331, 0.0436, -0.3783, ..., -0.0035, 0.0012, 0.0008]])
|
||||||
|
>>> sample_rate
|
||||||
|
8000
|
||||||
|
|
||||||
|
"""
|
||||||
|
tensor_np = tensor.numpy()
|
||||||
|
ret = paddleaudio._paddleaudio.sox_effects_apply_effects_tensor(tensor_np, sample_rate,
|
||||||
|
effects, channels_first)
|
||||||
|
if ret is not None:
|
||||||
|
return (paddle.to_tensor(ret[0]), ret[1])
|
||||||
|
raise RuntimeError("Failed to apply sox effect")
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
|
||||||
|
def apply_effects_file(
|
||||||
|
path: str,
|
||||||
|
effects: List[List[str]],
|
||||||
|
normalize: bool=True,
|
||||||
|
channels_first: bool=True,
|
||||||
|
format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
|
||||||
|
"""Apply sox effects to the audio file and load the resulting data as Tensor
|
||||||
|
|
||||||
|
Note:
|
||||||
|
This function works in the way very similar to ``sox`` command, however there are slight
|
||||||
|
differences. For example, ``sox`` command adds certain effects automatically (such as
|
||||||
|
``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given
|
||||||
|
effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate``
|
||||||
|
effect with desired sampling rate, because internally, ``speed`` effects only alter sampling
|
||||||
|
rate and leave samples untouched.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (path-like object or file-like object):
|
||||||
|
effects (List[List[str]]): List of effects.
|
||||||
|
normalize (bool, optional):
|
||||||
|
When ``True``, this function always return ``float32``, and sample values are
|
||||||
|
normalized to ``[-1.0, 1.0]``.
|
||||||
|
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
|
||||||
|
integer type. This argument has no effect for formats other
|
||||||
|
than integer WAV type.
|
||||||
|
channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
|
||||||
|
Otherwise, the returned Tensor's dimension is `[time, channel]`.
|
||||||
|
format (str or None, optional):
|
||||||
|
Override the format detection with the given format.
|
||||||
|
Providing the argument might help when libsox can not infer the format
|
||||||
|
from header or extension.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(Tensor, int): Resulting Tensor and sample rate.
|
||||||
|
If ``normalize=True``, the resulting Tensor is always ``float32`` type.
|
||||||
|
If ``normalize=False`` and the input audio file is of integer WAV file, then the
|
||||||
|
resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
|
||||||
|
If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
|
||||||
|
otherwise `[time, channel]`.
|
||||||
|
|
||||||
|
Example - Basic usage
|
||||||
|
>>>
|
||||||
|
>>> # Defines the effects to apply
|
||||||
|
>>> effects = [
|
||||||
|
... ['gain', '-n'], # normalises to 0dB
|
||||||
|
... ['pitch', '5'], # 5 cent pitch shift
|
||||||
|
... ['rate', '8000'], # resample to 8000 Hz
|
||||||
|
... ]
|
||||||
|
>>>
|
||||||
|
>>> # Apply effects and load data with channels_first=True
|
||||||
|
>>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
|
||||||
|
>>>
|
||||||
|
>>> # Check the result
|
||||||
|
>>> waveform.shape
|
||||||
|
paddle.Size([2, 8000])
|
||||||
|
>>> waveform
|
||||||
|
tensor([[ 5.1151e-03, 1.8073e-02, 2.2188e-02, ..., 1.0431e-07,
|
||||||
|
-1.4761e-07, 1.8114e-07],
|
||||||
|
[-2.6924e-03, 2.1860e-03, 1.0650e-02, ..., 6.4122e-07,
|
||||||
|
-5.6159e-07, 4.8103e-07]])
|
||||||
|
>>> sample_rate
|
||||||
|
8000
|
||||||
|
|
||||||
|
Example - Apply random speed perturbation to dataset
|
||||||
|
>>>
|
||||||
|
>>> # Load data from file, apply random speed perturbation
|
||||||
|
>>> class RandomPerturbationFile(paddle.io.Dataset):
|
||||||
|
... \"\"\"Given flist, apply random speed perturbation
|
||||||
|
...
|
||||||
|
... Suppose all the input files are at least one second long.
|
||||||
|
... \"\"\"
|
||||||
|
... def __init__(self, flist: List[str], sample_rate: int):
|
||||||
|
... super().__init__()
|
||||||
|
... self.flist = flist
|
||||||
|
... self.sample_rate = sample_rate
|
||||||
|
...
|
||||||
|
... def __getitem__(self, index):
|
||||||
|
... speed = 0.5 + 1.5 * random.random()
|
||||||
|
... effects = [
|
||||||
|
... ['gain', '-n', '-10'], # apply 10 db attenuation
|
||||||
|
... ['remix', '-'], # merge all the channels
|
||||||
|
... ['speed', f'{speed:.5f}'], # duration is now 0.5 ~ 2.0 seconds.
|
||||||
|
... ['rate', f'{self.sample_rate}'],
|
||||||
|
... ['pad', '0', '1.5'], # add 1.5 seconds silence at the end
|
||||||
|
... ['trim', '0', '2'], # get the first 2 seconds
|
||||||
|
... ]
|
||||||
|
... waveform, _ = paddleaudio.sox_effects.apply_effects_file(
|
||||||
|
... self.flist[index], effects)
|
||||||
|
... return waveform
|
||||||
|
...
|
||||||
|
... def __len__(self):
|
||||||
|
... return len(self.flist)
|
||||||
|
...
|
||||||
|
>>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
|
||||||
|
>>> loader = paddle.io.DataLoader(dataset, batch_size=32)
|
||||||
|
>>> for batch in loader:
|
||||||
|
>>> pass
|
||||||
|
"""
|
||||||
|
if hasattr(path, "read"):
|
||||||
|
ret = paddleaudio._paddleaudio.apply_effects_fileobj(path, effects, normalize,
|
||||||
|
channels_first, format)
|
||||||
|
if ret is None:
|
||||||
|
raise RuntimeError("Failed to load audio from {}".format(path))
|
||||||
|
return (paddle.to_tensor(ret[0]), ret[1])
|
||||||
|
path = os.fspath(path)
|
||||||
|
ret = paddleaudio._paddleaudio.sox_effects_apply_effects_file(path, effects, normalize,
|
||||||
|
channels_first, format)
|
||||||
|
if ret is not None:
|
||||||
|
return (paddle.to_tensor(ret[0]), ret[1])
|
||||||
|
raise RuntimeError("Failed to load audio from {}".format(path))
|
@ -0,0 +1,217 @@
|
|||||||
|
if (MSVC)
|
||||||
|
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(APPLE)
|
||||||
|
set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
|
||||||
|
endif(APPLE)
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# libpaddleaudio
|
||||||
|
################################################################################
|
||||||
|
set(
|
||||||
|
LIBPADDLEAUDIO_SOURCES
|
||||||
|
utils.cpp
|
||||||
|
)
|
||||||
|
|
||||||
|
set(
|
||||||
|
LIBPADDLEAUDIO_INCLUDE_DIRS
|
||||||
|
${PROJECT_SOURCE_DIR}
|
||||||
|
)
|
||||||
|
|
||||||
|
set(
|
||||||
|
LIBPADDLEAUDIO_LINK_LIBRARIES
|
||||||
|
)
|
||||||
|
|
||||||
|
set(
|
||||||
|
LIBPADDLEAUDIO_COMPILE_DEFINITIONS)
|
||||||
|
|
||||||
|
#------------------------------------------------------------------------------#
|
||||||
|
# START OF CUSTOMIZATION LOGICS
|
||||||
|
#------------------------------------------------------------------------------#
|
||||||
|
|
||||||
|
if(BUILD_SOX)
|
||||||
|
list(
|
||||||
|
APPEND
|
||||||
|
LIBPADDLEAUDIO_LINK_LIBRARIES
|
||||||
|
libsox
|
||||||
|
)
|
||||||
|
list(
|
||||||
|
APPEND
|
||||||
|
LIBPADDLEAUDIO_SOURCES
|
||||||
|
)
|
||||||
|
list(
|
||||||
|
APPEND
|
||||||
|
LIBPADDLEAUDIO_COMPILE_DEFINITIONS
|
||||||
|
INCLUDE_SOX
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
if(BUILD_KALDI)
|
||||||
|
list(
|
||||||
|
APPEND
|
||||||
|
LIBPADDLEAUDIO_LINK_LIBRARIES
|
||||||
|
libkaldi
|
||||||
|
)
|
||||||
|
list(
|
||||||
|
APPEND
|
||||||
|
LIBPADDLEAUDIO_COMPILE_DEFINITIONS
|
||||||
|
INCLUDE_KALDI
|
||||||
|
COMPILE_WITHOUT_OPENFST
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#------------------------------------------------------------------------------#
|
||||||
|
# END OF CUSTOMIZATION LOGICS
|
||||||
|
#------------------------------------------------------------------------------#
|
||||||
|
|
||||||
|
# define_library(<name> <source> <include_dirs> <link_libraries> <compile_defs>)
#
# Creates SHARED library target <name> from the <source> list, with private
# include directories and compile definitions, linked against <link_libraries>.
# The output file name has no "lib" prefix added; under MSVC the suffix is
# ".pyd". The target is installed into "lib".
# List arguments are passed as single quoted strings and expanded here.
function (define_library name source include_dirs link_libraries compile_defs)
  add_library(${name} SHARED ${source})
  target_compile_definitions(${name} PRIVATE ${compile_defs})
  target_include_directories(${name} PRIVATE ${include_dirs})
  # Keyword-less signature kept as-is so link behaviour does not change.
  target_link_libraries(${name} ${link_libraries})

  if (MSVC)
    set_target_properties(${name} PROPERTIES PREFIX "" SUFFIX ".pyd")
  else()
    set_target_properties(${name} PROPERTIES PREFIX "")
  endif()

  install(
    TARGETS ${name}
    LIBRARY DESTINATION lib
    RUNTIME DESTINATION lib # For Windows
  )
endfunction()
|
||||||
|
|
||||||
|
|
||||||
|
define_library(
|
||||||
|
libpaddleaudio
|
||||||
|
"${LIBPADDLEAUDIO_SOURCES}"
|
||||||
|
"${LIBPADDLEAUDIO_INCLUDE_DIRS}"
|
||||||
|
"${LIBPADDLEAUDIO_LINK_LIBRARIES}"
|
||||||
|
"${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if (APPLE)
  # After linking, rewrite the recorded libgcc_s install name so it is looked
  # up next to the built library (@loader_path) instead of under
  # GFORTRAN_LIBRARIES_DIR. $<TARGET_FILE:...> names the real artefact, so
  # this does not depend on the working directory or the output file name.
  add_custom_command(
    TARGET libpaddleaudio POST_BUILD
    COMMAND install_name_tool -change
            "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib"
            "@loader_path/libgcc_s.1.1.dylib"
            "$<TARGET_FILE:libpaddleaudio>"
    VERBATIM
  )
endif()
|
||||||
|
|
||||||
|
if (UNIX AND NOT APPLE)
|
||||||
|
set_target_properties(libpaddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (APPLE)
|
||||||
|
set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
|
||||||
|
else()
|
||||||
|
set(AUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# _paddleaudio.so
|
||||||
|
################################################################################
|
||||||
|
if (BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
|
||||||
|
if (WIN32)
  # On Windows the extension links against the Python import library.
  # REQUIRED makes a missing Python development package fail here, at
  # configure time, rather than later when Python3::Python cannot be resolved.
  find_package(Python3 ${PYTHON_VERSION} EXACT REQUIRED COMPONENTS Development)
  set(ADDITIONAL_ITEMS Python3::Python)
endif()
|
||||||
|
function(define_extension name sources include_dirs libraries definitions)
|
||||||
|
add_library(${name} SHARED ${sources})
|
||||||
|
target_compile_definitions(${name} PRIVATE "${definitions}")
|
||||||
|
target_include_directories(
|
||||||
|
${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${pybind11_INCLUDE_DIR} ${include_dirs})
|
||||||
|
target_link_libraries(
|
||||||
|
${name}
|
||||||
|
${libraries}
|
||||||
|
${PYTHON_LIBRARY}
|
||||||
|
${ADDITIONAL_ITEMS}
|
||||||
|
)
|
||||||
|
set_target_properties(${name} PROPERTIES PREFIX "")
|
||||||
|
if (MSVC)
|
||||||
|
set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
|
||||||
|
endif(MSVC)
|
||||||
|
if (APPLE)
|
||||||
|
# https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
|
||||||
|
# https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
|
||||||
|
set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
|
||||||
|
endif()
|
||||||
|
install(
|
||||||
|
TARGETS ${name}
|
||||||
|
LIBRARY DESTINATION .
|
||||||
|
RUNTIME DESTINATION . # For Windows
|
||||||
|
)
|
||||||
|
endfunction()
|
||||||
|
|
||||||
|
set(
|
||||||
|
EXTENSION_SOURCES
|
||||||
|
pybind/pybind.cpp
|
||||||
|
)
|
||||||
|
#----------------------------------------------------------------------------#
|
||||||
|
# START OF CUSTOMIZATION LOGICS
|
||||||
|
#----------------------------------------------------------------------------#
|
||||||
|
if(BUILD_SOX)
|
||||||
|
list(
|
||||||
|
APPEND
|
||||||
|
EXTENSION_SOURCES
|
||||||
|
pybind/sox/effects.cpp
|
||||||
|
pybind/sox/effects_chain.cpp
|
||||||
|
pybind/sox/io.cpp
|
||||||
|
pybind/sox/types.cpp
|
||||||
|
pybind/sox/utils.cpp
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(BUILD_KALDI)
|
||||||
|
list(
|
||||||
|
APPEND
|
||||||
|
EXTENSION_SOURCES
|
||||||
|
pybind/kaldi/kaldi_feature_wrapper.cc
|
||||||
|
pybind/kaldi/kaldi_feature.cc
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
#----------------------------------------------------------------------------#
|
||||||
|
# END OF CUSTOMIZATION LOGICS
|
||||||
|
#----------------------------------------------------------------------------#
|
||||||
|
define_extension(
|
||||||
|
_paddleaudio
|
||||||
|
"${EXTENSION_SOURCES}"
|
||||||
|
""
|
||||||
|
libpaddleaudio
|
||||||
|
"${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
|
||||||
|
)
|
||||||
|
# if(BUILD_CTC_DECODER)
|
||||||
|
# set(
|
||||||
|
# DECODER_EXTENSION_SOURCES
|
||||||
|
# decoder/bindings/pybind.cpp
|
||||||
|
# )
|
||||||
|
# define_extension(
|
||||||
|
# _paddleaudio_decoder
|
||||||
|
# "${DECODER_EXTENSION_SOURCES}"
|
||||||
|
# ""
|
||||||
|
# "libpaddleaudio_decoder"
|
||||||
|
# "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
|
||||||
|
# )
|
||||||
|
# endif()
|
||||||
|
# if(USE_FFMPEG)
|
||||||
|
# set(
|
||||||
|
# FFMPEG_EXTENSION_SOURCES
|
||||||
|
# ffmpeg/pybind/typedefs.cpp
|
||||||
|
# ffmpeg/pybind/pybind.cpp
|
||||||
|
# ffmpeg/pybind/stream_reader.cpp
|
||||||
|
# )
|
||||||
|
# define_extension(
|
||||||
|
# _paddleaudio_ffmpeg
|
||||||
|
# "${FFMPEG_EXTENSION_SOURCES}"
|
||||||
|
# "${FFMPEG_INCLUDE_DIRS}"
|
||||||
|
# "libpaddleaudio_ffmpeg"
|
||||||
|
# "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
|
||||||
|
# )
|
||||||
|
# endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# _paddleaudio only exists when BUILD_PADDLEAUDIO_PYTHON_EXTENSION is ON, so
# check for the target before attaching a post-build step to it.
if (APPLE AND TARGET _paddleaudio)
  # Rewrite the recorded libgcc_s install name so it is looked up in the
  # "lib" directory next to the extension module.
  add_custom_command(
    TARGET _paddleaudio POST_BUILD
    COMMAND install_name_tool -change
            "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib"
            "@loader_path/lib/libgcc_s.1.1.dylib"
            "$<TARGET_FILE:_paddleaudio>"
    VERBATIM
  )
endif()
|
||||||
|
|
||||||
|
# _paddleaudio only exists when BUILD_PADDLEAUDIO_PYTHON_EXTENSION is ON, so
# check for the target before setting properties on it.
if (UNIX AND NOT APPLE AND TARGET _paddleaudio)
  # Installed extension resolves shared libraries from its sibling "lib" directory.
  set_target_properties(_paddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN/lib")
endif()
|
@ -0,0 +1,121 @@
|
|||||||
|
Creative Commons Legal Code
|
||||||
|
|
||||||
|
CC0 1.0 Universal
|
||||||
|
|
||||||
|
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
|
||||||
|
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
|
||||||
|
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
|
||||||
|
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
|
||||||
|
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
|
||||||
|
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
|
||||||
|
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
|
||||||
|
HEREUNDER.
|
||||||
|
|
||||||
|
Statement of Purpose
|
||||||
|
|
||||||
|
The laws of most jurisdictions throughout the world automatically confer
|
||||||
|
exclusive Copyright and Related Rights (defined below) upon the creator
|
||||||
|
and subsequent owner(s) (each and all, an "owner") of an original work of
|
||||||
|
authorship and/or a database (each, a "Work").
|
||||||
|
|
||||||
|
Certain owners wish to permanently relinquish those rights to a Work for
|
||||||
|
the purpose of contributing to a commons of creative, cultural and
|
||||||
|
scientific works ("Commons") that the public can reliably and without fear
|
||||||
|
of later claims of infringement build upon, modify, incorporate in other
|
||||||
|
works, reuse and redistribute as freely as possible in any form whatsoever
|
||||||
|
and for any purposes, including without limitation commercial purposes.
|
||||||
|
These owners may contribute to the Commons to promote the ideal of a free
|
||||||
|
culture and the further production of creative, cultural and scientific
|
||||||
|
works, or to gain reputation or greater distribution for their Work in
|
||||||
|
part through the use and efforts of others.
|
||||||
|
|
||||||
|
For these and/or other purposes and motivations, and without any
|
||||||
|
expectation of additional consideration or compensation, the person
|
||||||
|
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
|
||||||
|
is an owner of Copyright and Related Rights in the Work, voluntarily
|
||||||
|
elects to apply CC0 to the Work and publicly distribute the Work under its
|
||||||
|
terms, with knowledge of his or her Copyright and Related Rights in the
|
||||||
|
Work and the meaning and intended legal effect of CC0 on those rights.
|
||||||
|
|
||||||
|
1. Copyright and Related Rights. A Work made available under CC0 may be
|
||||||
|
protected by copyright and related or neighboring rights ("Copyright and
|
||||||
|
Related Rights"). Copyright and Related Rights include, but are not
|
||||||
|
limited to, the following:
|
||||||
|
|
||||||
|
i. the right to reproduce, adapt, distribute, perform, display,
|
||||||
|
communicate, and translate a Work;
|
||||||
|
ii. moral rights retained by the original author(s) and/or performer(s);
|
||||||
|
iii. publicity and privacy rights pertaining to a person's image or
|
||||||
|
likeness depicted in a Work;
|
||||||
|
iv. rights protecting against unfair competition in regards to a Work,
|
||||||
|
subject to the limitations in paragraph 4(a), below;
|
||||||
|
v. rights protecting the extraction, dissemination, use and reuse of data
|
||||||
|
in a Work;
|
||||||
|
vi. database rights (such as those arising under Directive 96/9/EC of the
|
||||||
|
European Parliament and of the Council of 11 March 1996 on the legal
|
||||||
|
protection of databases, and under any national implementation
|
||||||
|
thereof, including any amended or successor version of such
|
||||||
|
directive); and
|
||||||
|
vii. other similar, equivalent or corresponding rights throughout the
|
||||||
|
world based on applicable law or treaty, and any national
|
||||||
|
implementations thereof.
|
||||||
|
|
||||||
|
2. Waiver. To the greatest extent permitted by, but not in contravention
|
||||||
|
of, applicable law, Affirmer hereby overtly, fully, permanently,
|
||||||
|
irrevocably and unconditionally waives, abandons, and surrenders all of
|
||||||
|
Affirmer's Copyright and Related Rights and associated claims and causes
|
||||||
|
of action, whether now known or unknown (including existing as well as
|
||||||
|
future claims and causes of action), in the Work (i) in all territories
|
||||||
|
worldwide, (ii) for the maximum duration provided by applicable law or
|
||||||
|
treaty (including future time extensions), (iii) in any current or future
|
||||||
|
medium and for any number of copies, and (iv) for any purpose whatsoever,
|
||||||
|
including without limitation commercial, advertising or promotional
|
||||||
|
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
|
||||||
|
member of the public at large and to the detriment of Affirmer's heirs and
|
||||||
|
successors, fully intending that such Waiver shall not be subject to
|
||||||
|
revocation, rescission, cancellation, termination, or any other legal or
|
||||||
|
equitable action to disrupt the quiet enjoyment of the Work by the public
|
||||||
|
as contemplated by Affirmer's express Statement of Purpose.
|
||||||
|
|
||||||
|
3. Public License Fallback. Should any part of the Waiver for any reason
|
||||||
|
be judged legally invalid or ineffective under applicable law, then the
|
||||||
|
Waiver shall be preserved to the maximum extent permitted taking into
|
||||||
|
account Affirmer's express Statement of Purpose. In addition, to the
|
||||||
|
extent the Waiver is so judged Affirmer hereby grants to each affected
|
||||||
|
person a royalty-free, non transferable, non sublicensable, non exclusive,
|
||||||
|
irrevocable and unconditional license to exercise Affirmer's Copyright and
|
||||||
|
Related Rights in the Work (i) in all territories worldwide, (ii) for the
|
||||||
|
maximum duration provided by applicable law or treaty (including future
|
||||||
|
time extensions), (iii) in any current or future medium and for any number
|
||||||
|
of copies, and (iv) for any purpose whatsoever, including without
|
||||||
|
limitation commercial, advertising or promotional purposes (the
|
||||||
|
"License"). The License shall be deemed effective as of the date CC0 was
|
||||||
|
applied by Affirmer to the Work. Should any part of the License for any
|
||||||
|
reason be judged legally invalid or ineffective under applicable law, such
|
||||||
|
partial invalidity or ineffectiveness shall not invalidate the remainder
|
||||||
|
of the License, and in such case Affirmer hereby affirms that he or she
|
||||||
|
will not (i) exercise any of his or her remaining Copyright and Related
|
||||||
|
Rights in the Work or (ii) assert any associated claims and causes of
|
||||||
|
action with respect to the Work, in either case contrary to Affirmer's
|
||||||
|
express Statement of Purpose.
|
||||||
|
|
||||||
|
4. Limitations and Disclaimers.
|
||||||
|
|
||||||
|
a. No trademark or patent rights held by Affirmer are waived, abandoned,
|
||||||
|
surrendered, licensed or otherwise affected by this document.
|
||||||
|
b. Affirmer offers the Work as-is and makes no representations or
|
||||||
|
warranties of any kind concerning the Work, express, implied,
|
||||||
|
statutory or otherwise, including without limitation warranties of
|
||||||
|
title, merchantability, fitness for a particular purpose, non
|
||||||
|
infringement, or the absence of latent or other defects, accuracy, or
|
||||||
|
the present or absence of errors, whether or not discoverable, all to
|
||||||
|
the greatest extent permissible under applicable law.
|
||||||
|
c. Affirmer disclaims responsibility for clearing rights of other persons
|
||||||
|
that may apply to the Work or any use thereof, including without
|
||||||
|
limitation any person's Copyright and Related Rights in the Work.
|
||||||
|
Further, Affirmer disclaims responsibility for obtaining any necessary
|
||||||
|
consents, permissions or other rights required for any use of the
|
||||||
|
Work.
|
||||||
|
d. Affirmer understands and acknowledges that Creative Commons is not a
|
||||||
|
party to this document and has no duty or obligation with respect to
|
||||||
|
this CC0 or use of the Work.
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,49 @@
|
|||||||
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "pybind11/pybind11.h"
|
||||||
|
#include "pybind11/numpy.h"
|
||||||
|
#include "feat/feature-window.h"
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace kaldi {
|
||||||
|
|
||||||
|
namespace py = pybind11;
|
||||||
|
|
||||||
|
template <class F>
|
||||||
|
class StreamingFeatureTpl {
|
||||||
|
public:
|
||||||
|
typedef typename F::Options Options;
|
||||||
|
StreamingFeatureTpl(const Options& opts);
|
||||||
|
bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
|
||||||
|
::kaldi::Vector<::kaldi::BaseFloat>* feats);
|
||||||
|
void Reset() { remained_wav_.Resize(0); }
|
||||||
|
|
||||||
|
int Dim() { return computer_.Dim(); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
|
||||||
|
::kaldi::Vector<::kaldi::BaseFloat>* feats);
|
||||||
|
Options opts_;
|
||||||
|
::kaldi::FeatureWindowFunction window_function_;
|
||||||
|
::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
|
||||||
|
F computer_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace kaldi
|
||||||
|
} // namespace paddleaudio
|
||||||
|
|
||||||
|
#include "feature_common_inl.h"
|
@ -0,0 +1,93 @@
|
|||||||
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "base/kaldi-common.h"
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace kaldi {
|
||||||
|
|
||||||
|
template <class F>
|
||||||
|
StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
|
||||||
|
: opts_(opts), computer_(opts), window_function_(opts.frame_opts) {
|
||||||
|
// window_function_(computer_.GetFrameOptions()) { the opt set to zero
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class F>
|
||||||
|
bool StreamingFeatureTpl<F>::ComputeFeature(
|
||||||
|
const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
|
||||||
|
::kaldi::Vector<::kaldi::BaseFloat>* feats) {
|
||||||
|
// append remaned waves
|
||||||
|
::kaldi::int32 wav_len = wav.Dim();
|
||||||
|
if (wav_len == 0) return false;
|
||||||
|
::kaldi::int32 left_len = remained_wav_.Dim();
|
||||||
|
::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
|
||||||
|
waves.Range(0, left_len).CopyFromVec(remained_wav_);
|
||||||
|
waves.Range(left_len, wav_len).CopyFromVec(wav);
|
||||||
|
|
||||||
|
// cache remaned waves
|
||||||
|
::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
|
||||||
|
::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
|
||||||
|
::kaldi::int32 frame_shift = frame_opts.WindowShift();
|
||||||
|
::kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames;
|
||||||
|
remained_wav_.Resize(left_samples);
|
||||||
|
remained_wav_.CopyFromVec(
|
||||||
|
waves.Range(frame_shift * num_frames, left_samples));
|
||||||
|
|
||||||
|
// compute speech feature
|
||||||
|
Compute(waves, feats);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute feat
|
||||||
|
template <class F>
|
||||||
|
bool StreamingFeatureTpl<F>::Compute(
|
||||||
|
const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
|
||||||
|
::kaldi::Vector<::kaldi::BaseFloat>* feats) {
|
||||||
|
::kaldi::BaseFloat vtln_warp = 1.0;
|
||||||
|
const ::kaldi::FrameExtractionOptions& frame_opts =
|
||||||
|
computer_.GetFrameOptions();
|
||||||
|
::kaldi::int32 num_samples = waves.Dim();
|
||||||
|
::kaldi::int32 frame_length = frame_opts.WindowSize();
|
||||||
|
::kaldi::int32 sample_rate = frame_opts.samp_freq;
|
||||||
|
if (num_samples < frame_length) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts);
|
||||||
|
feats->Resize(num_frames * Dim());
|
||||||
|
|
||||||
|
::kaldi::Vector<::kaldi::BaseFloat> window;
|
||||||
|
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
|
||||||
|
for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
|
||||||
|
::kaldi::BaseFloat raw_log_energy = 0.0;
|
||||||
|
::kaldi::ExtractWindow(0,
|
||||||
|
waves,
|
||||||
|
frame,
|
||||||
|
frame_opts,
|
||||||
|
window_function_,
|
||||||
|
&window,
|
||||||
|
need_raw_log_energy ? &raw_log_energy : NULL);
|
||||||
|
|
||||||
|
::kaldi::Vector<::kaldi::BaseFloat> this_feature(computer_.Dim(),
|
||||||
|
::kaldi::kUndefined);
|
||||||
|
computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
|
||||||
|
::kaldi::SubVector<::kaldi::BaseFloat> output_row(
|
||||||
|
feats->Data() + frame * Dim(), Dim());
|
||||||
|
output_row.CopyFromVec(this_feature);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace kaldi
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,75 @@
|
|||||||
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
|
||||||
|
#include "feat/pitch-functions.h"
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace kaldi {
|
||||||
|
|
||||||
|
// Assembles a ::kaldi::FbankOptions from the three option groups and
// (re)creates the fbank extractor held by the KaldiFeatureWrapper singleton.
// Always returns true.
bool InitFbank(::kaldi::FrameExtractionOptions frame_opts,
               ::kaldi::MelBanksOptions mel_opts,
               FbankOptions fbank_opts) {
    ::kaldi::FbankOptions kaldi_opts;

    // Option groups taken over unchanged.
    kaldi_opts.frame_opts = frame_opts;
    kaldi_opts.mel_opts = mel_opts;

    // Scalar fbank settings mirrored field by field.
    kaldi_opts.use_energy = fbank_opts.use_energy;
    kaldi_opts.energy_floor = fbank_opts.energy_floor;
    kaldi_opts.raw_energy = fbank_opts.raw_energy;
    kaldi_opts.htk_compat = fbank_opts.htk_compat;
    kaldi_opts.use_log_fbank = fbank_opts.use_log_fbank;
    kaldi_opts.use_power = fbank_opts.use_power;

    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    wrapper->InitFbank(kaldi_opts);
    return true;
}
|
||||||
|
|
||||||
|
// Feeds one chunk of samples to the singleton fbank extractor and returns
// whatever features it produces for that chunk.
py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    return wrapper->ComputeFbank(wav);
}
|
||||||
|
|
||||||
|
// One-shot fbank computation: (re)initialises the singleton extractor with
// the given options, runs it over `wav`, then clears the extractor's cached
// samples so the next call starts fresh.
py::array_t<float> ComputeFbank(::kaldi::FrameExtractionOptions frame_opts,
                                ::kaldi::MelBanksOptions mel_opts,
                                FbankOptions fbank_opts,
                                const py::array_t<float>& wav) {
    InitFbank(frame_opts, mel_opts, fbank_opts);
    py::array_t<float> features = ComputeFbankStreaming(wav);

    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    wrapper->ResetFbank();
    return features;
}
|
||||||
|
|
||||||
|
// Clears the samples cached by the singleton fbank extractor.
void ResetFbank() {
    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    wrapper->ResetFbank();
}
|
||||||
|
|
||||||
|
// Runs ::kaldi::ComputeKaldiPitch over `wav` and returns the resulting
// feature matrix as a 2-D float array of shape (NumRows, NumCols).
// NOTE(review): the buffer of `wav` is read as `info.size` consecutive
// floats -- presumably callers pass a contiguous 1-D array; confirm.
py::array_t<float> ComputeKaldiPitch(
    const ::kaldi::PitchExtractionOptions& opts,
    const py::array_t<float>& wav) {
    // View the numpy buffer as a Kaldi vector without copying.
    py::buffer_info wav_info = wav.request();
    ::kaldi::SubVector<::kaldi::BaseFloat> samples(
        static_cast<float*>(wav_info.ptr), wav_info.size);

    ::kaldi::Matrix<::kaldi::BaseFloat> pitch;
    ::kaldi::ComputeKaldiPitch(opts, samples, &pitch);

    // Copy the matrix out one row at a time.
    auto result = py::array_t<float>({pitch.NumRows(), pitch.NumCols()});
    for (int r = 0; r < pitch.NumRows(); ++r) {
        float* dst = result.mutable_data(r);
        const float* src = pitch.Row(r).Data();
        std::memcpy(dst, src, sizeof(float) * pitch.NumCols());
    }
    return result;
}
|
||||||
|
|
||||||
|
} // namespace kaldi
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,64 @@
|
|||||||
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <pybind11/numpy.h>
|
||||||
|
#include <pybind11/pybind11.h>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
|
||||||
|
#include "feat/pitch-functions.h"
|
||||||
|
|
||||||
|
namespace py = pybind11;
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace kaldi {
|
||||||
|
|
||||||
|
// Plain settings struct exposed to Python; InitFbank() copies each field
// into the ::kaldi::FbankOptions field of the same name.
struct FbankOptions {
    // Append an extra dimension with energy to the filter banks.
    bool use_energy = false;
    float energy_floor = 0.0;
    // If true, compute energy before preemphasis and windowing.
    bool raw_energy = true;
    // If true, put energy last (if using energy).
    bool htk_compat = false;
    // If true (default), produce log-filterbank, else linear.
    bool use_log_fbank = true;
    bool use_power = true;

    FbankOptions() {}
};
|
||||||
|
|
||||||
|
// (Re)creates the fbank extractor held by the KaldiFeatureWrapper singleton
// from the given option groups. Always returns true.
bool InitFbank(
    ::kaldi::FrameExtractionOptions frame_opts,
    ::kaldi::MelBanksOptions mel_opts,
    FbankOptions fbank_opts);

// One-shot computation: calls InitFbank() with the given options, computes
// the features of `wav`, then resets the extractor's cached samples.
py::array_t<float> ComputeFbank(
    ::kaldi::FrameExtractionOptions frame_opts,
    ::kaldi::MelBanksOptions mel_opts,
    FbankOptions fbank_opts,
    const py::array_t<float>& wav);

// Feeds one chunk of samples to the already initialised singleton extractor.
// InitFbank() must have been called before.
py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);

// Clears the samples cached by the singleton extractor.
void ResetFbank();

// Runs ::kaldi::ComputeKaldiPitch over `wav` and returns the resulting
// feature matrix as a 2-D float array.
py::array_t<float> ComputeKaldiPitch(
    const ::kaldi::PitchExtractionOptions& opts,
    const py::array_t<float>& wav);
|
||||||
|
|
||||||
|
} // namespace kaldi
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,51 @@
|
|||||||
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace kaldi {
|
||||||
|
|
||||||
|
// Returns the process-wide wrapper, a function-local static that is
// constructed on the first call.
KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
    static KaldiFeatureWrapper singleton;
    return &singleton;
}
|
||||||
|
|
||||||
|
// Replaces the current fbank extractor (if any) with a new one built from
// `opts`. Always returns true.
bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
    fbank_ = std::unique_ptr<Fbank>(new Fbank(opts));
    return true;
}
|
||||||
|
|
||||||
|
// Feeds `wav` to the fbank extractor and returns the features as a 2-D float
// array of shape (feats.Dim() / Dim(), Dim()), i.e. one row per frame.
// Returns an empty array when the extractor produced no features.
//
// InitFbank() must have been called before; `fbank_` is dereferenced without
// a null check.
// NOTE(review): the buffer of `wav` is read as `info.size` consecutive
// floats -- presumably callers pass a contiguous 1-D array; confirm.
py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
    const py::array_t<float> wav) {
    // View the numpy buffer as a Kaldi vector without copying.
    py::buffer_info info = wav.request();
    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav(
        static_cast<float*>(info.ptr), info.size);

    ::kaldi::Vector<::kaldi::BaseFloat> feats;
    bool flag = fbank_->ComputeFeature(input_wav, &feats);
    if (flag == false || feats.Dim() == 0) return py::array_t<float>();

    // Copy the flat feature vector into a freshly allocated numpy array.
    // (A leftover debug `std::cout << std::endl;` used to sit here and wrote
    // a blank line to stdout on every call.)
    auto result = py::array_t<float>(feats.Dim());
    py::buffer_info xs = result.request();
    float* res_ptr = static_cast<float*>(xs.ptr);
    for (int idx = 0; idx < feats.Dim(); ++idx) {
        *res_ptr = feats(idx);
        res_ptr++;
    }

    return result.reshape({feats.Dim() / Dim(), Dim()});
}
|
||||||
|
|
||||||
|
} // namespace kaldi
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,40 @@
|
|||||||
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "base/kaldi-common.h"
|
||||||
|
#include "feat/feature-fbank.h"
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/kaldi/feature_common.h"
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace kaldi {
|
||||||
|
|
||||||
|
typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
|
||||||
|
|
||||||
|
class KaldiFeatureWrapper {
|
||||||
|
public:
|
||||||
|
static KaldiFeatureWrapper* GetInstance();
|
||||||
|
bool InitFbank(::kaldi::FbankOptions opts);
|
||||||
|
py::array_t<float> ComputeFbank(const py::array_t<float> wav);
|
||||||
|
int Dim() { return fbank_->Dim(); }
|
||||||
|
void ResetFbank() { fbank_->Reset(); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::unique_ptr<paddleaudio::kaldi::Fbank> fbank_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace kaldi
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,148 @@
|
|||||||
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
|
||||||
|
#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
|
||||||
|
|
||||||
|
#ifdef INCLUDE_SOX
|
||||||
|
#include "paddleaudio/src/pybind/sox/io.h"
|
||||||
|
#include "paddleaudio/src/pybind/sox/effects.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <pybind11/stl.h>
|
||||||
|
#include <pybind11/pybind11.h>
|
||||||
|
|
||||||
|
// `tl::optional`
// Registers a pybind11 type caster for `tl::optional<T>` by reusing
// pybind11's generic `optional_caster`. Only compiled together with the sox
// bindings (INCLUDE_SOX).
#ifdef INCLUDE_SOX
namespace pybind11 { namespace detail {
template <typename T>
struct type_caster<tl::optional<T>> : optional_caster<tl::optional<T>> {};
}}
#endif
|
||||||
|
|
||||||
|
// Definition of the `_paddleaudio` Python extension module.
// The sox bindings are registered only when INCLUDE_SOX is defined and the
// Kaldi bindings only when INCLUDE_KALDI is defined.
PYBIND11_MODULE(_paddleaudio, m) {
#ifdef INCLUDE_SOX
    // File / file-object I/O entry points.
    m.def("get_info_file",
          &paddleaudio::sox_io::get_info_file,
          "Get metadata of audio file.");
    // support obj later
    m.def("get_info_fileobj",
          &paddleaudio::sox_io::get_info_fileobj,
          "Get metadata of audio in file object.");
    m.def("load_audio_fileobj",
          &paddleaudio::sox_io::load_audio_fileobj,
          "Load audio from file object.");
    m.def("save_audio_fileobj",
          &paddleaudio::sox_io::save_audio_fileobj,
          "Save audio to file obj.");

    // sox io
    // (`sox_io_get_info` is bound to the same function as `get_info_file`.)
    m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file);
    m.def(
        "sox_io_load_audio_file",
        &paddleaudio::sox_io::load_audio_file);
    m.def(
        "sox_io_save_audio_file",
        &paddleaudio::sox_io::save_audio_file);

    // sox utils
    m.def("sox_utils_set_seed", &paddleaudio::sox_utils::set_seed);
    m.def(
        "sox_utils_set_verbosity",
        &paddleaudio::sox_utils::set_verbosity);
    m.def(
        "sox_utils_set_use_threads",
        &paddleaudio::sox_utils::set_use_threads);
    m.def(
        "sox_utils_set_buffer_size",
        &paddleaudio::sox_utils::set_buffer_size);
    m.def(
        "sox_utils_list_effects",
        &paddleaudio::sox_utils::list_effects);
    m.def(
        "sox_utils_list_read_formats",
        &paddleaudio::sox_utils::list_read_formats);
    m.def(
        "sox_utils_list_write_formats",
        &paddleaudio::sox_utils::list_write_formats);
    m.def(
        "sox_utils_get_buffer_size",
        &paddleaudio::sox_utils::get_buffer_size);

    // effect
    m.def("apply_effects_fileobj",
          &paddleaudio::sox_effects::apply_effects_fileobj,
          "Decode audio data from file-like obj and apply effects.");
    m.def("sox_effects_initialize_sox_effects",
          &paddleaudio::sox_effects::initialize_sox_effects);
    m.def(
        "sox_effects_shutdown_sox_effects",
        &paddleaudio::sox_effects::shutdown_sox_effects);
    m.def(
        "sox_effects_apply_effects_tensor",
        &paddleaudio::sox_effects::apply_effects_tensor);
    m.def(
        "sox_effects_apply_effects_file",
        &paddleaudio::sox_effects::apply_effects_file);
#endif

#ifdef INCLUDE_KALDI
    // Kaldi feature functions and the option structs they take. Every option
    // struct is default-constructible from Python and exposes its fields as
    // read/write attributes.
    m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
    py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
        .def(py::init<>())
        .def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
        .def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
        .def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
        .def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
        .def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
        .def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
        .def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
        .def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
        .def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
        .def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
        .def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
        .def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
        .def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
        .def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
        .def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
        .def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
        .def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
        .def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
        .def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
        .def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
    m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
    py::class_<kaldi::FrameExtractionOptions>(m, "FrameExtractionOptions")
        .def(py::init<>())
        .def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
        .def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)
        .def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
        .def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)
        .def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)
        .def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)
        .def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
        .def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)
        .def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)
        .def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
        .def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
        .def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
        .def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
    py::class_<kaldi::MelBanksOptions>(m, "MelBanksOptions")
        .def(py::init<>())
        .def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
        .def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
        .def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
        .def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
        .def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
        .def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
        .def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);

    // paddleaudio's own fbank settings struct (not the ::kaldi one).
    py::class_<paddleaudio::kaldi::FbankOptions>(m, "FbankOptions")
        .def(py::init<>())
        .def_readwrite("use_energy", &paddleaudio::kaldi::FbankOptions::use_energy)
        .def_readwrite("energy_floor", &paddleaudio::kaldi::FbankOptions::energy_floor)
        .def_readwrite("raw_energy", &paddleaudio::kaldi::FbankOptions::raw_energy)
        .def_readwrite("htk_compat", &paddleaudio::kaldi::FbankOptions::htk_compat)
        .def_readwrite("use_log_fbank", &paddleaudio::kaldi::FbankOptions::use_log_fbank)
        .def_readwrite("use_power", &paddleaudio::kaldi::FbankOptions::use_power);
#endif

}
|
@ -0,0 +1,259 @@
|
|||||||
|
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp with modification.
|
||||||
|
|
||||||
|
#include <mutex>
|
||||||
|
#include <sox.h>
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/sox/effects.h"
|
||||||
|
#include "paddleaudio/src/pybind/sox/effects_chain.h"
|
||||||
|
#include "paddleaudio/src/pybind/sox/utils.h"
|
||||||
|
|
||||||
|
using namespace paddleaudio::sox_utils;
|
||||||
|
|
||||||
|
namespace paddleaudio::sox_effects {
|
||||||
|
|
||||||
|
// Streaming decoding over file-like object is tricky because libsox operates on
|
||||||
|
// FILE pointer. The following is what `sox` and `play` commands do
|
||||||
|
// - file input -> FILE pointer
|
||||||
|
// - URL input -> call wget in subprocess and pipe the data -> FILE pointer
|
||||||
|
// - stdin -> FILE pointer
|
||||||
|
//
|
||||||
|
// We want to, instead, fetch byte strings chunk by chunk, consume them, and
|
||||||
|
// discard.
|
||||||
|
//
|
||||||
|
// Here is the approach
|
||||||
|
// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
|
||||||
|
// chunk of byte string
|
||||||
|
// This will perform header-based format detection, if necessary, then fill
|
||||||
|
// the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
|
||||||
|
// which returns FILE* which points the buffer of the provided byte string.
|
||||||
|
// 2. Each time sox reads a chunk from the FILE*, we update the underlying
|
||||||
|
// buffer in a way that it
|
||||||
|
// starts with unseen data, and append the new data read from the given
|
||||||
|
// fileobj. This will trick libsox as if it keeps reading from the FILE*
|
||||||
|
// continuously.
|
||||||
|
// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
|
||||||
|
auto apply_effects_fileobj(
|
||||||
|
py::object fileobj,
|
||||||
|
const std::vector<std::vector<std::string>>& effects,
|
||||||
|
tl::optional<bool> normalize,
|
||||||
|
tl::optional<bool> channels_first,
|
||||||
|
tl::optional<std::string> format)
|
||||||
|
-> tl::optional<std::tuple<py::array, int64_t>> {
|
||||||
|
// Prepare the buffer used throughout the lifecycle of SoxEffectChain.
|
||||||
|
//
|
||||||
|
// For certain format (such as FLAC), libsox keeps reading the content at
|
||||||
|
// the initialization unless it reaches EOF even when the header is properly
|
||||||
|
// parsed. (Making buffer size 8192, which is way bigger than the header,
|
||||||
|
// resulted in libsox consuming all the buffer content at the time it opens
|
||||||
|
// the file.) Therefore buffer has to always contain valid data, except after
|
||||||
|
// EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
|
||||||
|
// first check if there is enough data to fill the buffer. `read_fileobj`
|
||||||
|
// repeatedly calls `read` method until it receives the requested length of
|
||||||
|
// bytes or it reaches EOF. If we get bytes shorter than requested, that means
|
||||||
|
// the whole audio data are fetched.
|
||||||
|
//
|
||||||
|
// * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
|
||||||
|
const auto capacity = [&]() {
|
||||||
|
// NOTE:
|
||||||
|
// Use the abstraction provided by `libpaddleaudio` to access the global
|
||||||
|
// config defined by libsox. Directly using `sox_get_globals` function will
|
||||||
|
// end up retrieving the static variable defined in `_paddleaudio`, which is
|
||||||
|
// not correct.
|
||||||
|
const auto bufsiz = get_buffer_size();
|
||||||
|
const int64_t kDefaultCapacityInBytes = 256;
|
||||||
|
return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
|
||||||
|
: kDefaultCapacityInBytes;
|
||||||
|
}();
|
||||||
|
std::string buffer(capacity, '\0');
|
||||||
|
auto* in_buf = const_cast<char*>(buffer.data());
|
||||||
|
auto num_read = read_fileobj(&fileobj, capacity, in_buf);
|
||||||
|
// If the file is shorter than 256, then libsox cannot read the header.
|
||||||
|
auto in_buffer_size = (num_read > 256) ? num_read : 256;
|
||||||
|
|
||||||
|
// Open file (this starts reading the header)
|
||||||
|
// When opening a file there are two functions that can touches FILE*.
|
||||||
|
// * `auto_detect_format`
|
||||||
|
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
|
||||||
|
// * `startread` handler of detected format.
|
||||||
|
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
|
||||||
|
// To see the handler of a particular format, go to
|
||||||
|
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
|
||||||
|
// For example, voribs can be found
|
||||||
|
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
|
||||||
|
SoxFormat sf(sox_open_mem_read(
|
||||||
|
in_buf,
|
||||||
|
in_buffer_size,
|
||||||
|
/*signal=*/nullptr,
|
||||||
|
/*encoding=*/nullptr,
|
||||||
|
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
|
||||||
|
|
||||||
|
// In case of streamed data, length can be 0
|
||||||
|
if (static_cast<sox_format_t*>(sf) == nullptr ||
|
||||||
|
sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare output buffer
|
||||||
|
std::vector<sox_sample_t> out_buffer;
|
||||||
|
out_buffer.reserve(sf->signal.length);
|
||||||
|
|
||||||
|
// Create and run SoxEffectsChain
|
||||||
|
const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
|
||||||
|
paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
|
||||||
|
/*input_encoding=*/sf->encoding,
|
||||||
|
/*output_encoding=*/get_tensor_encodinginfo(dtype));
|
||||||
|
chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
|
||||||
|
for (const auto& effect : effects) {
|
||||||
|
chain.addEffect(effect);
|
||||||
|
}
|
||||||
|
chain.addOutputBuffer(&out_buffer);
|
||||||
|
chain.run();
|
||||||
|
|
||||||
|
// Create tensor from buffer
|
||||||
|
bool channels_first_ = channels_first.value_or(true);
|
||||||
|
auto tensor = convert_to_tensor(
|
||||||
|
/*buffer=*/out_buffer.data(),
|
||||||
|
/*num_samples=*/out_buffer.size(),
|
||||||
|
/*num_channels=*/chain.getOutputNumChannels(),
|
||||||
|
dtype,
|
||||||
|
normalize.value_or(true),
|
||||||
|
channels_first_);
|
||||||
|
|
||||||
|
return std::forward_as_tuple(
|
||||||
|
tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace {

// Lifecycle of the process-wide libsox effects resources.
enum SoxEffectsResourceState {
  NotInitialized,  // sox_init() has not been called yet
  Initialized,     // sox_init() succeeded
  ShutDown         // sox_quit() succeeded; re-initialization is refused
};

// Current lifecycle state; only accessed while holding the mutex below.
SoxEffectsResourceState SOX_RESOURCE_STATE{NotInitialized};

// Serializes initialize_sox_effects() / shutdown_sox_effects().
std::mutex SOX_RESOUCE_STATE_MUTEX;

}  // namespace
|
||||||
|
|
||||||
|
void initialize_sox_effects() {
|
||||||
|
const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
|
||||||
|
|
||||||
|
switch (SOX_RESOURCE_STATE) {
|
||||||
|
case NotInitialized:
|
||||||
|
if (sox_init() != SOX_SUCCESS) {
|
||||||
|
throw std::runtime_error("Failed to initialize sox effects.");
|
||||||
|
};
|
||||||
|
SOX_RESOURCE_STATE = Initialized;
|
||||||
|
break;
|
||||||
|
case Initialized:
|
||||||
|
break;
|
||||||
|
case ShutDown:
|
||||||
|
throw std::runtime_error(
|
||||||
|
"SoX Effects has been shut down. Cannot initialize again.");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
void shutdown_sox_effects() {
|
||||||
|
const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
|
||||||
|
|
||||||
|
switch (SOX_RESOURCE_STATE) {
|
||||||
|
case NotInitialized:
|
||||||
|
throw std::runtime_error(
|
||||||
|
"SoX Effects is not initialized. Cannot shutdown.");
|
||||||
|
case Initialized:
|
||||||
|
if (sox_quit() != SOX_SUCCESS) {
|
||||||
|
throw std::runtime_error("Failed to initialize sox effects.");
|
||||||
|
};
|
||||||
|
SOX_RESOURCE_STATE = ShutDown;
|
||||||
|
break;
|
||||||
|
case ShutDown:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the given sox effects over an in-memory waveform and returns the
// processed samples together with the output sample rate.
//
// The output uses the same dtype as the input and is not normalized;
// `channels_first` describes the layout of both input and output.
auto apply_effects_tensor(
    py::array waveform,
    int64_t sample_rate,
    const std::vector<std::vector<std::string>>& effects,
    bool channels_first) -> std::tuple<py::array, int64_t> {
  validate_input_tensor(waveform);

  // Input and output of the chain share the encoding derived from the dtype.
  const auto dtype = waveform.dtype();
  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/get_tensor_encodinginfo(dtype),
      /*output_encoding=*/get_tensor_encodinginfo(dtype));

  // Destination for the processed samples.
  std::vector<sox_sample_t> out_buffer;
  out_buffer.reserve(waveform.size());

  // Assemble the chain: array input -> effects -> memory output.
  chain.addInputTensor(&waveform, sample_rate, channels_first);
  for (const auto& args : effects) {
    chain.addEffect(args);
  }
  chain.addOutputBuffer(&out_buffer);
  chain.run();

  // Wrap the raw samples into an array.
  auto out_tensor = convert_to_tensor(
      /*buffer=*/out_buffer.data(),
      /*num_samples=*/out_buffer.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      /*normalize=*/false,
      channels_first);

  std::tuple<py::array, int64_t> result(
      out_tensor, chain.getOutputSampleRate());
  return result;
}
|
||||||
|
|
||||||
|
auto apply_effects_file(
|
||||||
|
const std::string& path,
|
||||||
|
const std::vector<std::vector<std::string>>& effects,
|
||||||
|
tl::optional<bool> normalize,
|
||||||
|
tl::optional<bool> channels_first,
|
||||||
|
const tl::optional<std::string>& format)
|
||||||
|
-> tl::optional<std::tuple<py::array, int64_t>> {
|
||||||
|
// Open input file
|
||||||
|
SoxFormat sf(sox_open_read(
|
||||||
|
path.c_str(),
|
||||||
|
/*signal=*/nullptr,
|
||||||
|
/*encoding=*/nullptr,
|
||||||
|
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
|
||||||
|
|
||||||
|
if (static_cast<sox_format_t*>(sf) == nullptr ||
|
||||||
|
sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
|
||||||
|
|
||||||
|
// Prepare output
|
||||||
|
std::vector<sox_sample_t> out_buffer;
|
||||||
|
out_buffer.reserve(sf->signal.length);
|
||||||
|
|
||||||
|
// Create and run SoxEffectsChain
|
||||||
|
paddleaudio::sox_effects_chain::SoxEffectsChain chain(
|
||||||
|
/*input_encoding=*/sf->encoding,
|
||||||
|
/*output_encoding=*/get_tensor_encodinginfo(dtype));
|
||||||
|
|
||||||
|
chain.addInputFile(sf);
|
||||||
|
for (const auto& effect : effects) {
|
||||||
|
chain.addEffect(effect);
|
||||||
|
}
|
||||||
|
chain.addOutputBuffer(&out_buffer);
|
||||||
|
chain.run();
|
||||||
|
|
||||||
|
// Create tensor from buffer
|
||||||
|
bool channels_first_ = channels_first.value_or(true);
|
||||||
|
auto tensor = convert_to_tensor(
|
||||||
|
/*buffer=*/out_buffer.data(),
|
||||||
|
/*num_samples=*/out_buffer.size(),
|
||||||
|
/*num_channels=*/chain.getOutputNumChannels(),
|
||||||
|
dtype,
|
||||||
|
normalize.value_or(true),
|
||||||
|
channels_first_);
|
||||||
|
|
||||||
|
return std::tuple<py::array, int64_t>(
|
||||||
|
tensor, chain.getOutputSampleRate());
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace paddleaudio::sox_effects
|
@ -0,0 +1,37 @@
|
|||||||
|
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h with modification.
|
||||||
|
#include <pybind11/pybind11.h>
|
||||||
|
#include <pybind11/numpy.h>
|
||||||
|
|
||||||
|
#include "paddleaudio/src/optional/optional.hpp"
|
||||||
|
|
||||||
|
namespace py = pybind11;
|
||||||
|
|
||||||
|
namespace paddleaudio::sox_effects {
|
||||||
|
|
||||||
|
auto apply_effects_fileobj(
|
||||||
|
py::object fileobj,
|
||||||
|
const std::vector<std::vector<std::string>>& effects,
|
||||||
|
tl::optional<bool> normalize,
|
||||||
|
tl::optional<bool> channels_first,
|
||||||
|
tl::optional<std::string> format)
|
||||||
|
-> tl::optional<std::tuple<py::array, int64_t>>;
|
||||||
|
|
||||||
|
void initialize_sox_effects();
|
||||||
|
|
||||||
|
void shutdown_sox_effects();
|
||||||
|
|
||||||
|
auto apply_effects_tensor(
|
||||||
|
py::array waveform,
|
||||||
|
int64_t sample_rate,
|
||||||
|
const std::vector<std::vector<std::string>>& effects,
|
||||||
|
bool channels_first) -> std::tuple<py::array, int64_t>;
|
||||||
|
|
||||||
|
auto apply_effects_file(
|
||||||
|
const std::string& path,
|
||||||
|
const std::vector<std::vector<std::string>>& effects,
|
||||||
|
tl::optional<bool> normalize,
|
||||||
|
tl::optional<bool> channels_first,
|
||||||
|
const tl::optional<std::string>& format)
|
||||||
|
-> tl::optional<std::tuple<py::array, int64_t>>;
|
||||||
|
|
||||||
|
} // namespace paddleaudio::sox_effects
|
@ -0,0 +1,597 @@
|
|||||||
|
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp with modification.
|
||||||
|
|
||||||
|
#include <sox.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include "paddleaudio/src/pybind/sox/effects_chain.h"
|
||||||
|
#include "paddleaudio/src/pybind/sox/utils.h"
|
||||||
|
|
||||||
|
using namespace paddleaudio::sox_utils;
|
||||||
|
|
||||||
|
namespace paddleaudio::sox_effects_chain {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
/// helper classes for passing the location of input tensor and output buffer
|
||||||
|
///
|
||||||
|
/// drain/flow callback functions require plain C style function signature and
|
||||||
|
/// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
|
||||||
|
/// The following structs will be assigned to sox_effect_t::priv pointer which
|
||||||
|
/// gives sox_effect_t an access to input Tensor and output buffer object.
|
||||||
|
struct TensorInputPriv {
|
||||||
|
size_t index;
|
||||||
|
py::array* waveform;
|
||||||
|
int64_t sample_rate;
|
||||||
|
bool channels_first;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct TensorOutputPriv {
|
||||||
|
std::vector<sox_sample_t>* buffer;
|
||||||
|
};
|
||||||
|
struct FileOutputPriv {
|
||||||
|
sox_format_t* sf;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Callback function to feed Tensor data to SoxEffectChain.
|
||||||
|
/// Callback function to feed Tensor data to SoxEffectChain.
///
/// Converts up to `*osamp` samples of the attached array, starting at the
/// position stored in the effect's private data, into interleaved
/// `sox_sample_t` values written to `obuf`. On return `*osamp` holds the
/// number of samples actually produced (always a multiple of the channel
/// count). Returns SOX_EOF once the whole array has been consumed,
/// SOX_SUCCESS otherwise. Throws std::runtime_error for an unsupported dtype.
int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
  // Retrieve the input array and the current read position. The array handle
  // is taken by reference; there is no need to copy it on every callback.
  auto priv = static_cast<TensorInputPriv*>(effp->priv);
  const size_t index = priv->index;
  const py::array& tensor = *(priv->waveform);
  const size_t num_channels = effp->out_signal.channels;
  const bool channels_first = priv->channels_first;

  // Adjust the number of samples to read
  const size_t num_samples = tensor.size();
  if (index + *osamp > num_samples) {
    *osamp = num_samples - index;
  }

  // Ensure that it's a multiple of the number of channels
  *osamp -= *osamp % num_channels;
  const size_t count = *osamp;

  // Address of the array element backing interleaved sample `pos`.
  // Positions are kept as size_t so large inputs do not truncate.
  auto element_at = [&](size_t pos) -> const void* {
    const size_t frame = pos / num_channels;
    const size_t channel = pos % num_channels;
    return channels_first ? tensor.data(channel, frame)
                          : tensor.data(frame, channel);
  };

  // Convert to sox_sample_t (int32_t), writing straight into `obuf`.
  // The case labels are numpy dtype numbers.
  switch (tensor.dtype().num()) {
    case 11: {  // float
      // Need to convert to 64-bit precision so that
      // values around INT32_MIN/MAX are handled correctly.
      for (size_t i = 0; i < count; ++i) {
        double elem = *static_cast<const float*>(element_at(index + i));
        elem = elem * 2147483648.;
        if (elem > INT32_MAX) {
          obuf[i] = INT32_MAX;
        } else if (elem < INT32_MIN) {
          obuf[i] = INT32_MIN;
        } else {
          obuf[i] = elem;
        }
      }
      break;
    }
    case 5: {  // int
      for (size_t i = 0; i < count; ++i) {
        obuf[i] = *static_cast<const int*>(element_at(index + i));
      }
      break;
    }
    case 3: {  // short
      for (size_t i = 0; i < count; ++i) {
        const int16_t elem =
            *static_cast<const int16_t*>(element_at(index + i));
        obuf[i] = elem * 65536;
      }
      break;
    }
    case 1: {  // byte
      // NOTE(review): the `- 128` offset looks like the conversion for
      // unsigned 8-bit samples, while the value is read as a signed int8_t;
      // for negative inputs the product leaves the int32 range. Confirm the
      // intended dtype before changing this arithmetic.
      for (size_t i = 0; i < count; ++i) {
        const int8_t elem = *static_cast<const int8_t*>(element_at(index + i));
        obuf[i] = (elem - 128) * 16777216;
      }
      break;
    }
    default:
      throw std::runtime_error("Unexpected dtype.");
  }

  // Advance the read position and report whether the input is exhausted.
  priv->index += count;
  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
}
|
||||||
|
|
||||||
|
/// Callback function to fetch data from SoxEffectChain.
|
||||||
|
/// Callback function to fetch data from SoxEffectChain.
///
/// Appends the `*isamp` incoming samples to the vector registered in the
/// effect's private data. Nothing is forwarded downstream, so `*osamp` is
/// set to zero. Always returns SOX_SUCCESS.
int tensor_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) {
  *osamp = 0;

  auto* sink = static_cast<TensorOutputPriv*>(effp->priv)->buffer;
  sox_sample_t const* const first = ibuf;
  sox_sample_t const* const last = ibuf + *isamp;
  sink->insert(sink->end(), first, last);

  return SOX_SUCCESS;
}
|
||||||
|
|
||||||
|
/// Callback function that writes the incoming samples to a sox output file.
///
/// Returns SOX_SUCCESS when all `*isamp` samples were written (or there was
/// nothing to write) and SOX_EOF on a short write without a sox error code.
/// Throws std::runtime_error when the short write carries a sox error.
int file_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) {
  *osamp = 0;

  if (!*isamp) {
    return SOX_SUCCESS;
  }

  auto out = static_cast<FileOutputPriv*>(effp->priv)->sf;
  if (sox_write(out, ibuf, *isamp) == *isamp) {
    return SOX_SUCCESS;
  }

  // Short write: surface the sox error if there is one.
  if (out->sox_errno) {
    std::ostringstream msg;
    msg << out->sox_errstr << " " << sox_strerror(out->sox_errno) << " "
        << out->filename;
    throw std::runtime_error(msg.str());
  }
  return SOX_EOF;
}
|
||||||
|
|
||||||
|
/// Returns the handler of the source effect that feeds array data into the
/// chain; only the `drain` callback is provided.
sox_effect_handler_t* get_tensor_input_handler() {
  static sox_effect_handler_t input_handler{
      /*name=*/"input_tensor",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/nullptr,
      /*drain=*/tensor_input_drain,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(TensorInputPriv)};
  return &input_handler;
}
|
||||||
|
|
||||||
|
/// Returns the handler of the sink effect that collects samples into a
/// vector; only the `flow` callback is provided.
sox_effect_handler_t* get_tensor_output_handler() {
  static sox_effect_handler_t output_handler{
      /*name=*/"output_tensor",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/tensor_output_flow,
      /*drain=*/nullptr,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(TensorOutputPriv)};
  return &output_handler;
}
|
||||||
|
|
||||||
|
/// Returns the handler of the sink effect that writes samples to a sox
/// output file; only the `flow` callback is provided.
sox_effect_handler_t* get_file_output_handler() {
  static sox_effect_handler_t file_handler{
      /*name=*/"output_file",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/file_output_flow,
      /*drain=*/nullptr,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(FileOutputPriv)};
  return &file_handler;
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Wraps `se`; the wrapper frees it on destruction.
SoxEffect::SoxEffect(sox_effect_t* se) noexcept
    : se_(se) {
}
|
||||||
|
|
||||||
|
// Frees the wrapped effect, if any.
SoxEffect::~SoxEffect() {
  if (se_ == nullptr) {
    return;
  }
  free(se_);
}
|
||||||
|
|
||||||
|
// Implicit access to the wrapped pointer, so the wrapper can be passed
// directly to libsox functions.
SoxEffect::operator sox_effect_t*() const {
  sox_effect_t* const raw = se_;
  return raw;
}
|
||||||
|
|
||||||
|
// Member access on the wrapped effect.
auto SoxEffect::operator->() noexcept -> sox_effect_t* {
  sox_effect_t* const raw = se_;
  return raw;
}
|
||||||
|
|
||||||
|
// Stores the two encodings and creates the underlying libsox effects chain.
// Throws std::runtime_error when libsox fails to create the chain.
SoxEffectsChain::SoxEffectsChain(
    sox_encodinginfo_t input_encoding,
    sox_encodinginfo_t output_encoding)
    : in_enc_(input_encoding),
      out_enc_(output_encoding),
      in_sig_(),
      interm_sig_(),
      out_sig_(),
      sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
  if (sec_ == nullptr) {
    throw std::runtime_error("Failed to create effect chain.");
  }
}
|
||||||
|
|
||||||
|
// Deletes the underlying libsox chain, if one was created.
SoxEffectsChain::~SoxEffectsChain() {
  if (sec_ == nullptr) {
    return;
  }
  sox_delete_effects_chain(sec_);
}
|
||||||
|
|
||||||
|
// Runs the chain to completion, without a progress callback or client data.
void SoxEffectsChain::run() {
  sox_flow_effects(sec_, /*callback=*/nullptr, /*client_data=*/nullptr);
}
|
||||||
|
|
||||||
|
// Registers `waveform` as the source of the chain.
//
// The array is referenced, not copied, so it must outlive the chain run.
// Throws std::runtime_error when libsox rejects the effect.
void SoxEffectsChain::addInputTensor(
    py::array* waveform,
    int64_t sample_rate,
    bool channels_first) {
  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
  interm_sig_ = in_sig_;

  SoxEffect e(sox_create_effect(get_tensor_input_handler()));

  // Hand the array location and layout to the drain callback.
  auto* state = static_cast<TensorInputPriv*>(e->priv);
  state->channels_first = channels_first;
  state->sample_rate = sample_rate;
  state->waveform = waveform;
  state->index = 0;

  const int status = sox_add_effect(sec_, e, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input_tensor");
  }
}
|
||||||
|
|
||||||
|
void SoxEffectsChain::addOutputBuffer(
|
||||||
|
std::vector<sox_sample_t>* output_buffer) {
|
||||||
|
SoxEffect e(sox_create_effect(get_tensor_output_handler()));
|
||||||
|
static_cast<TensorOutputPriv*>(e->priv)->buffer = output_buffer;
|
||||||
|
if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Internal Error: Failed to add effect: output_tensor");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Registers the opened sox input `sf` as the source of the chain, using
// libsox's built-in "input" effect.
// Throws std::runtime_error when libsox rejects the effect.
void SoxEffectsChain::addInputFile(sox_format_t* sf) {
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  SoxEffect e(sox_create_effect(sox_find_effect("input")));

  // The "input" effect takes the sox_format_t pointer as its single option.
  char* opts[] = {(char*)sf};
  sox_effect_options(e, 1, opts);

  const int status = sox_add_effect(sec_, e, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    std::ostringstream msg;
    msg << "Internal Error: Failed to add effect: input " << sf->filename;
    throw std::runtime_error(msg.str());
  }
}
|
||||||
|
|
||||||
|
// Registers the opened sox output `sf` as the sink of the chain.
// Throws std::runtime_error when libsox rejects the effect.
void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
  out_sig_ = sf->signal;

  SoxEffect e(sox_create_effect(get_file_output_handler()));

  auto* state = static_cast<FileOutputPriv*>(e->priv);
  state->sf = sf;

  const int status = sox_add_effect(sec_, e, &interm_sig_, &out_sig_);
  if (status != SOX_SUCCESS) {
    std::ostringstream msg;
    msg << "Internal Error: Failed to add effect: output " << sf->filename;
    throw std::runtime_error(msg.str());
  }
}
|
||||||
|
|
||||||
|
void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
|
||||||
|
const auto num_args = effect.size();
|
||||||
|
if (num_args == 0) {
|
||||||
|
throw std::runtime_error("Invalid argument: empty effect.");
|
||||||
|
}
|
||||||
|
const auto name = effect[0];
|
||||||
|
if (UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end()) {
|
||||||
|
std::ostringstream stream;
|
||||||
|
stream << "Unsupported effect: " << name;
|
||||||
|
throw std::runtime_error(stream.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
auto returned_effect = sox_find_effect(name.c_str());
|
||||||
|
if (!returned_effect) {
|
||||||
|
std::ostringstream stream;
|
||||||
|
stream << "Unsupported effect: " << name;
|
||||||
|
throw std::runtime_error(stream.str());
|
||||||
|
}
|
||||||
|
SoxEffect e(sox_create_effect(returned_effect));
|
||||||
|
const auto num_options = num_args - 1;
|
||||||
|
|
||||||
|
std::vector<char*> opts;
|
||||||
|
for (size_t i = 1; i < num_args; ++i) {
|
||||||
|
opts.push_back((char*)effect[i].c_str());
|
||||||
|
}
|
||||||
|
if (sox_effect_options(e, num_options, num_options ? opts.data() : nullptr) !=
|
||||||
|
SOX_SUCCESS) {
|
||||||
|
std::ostringstream stream;
|
||||||
|
stream << "Invalid effect option:";
|
||||||
|
for (const auto& v : effect) {
|
||||||
|
stream << " " << v;
|
||||||
|
}
|
||||||
|
throw std::runtime_error(stream.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
|
||||||
|
std::ostringstream stream;
|
||||||
|
stream << "Internal Error: Failed to add effect: \"" << name;
|
||||||
|
for (size_t i = 1; i < num_args; ++i) {
|
||||||
|
stream << " " << effect[i];
|
||||||
|
}
|
||||||
|
stream << "\"";
|
||||||
|
throw std::runtime_error(stream.str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t SoxEffectsChain::getOutputNumChannels() {
|
||||||
|
return interm_sig_.channels;
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t SoxEffectsChain::getOutputSampleRate() {
|
||||||
|
return interm_sig_.rate;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
/// helper classes for passing file-like object to SoxEffectChain
|
||||||
|
struct FileObjInputPriv {
|
||||||
|
sox_format_t* sf;
|
||||||
|
py::object* fileobj;
|
||||||
|
bool eof_reached;
|
||||||
|
char* buffer;
|
||||||
|
uint64_t buffer_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct FileObjOutputPriv {
|
||||||
|
sox_format_t* sf;
|
||||||
|
py::object* fileobj;
|
||||||
|
char** buffer;
|
||||||
|
size_t* buffer_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Callback function to feed byte string
|
||||||
|
/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
|
||||||
|
auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
|
||||||
|
-> int {
|
||||||
|
auto priv = static_cast<FileObjInputPriv*>(effp->priv);
|
||||||
|
auto sf = priv->sf;
|
||||||
|
auto buffer = priv->buffer;
|
||||||
|
|
||||||
|
// 1. Refresh the buffer
|
||||||
|
//
|
||||||
|
// NOTE:
|
||||||
|
// Since the underlying FILE* was opened with `fmemopen`, the only way
|
||||||
|
// libsox detect EOF is reaching the end of the buffer. (null byte won't
|
||||||
|
// help) Therefore we need to align the content at the end of buffer,
|
||||||
|
// otherwise, libsox will keep reading the content beyond intended length.
|
||||||
|
//
|
||||||
|
// Before:
|
||||||
|
//
|
||||||
|
// |<-------consumed------>|<---remaining--->|
|
||||||
|
// |***********************|-----------------|
|
||||||
|
// ^ ftell
|
||||||
|
//
|
||||||
|
// After:
|
||||||
|
//
|
||||||
|
// |<-offset->|<---remaining--->|<-new data->|
|
||||||
|
// |**********|-----------------|++++++++++++|
|
||||||
|
// ^ ftell
|
||||||
|
|
||||||
|
// NOTE:
|
||||||
|
// Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
|
||||||
|
// supposed to be in sync, but there are cases (Vorbis) they are not
|
||||||
|
// in sync and `tell_off` has seemingly uninitialized value, which
|
||||||
|
// leads num_remain to be negative and cause segmentation fault
|
||||||
|
// in `memmove`.
|
||||||
|
const auto tell = ftell((FILE*)sf->fp);
|
||||||
|
if (tell < 0) {
|
||||||
|
throw std::runtime_error("Internal Error: ftell failed.");
|
||||||
|
}
|
||||||
|
const auto num_consumed = static_cast<size_t>(tell);
|
||||||
|
if (num_consumed > priv->buffer_size) {
|
||||||
|
throw std::runtime_error("Internal Error: buffer overrun.");
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto num_remain = priv->buffer_size - num_consumed;
|
||||||
|
|
||||||
|
// 1.1. Fetch the data to see if there is data to fill the buffer
|
||||||
|
size_t num_refill = 0;
|
||||||
|
std::string chunk(num_consumed, '\0');
|
||||||
|
if (num_consumed && !priv->eof_reached) {
|
||||||
|
num_refill = read_fileobj(
|
||||||
|
priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
|
||||||
|
if (num_refill < num_consumed) {
|
||||||
|
priv->eof_reached = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const auto offset = num_consumed - num_refill;
|
||||||
|
|
||||||
|
// 1.2. Move the unconsumed data towards the beginning of buffer.
|
||||||
|
if (num_remain) {
|
||||||
|
auto src = static_cast<void*>(buffer + num_consumed);
|
||||||
|
auto dst = static_cast<void*>(buffer + offset);
|
||||||
|
memmove(dst, src, num_remain);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 1.3. Refill the remaining buffer.
|
||||||
|
if (num_refill) {
|
||||||
|
auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
|
||||||
|
auto dst = buffer + offset + num_remain;
|
||||||
|
memcpy(dst, src, num_refill);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 1.4. Set the file pointer to the new offset
|
||||||
|
sf->tell_off = offset;
|
||||||
|
fseek((FILE*)sf->fp, offset, SEEK_SET);
|
||||||
|
|
||||||
|
// 2. Perform decoding operation
|
||||||
|
// The following part is practically same as "input" effect
|
||||||
|
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
|
||||||
|
|
||||||
|
// At this point, osamp represents the buffer size in bytes,
|
||||||
|
// but sox_read expects the maximum number of samples ready to read.
|
||||||
|
// Normally, this is fine, but in case when the samples are not 4-byte
|
||||||
|
// aligned, (e.g. sample is 24bits), the resulting signal is not correct.
|
||||||
|
// https://github.com/pytorch/audio/issues/2083
|
||||||
|
if (sf->encoding.bits_per_sample > 0)
|
||||||
|
*osamp /= (sf->encoding.bits_per_sample / 8);
|
||||||
|
|
||||||
|
// Ensure that it's a multiple of the number of channels
|
||||||
|
*osamp -= *osamp % effp->out_signal.channels;
|
||||||
|
|
||||||
|
// Read up to *osamp samples into obuf;
|
||||||
|
// store the actual number read back to *osamp
|
||||||
|
*osamp = sox_read(sf, obuf, *osamp);
|
||||||
|
|
||||||
|
// Decoding is finished when fileobject is exhausted and sox can no longer
|
||||||
|
// decode a sample.
|
||||||
|
return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Flow callback of the "output_fileobj_object" effect.
//
// Encodes the incoming samples with sox_write() and passes the bytes that
// were produced to the `write` method of the Python file object stored in
// the effect's private data. The effect never emits samples downstream, so
// *osamp is always set to 0.
//
// Returns SOX_SUCCESS when all *isamp samples were written (or there was
// nothing to write) and SOX_EOF on a short write. Throws std::runtime_error
// when the short write is accompanied by a non-zero sox_errno.
auto fileobj_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) -> int {
  *osamp = 0;
  if (!*isamp) {
    return SOX_SUCCESS;
  }

  auto state = static_cast<FileObjOutputPriv*>(effp->priv);
  auto format = state->sf;
  auto stream = static_cast<FILE*>(format->fp);
  auto py_file = state->fileobj;
  auto encoded = state->buffer;

  // Encode the chunk and flush it to the stream.
  auto written = sox_write(format, ibuf, *isamp);
  fflush(stream);

  // Copy the encoded bytes (ftell reports how many there are) to Python.
  py_file->attr("write")(py::bytes(*encoded, ftell(stream)));

  // Reset the FILE* position, and the offset tracked by sox, to the start.
  format->tell_off = 0;
  fseek(stream, 0, SEEK_SET);

  if (written == *isamp) {
    return SOX_SUCCESS;
  }

  // Short write: surface the sox error if there is one, else signal EOF.
  if (format->sox_errno) {
    std::ostringstream message;
    message << format->sox_errstr << " " << sox_strerror(format->sox_errno)
            << " " << format->filename;
    throw std::runtime_error(message.str());
  }
  return SOX_EOF;
}
|
||||||
|
|
||||||
|
// Returns the (function-local static) handler describing the
// "input_fileobj_object" effect. Only the drain callback is set; the effect's
// private data is sized for a FileObjInputPriv.
auto get_fileobj_input_handler() -> sox_effect_handler_t* {
  static sox_effect_handler_t input_handler{
      /*name=*/"input_fileobj_object",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/nullptr,
      /*drain=*/fileobj_input_drain,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(FileObjInputPriv)};
  return &input_handler;
}
|
||||||
|
|
||||||
|
// Returns the (function-local static) handler describing the
// "output_fileobj_object" effect. Only the flow callback is set; the effect's
// private data is sized for a FileObjOutputPriv.
auto get_fileobj_output_handler() -> sox_effect_handler_t* {
  static sox_effect_handler_t output_handler{
      /*name=*/"output_fileobj_object",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/fileobj_output_flow,
      /*drain=*/nullptr,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(FileObjOutputPriv)};
  return &output_handler;
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Adds the file-object input effect to the chain.
//
// The chain's input and intermediate signal info are taken from `sf`. The
// effect's private state is filled with the given format handle, Python file
// object, buffer and buffer size, with eof_reached cleared.
// Throws std::runtime_error if sox_add_effect does not return SOX_SUCCESS.
void SoxEffectsChainPyBind::addInputFileObj(
    sox_format_t* sf,
    char* buffer,
    uint64_t buffer_size,
    py::object* fileobj) {
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  SoxEffect effect(sox_create_effect(get_fileobj_input_handler()));

  auto* state = static_cast<FileObjInputPriv*>(effect->priv);
  state->sf = sf;
  state->fileobj = fileobj;
  state->eof_reached = false;
  state->buffer = buffer;
  state->buffer_size = buffer_size;

  const auto status = sox_add_effect(sec_, effect, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input fileobj");
  }
}
|
||||||
|
|
||||||
|
// Adds the file-object output effect to the chain.
//
// The chain's output signal info is taken from `sf`. The effect's private
// state is filled with the given format handle, Python file object, and the
// pointers to the buffer and its size.
// Throws std::runtime_error if sox_add_effect does not return SOX_SUCCESS.
void SoxEffectsChainPyBind::addOutputFileObj(
    sox_format_t* sf,
    char** buffer,
    size_t* buffer_size,
    py::object* fileobj) {
  out_sig_ = sf->signal;

  SoxEffect effect(sox_create_effect(get_fileobj_output_handler()));

  auto* state = static_cast<FileObjOutputPriv*>(effect->priv);
  state->sf = sf;
  state->fileobj = fileobj;
  state->buffer = buffer;
  state->buffer_size = buffer_size;

  const auto status = sox_add_effect(sec_, effect, &interm_sig_, &out_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: output fileobj");
  }
}
|
||||||
|
|
||||||
|
} // namespace paddleaudio::sox_effects_chain
|
@ -0,0 +1,78 @@
|
|||||||
|
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h with modification.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <sox.h>
|
||||||
|
#include "paddleaudio/src/pybind/sox/utils.h"
|
||||||
|
|
||||||
|
namespace paddleaudio::sox_effects_chain {
|
||||||
|
|
||||||
|
// Helper struct to safely close sox_effect_t* pointer returned by
|
||||||
|
// sox_create_effect
|
||||||
|
|
||||||
|
struct SoxEffect {
|
||||||
|
explicit SoxEffect(sox_effect_t* se) noexcept;
|
||||||
|
SoxEffect(const SoxEffect& other) = delete;
|
||||||
|
SoxEffect(const SoxEffect&& other) = delete;
|
||||||
|
auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
|
||||||
|
auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
|
||||||
|
~SoxEffect();
|
||||||
|
operator sox_effect_t*() const;
|
||||||
|
auto operator->() noexcept -> sox_effect_t*;
|
||||||
|
|
||||||
|
private:
|
||||||
|
sox_effect_t* se_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Helper struct to safely close sox_effects_chain_t with handy methods
|
||||||
|
class SoxEffectsChain {
|
||||||
|
const sox_encodinginfo_t in_enc_;
|
||||||
|
const sox_encodinginfo_t out_enc_;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
sox_signalinfo_t in_sig_;
|
||||||
|
sox_signalinfo_t interm_sig_;
|
||||||
|
sox_signalinfo_t out_sig_;
|
||||||
|
sox_effects_chain_t* sec_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit SoxEffectsChain(
|
||||||
|
sox_encodinginfo_t input_encoding,
|
||||||
|
sox_encodinginfo_t output_encoding);
|
||||||
|
SoxEffectsChain(const SoxEffectsChain& other) = delete;
|
||||||
|
SoxEffectsChain(const SoxEffectsChain&& other) = delete;
|
||||||
|
SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
|
||||||
|
SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
|
||||||
|
~SoxEffectsChain();
|
||||||
|
void run();
|
||||||
|
void addInputTensor(
|
||||||
|
py::array* waveform,
|
||||||
|
int64_t sample_rate,
|
||||||
|
bool channels_first);
|
||||||
|
void addInputFile(sox_format_t* sf);
|
||||||
|
void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
|
||||||
|
void addOutputFile(sox_format_t* sf);
|
||||||
|
void addEffect(const std::vector<std::string> effect);
|
||||||
|
int64_t getOutputNumChannels();
|
||||||
|
int64_t getOutputSampleRate();
|
||||||
|
};
|
||||||
|
|
||||||
|
class SoxEffectsChainPyBind : public SoxEffectsChain {
|
||||||
|
using SoxEffectsChain::SoxEffectsChain;
|
||||||
|
|
||||||
|
public:
|
||||||
|
void addInputFileObj(
|
||||||
|
sox_format_t* sf,
|
||||||
|
char* buffer,
|
||||||
|
uint64_t buffer_size,
|
||||||
|
py::object* fileobj);
|
||||||
|
|
||||||
|
void addOutputFileObj(
|
||||||
|
sox_format_t* sf,
|
||||||
|
char** buffer,
|
||||||
|
size_t* buffer_size,
|
||||||
|
py::object* fileobj);
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace paddleaudio::sox_effects_chain
|
||||||
|
|
@ -0,0 +1,279 @@
|
|||||||
|
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp with modification.
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/sox/io.h"
|
||||||
|
#include "paddleaudio/src/pybind/sox/effects.h"
|
||||||
|
#include "paddleaudio/src/pybind/sox/types.h"
|
||||||
|
#include "paddleaudio/src/pybind/sox/effects_chain.h"
|
||||||
|
#include "paddleaudio/src/pybind/sox/utils.h"
|
||||||
|
#include "paddleaudio/src/optional/optional.hpp"
|
||||||
|
|
||||||
|
using namespace paddleaudio::sox_utils;
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace sox_io {
|
||||||
|
|
||||||
|
auto get_info_file(const std::string &path,
|
||||||
|
const tl::optional<std::string> &format)
|
||||||
|
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
|
||||||
|
SoxFormat sf(
|
||||||
|
sox_open_read(path.data(),
|
||||||
|
/*signal=*/nullptr,
|
||||||
|
/*encoding=*/nullptr,
|
||||||
|
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
|
||||||
|
|
||||||
|
|
||||||
|
validate_input_file(sf, path);
|
||||||
|
|
||||||
|
return std::make_tuple(
|
||||||
|
static_cast<int64_t>(sf->signal.rate),
|
||||||
|
static_cast<int64_t>(sf->signal.length / sf->signal.channels),
|
||||||
|
static_cast<int64_t>(sf->signal.channels),
|
||||||
|
static_cast<int64_t>(sf->encoding.bits_per_sample),
|
||||||
|
get_encoding(sf->encoding.encoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<std::string>> get_effects(
|
||||||
|
const tl::optional<int64_t>& frame_offset,
|
||||||
|
const tl::optional<int64_t>& num_frames) {
|
||||||
|
const auto offset = frame_offset.value_or(0);
|
||||||
|
if (offset < 0) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Invalid argument: frame_offset must be non-negative.");
|
||||||
|
}
|
||||||
|
const auto frames = num_frames.value_or(-1);
|
||||||
|
if (frames == 0 || frames < -1) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Invalid argument: num_frames must be -1 or greater than 0.");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<std::string>> effects;
|
||||||
|
if (frames != -1) {
|
||||||
|
std::ostringstream os_offset, os_frames;
|
||||||
|
os_offset << offset << "s";
|
||||||
|
os_frames << "+" << frames << "s";
|
||||||
|
effects.emplace_back(
|
||||||
|
std::vector<std::string>{"trim", os_offset.str(), os_frames.str()});
|
||||||
|
} else if (offset != 0) {
|
||||||
|
std::ostringstream os_offset;
|
||||||
|
os_offset << offset << "s";
|
||||||
|
effects.emplace_back(std::vector<std::string>{"trim", os_offset.str()});
|
||||||
|
}
|
||||||
|
return effects;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto get_info_fileobj(py::object fileobj,
|
||||||
|
const tl::optional<std::string> &format)
|
||||||
|
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
|
||||||
|
const auto capacity = [&]() {
|
||||||
|
const auto bufsiz = get_buffer_size();
|
||||||
|
const int64_t kDefaultCapacityInBytes = 4096;
|
||||||
|
return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
|
||||||
|
: kDefaultCapacityInBytes;
|
||||||
|
}();
|
||||||
|
std::string buffer(capacity, '\0');
|
||||||
|
auto *buf = const_cast<char *>(buffer.data());
|
||||||
|
auto num_read = read_fileobj(&fileobj, capacity, buf);
|
||||||
|
// If the file is shorter than 256, then libsox cannot read the header.
|
||||||
|
auto buf_size = (num_read > 256) ? num_read : 256;
|
||||||
|
|
||||||
|
SoxFormat sf(sox_open_mem_read(
|
||||||
|
buf,
|
||||||
|
buf_size,
|
||||||
|
/*signal=*/nullptr,
|
||||||
|
/*encoding=*/nullptr,
|
||||||
|
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
|
||||||
|
|
||||||
|
// In case of streamed data, length can be 0
|
||||||
|
validate_input_memfile(sf);
|
||||||
|
|
||||||
|
return std::make_tuple(
|
||||||
|
static_cast<int64_t>(sf->signal.rate),
|
||||||
|
static_cast<int64_t>(sf->signal.length / sf->signal.channels),
|
||||||
|
static_cast<int64_t>(sf->signal.channels),
|
||||||
|
static_cast<int64_t>(sf->encoding.bits_per_sample),
|
||||||
|
get_encoding(sf->encoding.encoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Loads audio from a Python file object.
//
// frame_offset / num_frames are turned into a "trim" effect by get_effects
// (which throws std::runtime_error on invalid values); the actual decoding
// is delegated to sox_effects::apply_effects_fileobj, whose result is
// returned unchanged.
tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
    py::object fileobj,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format) {
  auto trim_effects = get_effects(frame_offset, num_frames);
  return paddleaudio::sox_effects::apply_effects_fileobj(
      std::move(fileobj),
      trim_effects,
      normalize,
      channels_first,
      std::move(format));
}
|
||||||
|
|
||||||
|
// Loads audio from the file at `path`.
//
// frame_offset / num_frames are turned into a "trim" effect by get_effects
// (which throws std::runtime_error on invalid values); the actual decoding
// is delegated to sox_effects::apply_effects_file, whose result is returned
// unchanged.
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
    const std::string& path,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format) {
  auto trim_effects = get_effects(frame_offset, num_frames);
  return paddleaudio::sox_effects::apply_effects_file(
      path, trim_effects, normalize, channels_first, format);
}
|
||||||
|
|
||||||
|
void save_audio_file(const std::string& path,
|
||||||
|
py::array tensor,
|
||||||
|
int64_t sample_rate,
|
||||||
|
bool channels_first,
|
||||||
|
tl::optional<double> compression,
|
||||||
|
tl::optional<std::string> format,
|
||||||
|
tl::optional<std::string> encoding,
|
||||||
|
tl::optional<int64_t> bits_per_sample) {
|
||||||
|
validate_input_tensor(tensor);
|
||||||
|
|
||||||
|
const auto filetype = [&]() {
|
||||||
|
if (format.has_value()) return format.value();
|
||||||
|
return get_filetype(path);
|
||||||
|
}();
|
||||||
|
|
||||||
|
if (filetype == "amr-nb") {
|
||||||
|
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
|
||||||
|
//TORCH_CHECK(num_channels == 1,
|
||||||
|
// "amr-nb format only supports single channel audio.");
|
||||||
|
assert(num_channels == 1);
|
||||||
|
} else if (filetype == "htk") {
|
||||||
|
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
|
||||||
|
// TORCH_CHECK(num_channels == 1,
|
||||||
|
// "htk format only supports single channel audio.");
|
||||||
|
assert(num_channels == 1);
|
||||||
|
} else if (filetype == "gsm") {
|
||||||
|
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
|
||||||
|
assert(num_channels == 1);
|
||||||
|
assert(sample_rate == 8000);
|
||||||
|
//TORCH_CHECK(num_channels == 1,
|
||||||
|
// "gsm format only supports single channel audio.");
|
||||||
|
//TORCH_CHECK(sample_rate == 8000,
|
||||||
|
// "gsm format only supports a sampling rate of 8kHz.");
|
||||||
|
}
|
||||||
|
const auto signal_info =
|
||||||
|
get_signalinfo(&tensor, sample_rate, filetype, channels_first);
|
||||||
|
const auto encoding_info = get_encodinginfo_for_save(
|
||||||
|
filetype, tensor.dtype(), compression, encoding, bits_per_sample);
|
||||||
|
|
||||||
|
SoxFormat sf(sox_open_write(path.c_str(),
|
||||||
|
&signal_info,
|
||||||
|
&encoding_info,
|
||||||
|
/*filetype=*/filetype.c_str(),
|
||||||
|
/*oob=*/nullptr,
|
||||||
|
/*overwrite_permitted=*/nullptr));
|
||||||
|
|
||||||
|
if (static_cast<sox_format_t*>(sf) == nullptr) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Error saving audio file: failed to open file " + path);
|
||||||
|
}
|
||||||
|
|
||||||
|
paddleaudio::sox_effects_chain::SoxEffectsChain chain(
|
||||||
|
/*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
|
||||||
|
/*output_encoding=*/sf->encoding);
|
||||||
|
chain.addInputTensor(&tensor, sample_rate, channels_first);
|
||||||
|
chain.addOutputFile(sf);
|
||||||
|
chain.run();
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
// Owns a malloc-style buffer and free()s it on destruction. Used by
// save_audio_fileobj. `ptr` and `size` start out as nullptr / 0 and are meant
// to be filled in by whoever allocates the buffer. Not copyable or movable.
struct AutoReleaseBuffer {
  char* ptr;
  size_t size;

  AutoReleaseBuffer() : ptr(nullptr), size(0) {}
  ~AutoReleaseBuffer() {
    // free(nullptr) is a no-op, so no explicit null check is needed.
    free(ptr);
  }

  AutoReleaseBuffer(const AutoReleaseBuffer&) = delete;
  AutoReleaseBuffer(AutoReleaseBuffer&&) = delete;
  AutoReleaseBuffer& operator=(const AutoReleaseBuffer&) = delete;
  AutoReleaseBuffer& operator=(AutoReleaseBuffer&&) = delete;
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Encodes `tensor` into an in-memory stream and writes the encoded bytes to
// the Python file object `fileobj` (through its `write` method).
//
// `format` is mandatory here. amr-nb / htk / gsm accept mono audio only and
// gsm additionally requires an 8 kHz sample rate. Throws std::runtime_error
// when one of these conditions is violated or when the memory stream cannot
// be opened.
//
// NOTE(review): unlike save_audio_file, this function does not call
// validate_input_tensor on `tensor` - confirm whether that is intentional.
void save_audio_fileobj(
    py::object fileobj,
    py::array tensor,
    int64_t sample_rate,
    bool channels_first,
    tl::optional<double> compression,
    tl::optional<std::string> format,
    tl::optional<std::string> encoding,
    tl::optional<int64_t> bits_per_sample) {
  if (!format.has_value()) {
    throw std::runtime_error(
        "`format` is required when saving to file object.");
  }
  const auto filetype = format.value();

  // Evaluated lazily so the tensor shape is only queried for the formats
  // that have a channel restriction.
  const auto is_mono = [&]() {
    return tensor.shape(channels_first ? 0 : 1) == 1;
  };

  if (filetype == "amr-nb") {
    if (!is_mono()) {
      throw std::runtime_error(
          "amr-nb format only supports single channel audio.");
    }
  } else if (filetype == "htk") {
    if (!is_mono()) {
      throw std::runtime_error(
          "htk format only supports single channel audio.");
    }
  } else if (filetype == "gsm") {
    if (!is_mono()) {
      throw std::runtime_error(
          "gsm format only supports single channel audio.");
    }
    if (sample_rate != 8000) {
      throw std::runtime_error(
          "gsm format only supports a sampling rate of 8kHz.");
    }
  }

  const auto signal_info =
      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
  const auto encoding_info = get_encodinginfo_for_save(filetype,
                                                       tensor.dtype(),
                                                       compression,
                                                       std::move(encoding),
                                                       bits_per_sample);

  // Declared before `sf` so that it is destroyed after the format handle.
  AutoReleaseBuffer buffer;

  SoxFormat sf(sox_open_memstream_write(&buffer.ptr,
                                        &buffer.size,
                                        &signal_info,
                                        &encoding_info,
                                        filetype.c_str(),
                                        /*oob=*/nullptr));
  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error(
        "Error saving audio file: failed to open memory stream.");
  }

  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
      /*output_encoding=*/sf->encoding);
  chain.addInputTensor(&tensor, sample_rate, channels_first);
  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
  chain.run();

  // Closing the sox_format_t is necessary for flushing the last chunk to the
  // buffer
  sf.close();
  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
}
|
||||||
|
|
||||||
|
} // namespace sox_io
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,61 @@
|
|||||||
|
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.h with modification.
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/sox/utils.h"
|
||||||
|
|
||||||
|
namespace py = pybind11;
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace sox_io {
|
||||||
|
|
||||||
|
auto get_info_file(const std::string &path,
|
||||||
|
const tl::optional<std::string> &format)
|
||||||
|
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
|
||||||
|
|
||||||
|
auto get_info_fileobj(py::object fileobj,
|
||||||
|
const tl::optional<std::string> &format)
|
||||||
|
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
|
||||||
|
|
||||||
|
tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
|
||||||
|
py::object fileobj,
|
||||||
|
const tl::optional<int64_t>& frame_offset,
|
||||||
|
const tl::optional<int64_t>& num_frames,
|
||||||
|
tl::optional<bool> normalize,
|
||||||
|
tl::optional<bool> channels_first,
|
||||||
|
const tl::optional<std::string>& format);
|
||||||
|
|
||||||
|
void save_audio_fileobj(
|
||||||
|
py::object fileobj,
|
||||||
|
py::array tensor,
|
||||||
|
int64_t sample_rate,
|
||||||
|
bool channels_first,
|
||||||
|
tl::optional<double> compression,
|
||||||
|
tl::optional<std::string> format,
|
||||||
|
tl::optional<std::string> encoding,
|
||||||
|
tl::optional<int64_t> bits_per_sample);
|
||||||
|
|
||||||
|
auto get_effects(const tl::optional<int64_t>& frame_offset,
|
||||||
|
const tl::optional<int64_t>& num_frames)
|
||||||
|
-> std::vector<std::vector<std::string>>;
|
||||||
|
|
||||||
|
|
||||||
|
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
|
||||||
|
const std::string& path,
|
||||||
|
const tl::optional<int64_t>& frame_offset,
|
||||||
|
const tl::optional<int64_t>& num_frames,
|
||||||
|
tl::optional<bool> normalize,
|
||||||
|
tl::optional<bool> channels_first,
|
||||||
|
const tl::optional<std::string>& format);
|
||||||
|
|
||||||
|
void save_audio_file(const std::string& path,
|
||||||
|
py::array tensor,
|
||||||
|
int64_t sample_rate,
|
||||||
|
bool channels_first,
|
||||||
|
tl::optional<double> compression,
|
||||||
|
tl::optional<std::string> format,
|
||||||
|
tl::optional<std::string> encoding,
|
||||||
|
tl::optional<int64_t> bits_per_sample);
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace sox_io
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,143 @@
|
|||||||
|
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/sox/types.h"
|
||||||
|
#include <ostream>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace sox_utils {
|
||||||
|
|
||||||
|
// Maps a format name ("wav", "mp3", "flac", "ogg"/"vorbis", "amr-nb",
// "amr-wb", "amb", "sph", "htk", "gsm") to the corresponding Format value.
// Throws std::runtime_error for any other name.
Format get_format_from_string(const std::string& format) {
  struct NamedFormat {
    const char* name;
    Format value;
  };
  static const NamedFormat kKnownFormats[] = {
      {"wav", Format::WAV},
      {"mp3", Format::MP3},
      {"flac", Format::FLAC},
      {"ogg", Format::VORBIS},
      {"vorbis", Format::VORBIS},
      {"amr-nb", Format::AMR_NB},
      {"amr-wb", Format::AMR_WB},
      {"amb", Format::AMB},
      {"sph", Format::SPHERE},
      {"htk", Format::HTK},
      {"gsm", Format::GSM},
  };

  for (const auto& candidate : kKnownFormats) {
    if (format == candidate.name) {
      return candidate.value;
    }
  }
  throw std::runtime_error(
      "Internal Error: unexpected format value: " + format);
}
|
||||||
|
|
||||||
|
// Returns the textual name of an Encoding value.
// Throws std::runtime_error for values without a case below (which includes
// Encoding::NOT_PROVIDED).
std::string to_string(Encoding v) {
  const char* name = nullptr;
  switch (v) {
    case Encoding::UNKNOWN:      name = "UNKNOWN"; break;
    case Encoding::PCM_SIGNED:   name = "PCM_S";   break;
    case Encoding::PCM_UNSIGNED: name = "PCM_U";   break;
    case Encoding::PCM_FLOAT:    name = "PCM_F";   break;
    case Encoding::FLAC:         name = "FLAC";    break;
    case Encoding::ULAW:         name = "ULAW";    break;
    case Encoding::ALAW:         name = "ALAW";    break;
    case Encoding::MP3:          name = "MP3";     break;
    case Encoding::VORBIS:       name = "VORBIS";  break;
    case Encoding::AMR_WB:       name = "AMR_WB";  break;
    case Encoding::AMR_NB:       name = "AMR_NB";  break;
    case Encoding::OPUS:         name = "OPUS";    break;
    default:
      throw std::runtime_error("Internal Error: unexpected encoding.");
  }
  return name;
}
|
||||||
|
|
||||||
|
// Converts an optional encoding name into an Encoding value.
// An empty optional yields Encoding::NOT_PROVIDED; "PCM_S", "PCM_U", "PCM_F",
// "ULAW" and "ALAW" map to their enumerators; any other name throws
// std::runtime_error.
Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
  if (!encoding.has_value()) {
    return Encoding::NOT_PROVIDED;
  }

  struct NamedEncoding {
    const char* name;
    Encoding value;
  };
  static const NamedEncoding kAccepted[] = {
      {"PCM_S", Encoding::PCM_SIGNED},
      {"PCM_U", Encoding::PCM_UNSIGNED},
      {"PCM_F", Encoding::PCM_FLOAT},
      {"ULAW", Encoding::ULAW},
      {"ALAW", Encoding::ALAW},
  };

  const std::string& requested = encoding.value();
  for (const auto& candidate : kAccepted) {
    if (requested == candidate.name) {
      return candidate.value;
    }
  }
  throw std::runtime_error(
      "Internal Error: unexpected encoding value: " + requested);
}
|
||||||
|
|
||||||
|
// Converts an optional bit depth into a BitDepth value.
// An empty optional yields BitDepth::NOT_PROVIDED; 8, 16, 24, 32 and 64 map
// to their enumerators; any other value throws std::runtime_error.
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
  if (!bit_depth.has_value()) {
    return BitDepth::NOT_PROVIDED;
  }

  const int64_t bits = bit_depth.value();
  switch (bits) {
    case 8:  return BitDepth::B8;
    case 16: return BitDepth::B16;
    case 24: return BitDepth::B24;
    case 32: return BitDepth::B32;
    case 64: return BitDepth::B64;
    default: break;
  }
  throw std::runtime_error(
      "Internal Error: unexpected bit depth value: " + std::to_string(bits));
}
|
||||||
|
|
||||||
|
// Returns the textual name used for a libsox encoding.
// Encodings without a dedicated case are reported as "UNKNOWN".
std::string get_encoding(sox_encoding_t encoding) {
  switch (encoding) {
    case SOX_ENCODING_SIGN2:    return "PCM_S";
    case SOX_ENCODING_UNSIGNED: return "PCM_U";
    case SOX_ENCODING_FLOAT:    return "PCM_F";
    case SOX_ENCODING_FLAC:     return "FLAC";
    case SOX_ENCODING_ULAW:     return "ULAW";
    case SOX_ENCODING_ALAW:     return "ALAW";
    case SOX_ENCODING_MP3:      return "MP3";
    case SOX_ENCODING_VORBIS:   return "VORBIS";
    case SOX_ENCODING_AMR_WB:   return "AMR_WB";
    case SOX_ENCODING_AMR_NB:   return "AMR_NB";
    case SOX_ENCODING_OPUS:     return "OPUS";
    case SOX_ENCODING_GSM:      return "GSM";
    case SOX_ENCODING_UNKNOWN:
    default:
      return "UNKNOWN";
  }
}
|
||||||
|
|
||||||
|
} // namespace sox_utils
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,58 @@
|
|||||||
|
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <sox.h>
|
||||||
|
#include "paddleaudio/src/optional/optional.hpp"
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace sox_utils {
|
||||||
|
|
||||||
|
// Audio container / file formats recognised by get_format_from_string.
enum class Format {
  WAV, MP3, FLAC, VORBIS,
  AMR_NB, AMR_WB,
  AMB, SPHERE,
  GSM, HTK,
};
|
||||||
|
|
||||||
|
Format get_format_from_string(const std::string& format);
|
||||||
|
|
||||||
|
// Sample encodings. NOT_PROVIDED is what get_encoding_from_option returns
// for an empty optional.
enum class Encoding {
  NOT_PROVIDED, UNKNOWN,
  PCM_SIGNED, PCM_UNSIGNED, PCM_FLOAT,
  FLAC, ULAW, ALAW,
  MP3, VORBIS,
  AMR_WB, AMR_NB,
  OPUS,
};
|
||||||
|
|
||||||
|
std::string to_string(Encoding v);
|
||||||
|
Encoding get_encoding_from_option(const tl::optional<std::string> encoding);
|
||||||
|
|
||||||
|
// Supported bit depths; each enumerator's value is its number of bits, and
// NOT_PROVIDED (0) is what get_bit_depth_from_option returns for an empty
// optional.
enum class BitDepth : unsigned {
  NOT_PROVIDED = 0,
  B8 = 8, B16 = 16, B24 = 24, B32 = 32, B64 = 64,
};
|
||||||
|
|
||||||
|
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);
|
||||||
|
|
||||||
|
std::string get_encoding(sox_encoding_t encoding);
|
||||||
|
|
||||||
|
} // namespace sox_utils
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,550 @@
|
|||||||
|
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.cpp with modification.
|
||||||
|
#include <sox.h>
|
||||||
|
|
||||||
|
#include "paddleaudio/src/pybind/sox/utils.h"
|
||||||
|
#include "paddleaudio/src/pybind/sox/types.h"
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace sox_utils {
|
||||||
|
|
||||||
|
auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
|
||||||
|
-> uint64_t {
|
||||||
|
uint64_t num_read = 0;
|
||||||
|
while (num_read < size) {
|
||||||
|
auto request = size - num_read;
|
||||||
|
auto chunk = static_cast<std::string>(
|
||||||
|
static_cast<py::bytes>(fileobj->attr("read")(request)));
|
||||||
|
auto chunk_len = chunk.length();
|
||||||
|
if (chunk_len == 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (chunk_len > request) {
|
||||||
|
std::ostringstream message;
|
||||||
|
message
|
||||||
|
<< "Requested up to " << request << " bytes but, "
|
||||||
|
<< "received " << chunk_len << " bytes. "
|
||||||
|
<< "The given object does not confirm to read protocol of file "
|
||||||
|
"object.";
|
||||||
|
throw std::runtime_error(message.str());
|
||||||
|
}
|
||||||
|
memcpy(buffer, chunk.data(), chunk_len);
|
||||||
|
buffer += chunk_len;
|
||||||
|
num_read += chunk_len;
|
||||||
|
}
|
||||||
|
return num_read;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Stores `seed` (narrowed to sox_int32_t) in the `ranqd1` field of the
// libsox globals.
void set_seed(const int64_t seed) {
  auto *globals = sox_get_globals();
  globals->ranqd1 = static_cast<sox_int32_t>(seed);
}
|
||||||
|
|
||||||
|
// Stores `verbosity` (converted to unsigned) in the libsox globals.
void set_verbosity(const int64_t verbosity) {
  auto *globals = sox_get_globals();
  globals->verbosity = static_cast<unsigned>(verbosity);
}
|
||||||
|
|
||||||
|
void set_use_threads(const bool use_threads) {
|
||||||
|
sox_get_globals()->use_threads = static_cast<sox_bool>(use_threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stores `buffer_size` (converted to size_t) in the `bufsiz` field of the
// libsox globals.
void set_buffer_size(const int64_t buffer_size) {
  auto *globals = sox_get_globals();
  globals->bufsiz = static_cast<size_t>(buffer_size);
}
|
||||||
|
|
||||||
|
int64_t get_buffer_size() {
|
||||||
|
return sox_get_globals()->bufsiz;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<std::string>> list_effects() {
|
||||||
|
std::vector<std::vector<std::string>> effects;
|
||||||
|
for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
|
||||||
|
const sox_effect_handler_t* handler = (*fns)();
|
||||||
|
if (handler && handler->name) {
|
||||||
|
if (UNSUPPORTED_EFFECTS.find(handler->name) ==
|
||||||
|
UNSUPPORTED_EFFECTS.end()) {
|
||||||
|
effects.emplace_back(std::vector<std::string>{
|
||||||
|
handler->name,
|
||||||
|
handler->usage ? std::string(handler->usage) : std::string("")});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return effects;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> list_write_formats() {
|
||||||
|
std::vector<std::string> formats;
|
||||||
|
for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
|
||||||
|
const sox_format_handler_t* handler = fns->fn();
|
||||||
|
for (const char* const* names = handler->names; *names; ++names) {
|
||||||
|
if (!strchr(*names, '/') && handler->write)
|
||||||
|
formats.emplace_back(*names);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return formats;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> list_read_formats() {
|
||||||
|
std::vector<std::string> formats;
|
||||||
|
for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
|
||||||
|
const sox_format_handler_t* handler = fns->fn();
|
||||||
|
for (const char* const* names = handler->names; *names; ++names) {
|
||||||
|
if (!strchr(*names, '/') && handler->read)
|
||||||
|
formats.emplace_back(*names);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return formats;
|
||||||
|
}
|
||||||
|
|
||||||
|
SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
|
||||||
|
SoxFormat::~SoxFormat() {
|
||||||
|
close();
|
||||||
|
}
|
||||||
|
|
||||||
|
sox_format_t* SoxFormat::operator->() const noexcept {
|
||||||
|
return fd_;
|
||||||
|
}
|
||||||
|
SoxFormat::operator sox_format_t*() const noexcept {
|
||||||
|
return fd_;
|
||||||
|
}
|
||||||
|
|
||||||
|
void SoxFormat::close() {
|
||||||
|
if (fd_ != nullptr) {
|
||||||
|
sox_close(fd_);
|
||||||
|
fd_ = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void validate_input_file(const SoxFormat& sf, const std::string& path) {
|
||||||
|
if (static_cast<sox_format_t*>(sf) == nullptr) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Error loading audio file: failed to open file " + path);
|
||||||
|
}
|
||||||
|
if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
|
||||||
|
throw std::runtime_error("Error loading audio file: unknown encoding.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void validate_input_memfile(const SoxFormat &sf) {
|
||||||
|
return validate_input_file(sf, "<in memory buffer>");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reject arrays that are not 2-D or whose numpy type character is not one of
/// 'f', 'd', 'l' or 'i'.
///
/// @throws std::runtime_error on either violation.
void validate_input_tensor(const py::array tensor) {
  if (tensor.ndim() != 2) {
    throw std::runtime_error("Input tensor has to be 2D.");
  }

  // NOTE(review): the accepted characters do not match the error text below
  // (which names int16 and uint8 but not 'd'/'l'), nor the dtype numbers and
  // characters handled elsewhere in this file. Confirm the intended set
  // before changing either side.
  switch (tensor.dtype().char_()) {
    case 'f':
    case 'd':
    case 'l':
    case 'i':
      return;
    default:
      throw std::runtime_error(
          "Input tensor has to be one of float32, int32, int16 or uint8 type.");
  }
}
|
||||||
|
|
||||||
|
/// Map a libsox encoding and precision to the numpy dtype used for decoded
/// samples.
///
/// @param encoding  Encoding reported by libsox.
/// @param precision Bits of precision; only consulted for SOX_ENCODING_SIGN2.
/// @return "u1" for unsigned PCM; "i2" or "i" for signed PCM; "f" otherwise.
/// @throws std::runtime_error for signed PCM with a precision other than
///         16, 24 or 32.
py::dtype get_dtype(
    const sox_encoding_t encoding,
    const unsigned precision) {
  // py::dtype is built from numpy format *strings*. The previous character
  // literals ('u1', 'i') were integers, not format strings.
  switch (encoding) {
    case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
      return py::dtype("u1");
    case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV
      switch (precision) {
        case 16:
          return py::dtype("i2");
        case 24: // 24-bit samples are widened to the 32-bit integer dtype.
        case 32:
          return py::dtype("i");
        default:
          throw std::runtime_error(
              "Only 16, 24, and 32 bits are supported for signed PCM.");
      }
    default:
      // Everything else defaults to float32, including
      // 32-bit floating-point WAV, MP3, FLAC, VORBIS etc...
      return py::dtype("f");
  }
}
|
||||||
|
|
||||||
|
namespace {

/// Copy a [num_rows, num_channels] array holding elements of type T into a
/// newly allocated [num_channels, num_rows] array of the given dtype.
template <typename T>
py::array to_channels_first(
    const py::array& interleaved,
    const py::dtype& dtype,
    const int32_t num_rows,
    const int32_t num_channels) {
  py::array planar = py::array(dtype, {num_channels, num_rows});
  for (int32_t ch = 0; ch < num_channels; ++ch) {
    for (int32_t frame = 0; frame < num_rows; ++frame) {
      *static_cast<T*>(planar.mutable_data(ch, frame)) =
          *static_cast<const T*>(interleaved.data(frame, ch));
    }
  }
  return planar;
}

} // namespace

/// Convert a buffer of sox samples into a 2-D numpy array.
///
/// @param buffer         Interleaved samples.
/// @param num_samples    Number of entries in `buffer`. Entries beyond the last
///                       complete frame (num_samples / num_channels frames) are
///                       ignored.
/// @param num_channels   Number of channels; must be positive.
/// @param dtype          Requested output dtype, selected by its numpy type
///                       character: 'f', 'i', 'h', or 'b'/'B'.
/// @param normalize      When true the output is always float32, whatever
///                       `dtype` says.
/// @param channels_first When true the result has shape
///                       [num_channels, num_frames]; otherwise
///                       [num_frames, num_channels].
/// @throws std::runtime_error if `num_channels` is not positive or the dtype is
///         not supported.
py::array convert_to_tensor(
    sox_sample_t* buffer,
    const int32_t num_samples,
    const int32_t num_channels,
    const py::dtype dtype,
    const bool normalize,
    const bool channels_first) {
  if (num_channels <= 0) {
    throw std::runtime_error("Number of channels has to be positive.");
  }

  const int32_t num_rows = num_samples / num_channels;
  // Never write past the allocated rows when num_samples is not a multiple of
  // num_channels.
  const int32_t num_elements = num_rows * num_channels;
  const char kind = dtype.char_();

  // Clip counter and temporaries that the SOX_SAMPLE_TO_* macros expect to find
  // in the enclosing scope.
  uint64_t dummy = 0;
  SOX_SAMPLE_LOCALS;

  if (normalize || kind == 'f') {
    // The values written are floats, so the array must be float32 even when a
    // narrower integer dtype was requested together with `normalize`.
    const py::dtype f32 = py::dtype::of<float>();
    py::array t = py::array(f32, {num_rows, num_channels});
    // mutable_data() without indices is not bounds-checked, so an empty array
    // is handled as well.
    auto* out = static_cast<float*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      out[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return to_channels_first<float>(t, f32, num_rows, num_channels);
    }
    return t;
  }

  if (kind == 'i') {
    py::array t = py::array(dtype, {num_rows, num_channels});
    auto* out = static_cast<int*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      out[i] = buffer[i];
    }
    if (channels_first) {
      return to_channels_first<int>(t, dtype, num_rows, num_channels);
    }
    return t;
  }

  if (kind == 'h') { // int16
    py::array t = py::array(dtype, {num_rows, num_channels});
    auto* out = static_cast<int16_t*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      out[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return to_channels_first<int16_t>(t, dtype, num_rows, num_channels);
    }
    return t;
  }

  // 'B' is the type character of the "u1" dtype; 'b' is kept for callers that
  // relied on the previous check.
  if (kind == 'b' || kind == 'B') {
    py::array t = py::array(dtype, {num_rows, num_channels});
    auto* out = static_cast<uint8_t*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      out[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return to_channels_first<uint8_t>(t, dtype, num_rows, num_channels);
    }
    return t;
  }

  throw std::runtime_error("Unsupported dtype.");
}
|
||||||
|
|
||||||
|
/// Return the lower-cased text that follows the last '.' in `path`.
///
/// When `path` contains no '.', find_last_of() yields npos and `npos + 1`
/// wraps to 0, so the whole path is returned lower-cased.
const std::string get_filetype(const std::string path) {
  std::string ext = path.substr(path.find_last_of(".") + 1);
  // tolower() has undefined behaviour for negative char values, so each byte
  // is passed through unsigned char first.
  std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  });
  return ext;
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
std::tuple<sox_encoding_t, unsigned> get_save_encoding_for_wav(
|
||||||
|
const std::string format,
|
||||||
|
py::dtype dtype,
|
||||||
|
const Encoding& encoding,
|
||||||
|
const BitDepth& bits_per_sample) {
|
||||||
|
switch (encoding) {
|
||||||
|
case Encoding::NOT_PROVIDED:
|
||||||
|
switch (bits_per_sample) {
|
||||||
|
case BitDepth::NOT_PROVIDED:
|
||||||
|
switch (dtype.num()) {
|
||||||
|
case 11: // float32 numpy dtype num
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
|
||||||
|
case 5: // int numpy dtype num
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
|
||||||
|
case 3: // int16 numpy
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
|
||||||
|
case 1: // byte numpy
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
|
||||||
|
default:
|
||||||
|
throw std::runtime_error("Internal Error: Unexpected dtype.");
|
||||||
|
}
|
||||||
|
case BitDepth::B8:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
|
||||||
|
default:
|
||||||
|
return std::make_tuple<>(
|
||||||
|
SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
|
||||||
|
}
|
||||||
|
case Encoding::PCM_SIGNED:
|
||||||
|
switch (bits_per_sample) {
|
||||||
|
case BitDepth::NOT_PROVIDED:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
|
||||||
|
case BitDepth::B8:
|
||||||
|
throw std::runtime_error(
|
||||||
|
format + " does not support 8-bit signed PCM encoding.");
|
||||||
|
default:
|
||||||
|
return std::make_tuple<>(
|
||||||
|
SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
|
||||||
|
}
|
||||||
|
case Encoding::PCM_UNSIGNED:
|
||||||
|
switch (bits_per_sample) {
|
||||||
|
case BitDepth::NOT_PROVIDED:
|
||||||
|
case BitDepth::B8:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
|
||||||
|
default:
|
||||||
|
throw std::runtime_error(
|
||||||
|
format + " only supports 8-bit for unsigned PCM encoding.");
|
||||||
|
}
|
||||||
|
case Encoding::PCM_FLOAT:
|
||||||
|
switch (bits_per_sample) {
|
||||||
|
case BitDepth::NOT_PROVIDED:
|
||||||
|
case BitDepth::B32:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
|
||||||
|
case BitDepth::B64:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_FLOAT, 64);
|
||||||
|
default:
|
||||||
|
throw std::runtime_error(
|
||||||
|
format +
|
||||||
|
" only supports 32-bit or 64-bit for floating-point PCM encoding.");
|
||||||
|
}
|
||||||
|
case Encoding::ULAW:
|
||||||
|
switch (bits_per_sample) {
|
||||||
|
case BitDepth::NOT_PROVIDED:
|
||||||
|
case BitDepth::B8:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
|
||||||
|
default:
|
||||||
|
throw std::runtime_error(
|
||||||
|
format + " only supports 8-bit for mu-law encoding.");
|
||||||
|
}
|
||||||
|
case Encoding::ALAW:
|
||||||
|
switch (bits_per_sample) {
|
||||||
|
case BitDepth::NOT_PROVIDED:
|
||||||
|
case BitDepth::B8:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
|
||||||
|
default:
|
||||||
|
throw std::runtime_error(
|
||||||
|
format + " only supports 8-bit for a-law encoding.");
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
throw std::runtime_error(
|
||||||
|
format + " does not support encoding: " + to_string(encoding));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::tuple<sox_encoding_t, unsigned> get_save_encoding(
|
||||||
|
const std::string& format,
|
||||||
|
const py::dtype dtype,
|
||||||
|
const tl::optional<std::string> encoding,
|
||||||
|
const tl::optional<int64_t> bits_per_sample) {
|
||||||
|
const Format fmt = get_format_from_string(format);
|
||||||
|
const Encoding enc = get_encoding_from_option(encoding);
|
||||||
|
const BitDepth bps = get_bit_depth_from_option(bits_per_sample);
|
||||||
|
|
||||||
|
switch (fmt) {
|
||||||
|
case Format::WAV:
|
||||||
|
case Format::AMB:
|
||||||
|
return get_save_encoding_for_wav(format, dtype, enc, bps);
|
||||||
|
case Format::MP3:
|
||||||
|
if (enc != Encoding::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error("mp3 does not support `encoding` option.");
|
||||||
|
if (bps != BitDepth::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error(
|
||||||
|
"mp3 does not support `bits_per_sample` option.");
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_MP3, 16);
|
||||||
|
case Format::HTK:
|
||||||
|
if (enc != Encoding::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error("htk does not support `encoding` option.");
|
||||||
|
if (bps != BitDepth::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error(
|
||||||
|
"htk does not support `bits_per_sample` option.");
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
|
||||||
|
case Format::VORBIS:
|
||||||
|
if (enc != Encoding::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error("vorbis does not support `encoding` option.");
|
||||||
|
if (bps != BitDepth::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error(
|
||||||
|
"vorbis does not support `bits_per_sample` option.");
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_VORBIS, 16);
|
||||||
|
case Format::AMR_NB:
|
||||||
|
if (enc != Encoding::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error("amr-nb does not support `encoding` option.");
|
||||||
|
if (bps != BitDepth::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error(
|
||||||
|
"amr-nb does not support `bits_per_sample` option.");
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16);
|
||||||
|
case Format::FLAC:
|
||||||
|
if (enc != Encoding::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error("flac does not support `encoding` option.");
|
||||||
|
switch (bps) {
|
||||||
|
case BitDepth::B32:
|
||||||
|
case BitDepth::B64:
|
||||||
|
throw std::runtime_error(
|
||||||
|
"flac does not support `bits_per_sample` larger than 24.");
|
||||||
|
default:
|
||||||
|
return std::make_tuple<>(
|
||||||
|
SOX_ENCODING_FLAC, static_cast<unsigned>(bps));
|
||||||
|
}
|
||||||
|
case Format::SPHERE:
|
||||||
|
switch (enc) {
|
||||||
|
case Encoding::NOT_PROVIDED:
|
||||||
|
case Encoding::PCM_SIGNED:
|
||||||
|
switch (bps) {
|
||||||
|
case BitDepth::NOT_PROVIDED:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
|
||||||
|
default:
|
||||||
|
return std::make_tuple<>(
|
||||||
|
SOX_ENCODING_SIGN2, static_cast<unsigned>(bps));
|
||||||
|
}
|
||||||
|
case Encoding::PCM_UNSIGNED:
|
||||||
|
throw std::runtime_error(
|
||||||
|
"sph does not support unsigned integer PCM.");
|
||||||
|
case Encoding::PCM_FLOAT:
|
||||||
|
throw std::runtime_error("sph does not support floating point PCM.");
|
||||||
|
case Encoding::ULAW:
|
||||||
|
switch (bps) {
|
||||||
|
case BitDepth::NOT_PROVIDED:
|
||||||
|
case BitDepth::B8:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
|
||||||
|
default:
|
||||||
|
throw std::runtime_error(
|
||||||
|
"sph only supports 8-bit for mu-law encoding.");
|
||||||
|
}
|
||||||
|
case Encoding::ALAW:
|
||||||
|
switch (bps) {
|
||||||
|
case BitDepth::NOT_PROVIDED:
|
||||||
|
case BitDepth::B8:
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
|
||||||
|
default:
|
||||||
|
return std::make_tuple<>(
|
||||||
|
SOX_ENCODING_ALAW, static_cast<unsigned>(bps));
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
throw std::runtime_error(
|
||||||
|
"sph does not support encoding: " + encoding.value());
|
||||||
|
}
|
||||||
|
case Format::GSM:
|
||||||
|
if (enc != Encoding::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error("gsm does not support `encoding` option.");
|
||||||
|
if (bps != BitDepth::NOT_PROVIDED)
|
||||||
|
throw std::runtime_error(
|
||||||
|
"gsm does not support `bits_per_sample` option.");
|
||||||
|
return std::make_tuple<>(SOX_ENCODING_GSM, 16);
|
||||||
|
|
||||||
|
default:
|
||||||
|
throw std::runtime_error("Unsupported format: " + format);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned get_precision(const std::string filetype, py::dtype dtype) {
|
||||||
|
if (filetype == "mp3")
|
||||||
|
return SOX_UNSPEC;
|
||||||
|
if (filetype == "flac")
|
||||||
|
return 24;
|
||||||
|
if (filetype == "ogg" || filetype == "vorbis")
|
||||||
|
return SOX_UNSPEC;
|
||||||
|
if (filetype == "wav" || filetype == "amb") {
|
||||||
|
switch (dtype.num()) {
|
||||||
|
case 1: // byte in numpy dype num
|
||||||
|
return 8;
|
||||||
|
case 3: // short, in numpy dtype num
|
||||||
|
return 16;
|
||||||
|
case 5: // int, numpy dtype
|
||||||
|
return 32;
|
||||||
|
case 11: // float, numpy dtype
|
||||||
|
return 32;
|
||||||
|
default:
|
||||||
|
throw std::runtime_error("Unsupported dtype.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (filetype == "sph")
|
||||||
|
return 32;
|
||||||
|
if (filetype == "amr-nb") {
|
||||||
|
return 16;
|
||||||
|
}
|
||||||
|
if (filetype == "gsm") {
|
||||||
|
return 16;
|
||||||
|
}
|
||||||
|
if (filetype == "htk") {
|
||||||
|
return 16;
|
||||||
|
}
|
||||||
|
throw std::runtime_error("Unsupported file type: " + filetype);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
/// Build a sox_signalinfo_t that describes `waveform`.
///
/// @param waveform       2-D array; the channel count is read from axis 0 when
///                       `channels_first` is true, otherwise from axis 1.
/// @param sample_rate    Sample rate, converted to sox_rate_t.
/// @param filetype       File type name, passed to get_precision().
/// @param channels_first Selects which axis holds the channels.
/// @return Signal info whose length is the total number of array elements.
sox_signalinfo_t get_signalinfo(
    const py::array* waveform,
    const int64_t sample_rate,
    const std::string filetype,
    const bool channels_first) {
  const int channel_axis = channels_first ? 0 : 1;

  const auto rate = static_cast<sox_rate_t>(sample_rate);
  const auto channels = static_cast<unsigned>(waveform->shape(channel_axis));
  const unsigned precision = get_precision(filetype, waveform->dtype());
  const auto length = static_cast<uint64_t>(waveform->size());

  return sox_signalinfo_t{
      /*rate=*/rate,
      /*channels=*/channels,
      /*precision=*/precision,
      /*length=*/length};
}
|
||||||
|
|
||||||
|
/// Describe how samples of the given numpy dtype are encoded, for handing array
/// data to libsox.
///
/// @param dtype Array dtype; its numpy type number selects the encoding and the
///              bit depth.
/// @throws std::runtime_error for dtype numbers other than 1, 3, 5 and 11.
sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
  sox_encoding_t encoding;
  unsigned bits_per_sample;

  switch (dtype.num()) {
    case 1: // byte
      encoding = SOX_ENCODING_UNSIGNED;
      bits_per_sample = 8;
      break;
    case 3: // short
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 16;
      break;
    case 5: // int32
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 32;
      break;
    case 11: // float
      encoding = SOX_ENCODING_FLOAT;
      bits_per_sample = 32;
      break;
    default:
      throw std::runtime_error("Unsupported dtype.");
  }

  return sox_encodinginfo_t{
      /*encoding=*/encoding,
      /*bits_per_sample=*/bits_per_sample,
      /*compression=*/HUGE_VAL,
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
}
|
||||||
|
|
||||||
|
/// Build the sox_encodinginfo_t used when saving audio in `format`.
///
/// The encoding and bit depth come from get_save_encoding(). `compression`
/// falls back to HUGE_VAL when it is not provided.
///
/// @throws std::runtime_error propagated from get_save_encoding().
sox_encodinginfo_t get_encodinginfo_for_save(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<double> compression,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  const auto selected =
      get_save_encoding(format, dtype, encoding, bits_per_sample);
  const sox_encoding_t sox_encoding = std::get<0>(selected);
  const unsigned sox_bits = std::get<1>(selected);
  const double sox_compression = compression.value_or(HUGE_VAL);

  return sox_encodinginfo_t{
      /*encoding=*/sox_encoding,
      /*bits_per_sample=*/sox_bits,
      /*compression=*/sox_compression,
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
}
|
||||||
|
|
||||||
|
} // namespace paddleaudio
|
||||||
|
} // namespace sox_utils
|
@ -0,0 +1,114 @@
|
|||||||
|
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.h with modification.
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <pybind11/pybind11.h>
|
||||||
|
#include <pybind11/numpy.h>
|
||||||
|
#include <sox.h>
|
||||||
|
#include "paddleaudio/src/optional/optional.hpp"
|
||||||
|
|
||||||
|
namespace py = pybind11;
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
namespace sox_utils {
|
||||||
|
|
||||||
|
auto read_fileobj(py::object *fileobj, uint64_t size, char *buffer) -> uint64_t;
|
||||||
|
|
||||||
|
void set_seed(const int64_t seed);
|
||||||
|
|
||||||
|
void set_verbosity(const int64_t verbosity);
|
||||||
|
|
||||||
|
void set_use_threads(const bool use_threads);
|
||||||
|
|
||||||
|
void set_buffer_size(const int64_t buffer_size);
|
||||||
|
|
||||||
|
int64_t get_buffer_size();
|
||||||
|
|
||||||
|
std::vector<std::vector<std::string>> list_effects();
|
||||||
|
|
||||||
|
std::vector<std::string> list_read_formats();
|
||||||
|
|
||||||
|
std::vector<std::string> list_write_formats();
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Utilities for sox_io / sox_effects implementations
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
const std::unordered_set<std::string> UNSUPPORTED_EFFECTS =
|
||||||
|
{"input", "output", "spectrogram", "noiseprof", "noisered", "splice"};
|
||||||
|
|
||||||
|
/// helper class to automatically close sox_format_t*
|
||||||
|
struct SoxFormat {
|
||||||
|
explicit SoxFormat(sox_format_t* fd) noexcept;
|
||||||
|
SoxFormat(const SoxFormat& other) = delete;
|
||||||
|
SoxFormat(SoxFormat&& other) = delete;
|
||||||
|
SoxFormat& operator=(const SoxFormat& other) = delete;
|
||||||
|
SoxFormat& operator=(SoxFormat&& other) = delete;
|
||||||
|
~SoxFormat();
|
||||||
|
sox_format_t* operator->() const noexcept;
|
||||||
|
operator sox_format_t*() const noexcept;
|
||||||
|
|
||||||
|
void close();
|
||||||
|
|
||||||
|
private:
|
||||||
|
sox_format_t* fd_;
|
||||||
|
};
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Verify that input Tensor is 2D, CPU and either uint8, int16, int32 or float32
|
||||||
|
void validate_input_tensor(const py::array);
|
||||||
|
|
||||||
|
void validate_input_file(const SoxFormat& sf, const std::string& path);
|
||||||
|
|
||||||
|
void validate_input_memfile(const SoxFormat &sf);
|
||||||
|
///
|
||||||
|
/// Get target dtype for the given encoding and precision.
|
||||||
|
py::dtype get_dtype(
|
||||||
|
const sox_encoding_t encoding,
|
||||||
|
const unsigned precision);
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
|
||||||
|
/// NOTE: This function might modify the values in the input buffer to
|
||||||
|
/// reduce the number of memory copy.
|
||||||
|
/// @param buffer Pointer to buffer that contains audio data.
|
||||||
|
/// @param num_samples The number of samples to read.
|
||||||
|
/// @param num_channels The number of channels. Used to reshape the resulting
|
||||||
|
/// Tensor.
|
||||||
|
/// @param dtype Target dtype. Determines the output dtype and value range in
|
||||||
|
/// conjunction with normalization.
|
||||||
|
/// @param normalize Perform normalization. Only effective when dtype is not
|
||||||
|
/// kFloat32. When effective, the output tensor is kFloat32 type and value range
|
||||||
|
/// is [-1.0, 1.0]
|
||||||
|
/// @param channels_first When True, output Tensor has shape of [num_channels,
|
||||||
|
/// num_frames].
|
||||||
|
py::array convert_to_tensor(
|
||||||
|
sox_sample_t* buffer,
|
||||||
|
const int32_t num_samples,
|
||||||
|
const int32_t num_channels,
|
||||||
|
const py::dtype dtype,
|
||||||
|
const bool normalize,
|
||||||
|
const bool channels_first);
|
||||||
|
|
||||||
|
/// Extract extension from file path
|
||||||
|
const std::string get_filetype(const std::string path);
|
||||||
|
|
||||||
|
/// Get sox_signalinfo_t for passing a py::array object.
|
||||||
|
sox_signalinfo_t get_signalinfo(
|
||||||
|
const py::array* waveform,
|
||||||
|
const int64_t sample_rate,
|
||||||
|
const std::string filetype,
|
||||||
|
const bool channels_first);
|
||||||
|
|
||||||
|
/// Get sox_encodinginfo_t for Tensor I/O
|
||||||
|
sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype);
|
||||||
|
|
||||||
|
/// Get sox_encodinginfo_t for saving to file/file object
|
||||||
|
sox_encodinginfo_t get_encodinginfo_for_save(
|
||||||
|
const std::string& format,
|
||||||
|
const py::dtype dtype,
|
||||||
|
const tl::optional<double> compression,
|
||||||
|
const tl::optional<std::string> encoding,
|
||||||
|
const tl::optional<int64_t> bits_per_sample);
|
||||||
|
|
||||||
|
} // namespace sox_utils
} // namespace paddleaudio
|
@ -0,0 +1,35 @@
|
|||||||
|
// this is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/utils.cpp with modification.
|
||||||
|
|
||||||
|
namespace paddleaudio {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Reports whether this translation unit was compiled with INCLUDE_SOX defined.
bool is_sox_available() {
#if defined(INCLUDE_SOX)
  constexpr bool built_with_sox = true;
#else
  constexpr bool built_with_sox = false;
#endif
  return built_with_sox;
}
|
||||||
|
|
||||||
|
// Reports whether this translation unit was compiled with INCLUDE_KALDI
// defined.
bool is_kaldi_available() {
#if defined(INCLUDE_KALDI)
  constexpr bool built_with_kaldi = true;
#else
  constexpr bool built_with_kaldi = false;
#endif
  return built_with_kaldi;
}
|
||||||
|
|
||||||
|
// Reports whether paddleaudio was compiled with ffmpeg support (USE_FFMPEG
// defined). This is a build-time fact; it says nothing about whether ffmpeg is
// available at runtime.
bool is_ffmpeg_available() {
#if defined(USE_FFMPEG)
  constexpr bool built_with_ffmpeg = true;
#else
  constexpr bool built_with_ffmpeg = false;
#endif
  return built_with_ffmpeg;
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
} // namespace paddleaudio
|
@ -0,0 +1,2 @@
|
|||||||
|
archives/
|
||||||
|
install/
|
@ -0,0 +1,15 @@
|
|||||||
|
# Compile the bundled C++ third-party code with hidden symbol visibility.
# The visibility preset lets CMake emit the right flag for compilers that
# support it, instead of passing a raw `-fvisibility=hidden` to every compiler
# (including MSVC, which does not understand it).
set(CMAKE_CXX_VISIBILITY_PRESET hidden)

################################################################################
# sox
################################################################################
if(BUILD_SOX)
  add_subdirectory(sox)
endif()

################################################################################
# kaldi
################################################################################
if(BUILD_KALDI)
  add_subdirectory(kaldi)
endif()
|
@ -0,0 +1,111 @@
|
|||||||
|
# Builds the Kaldi pieces used by paddleaudio as static libraries and exposes
# them through the `libkaldi` INTERFACE target.

# See thirdparty/kaldi/base/kaldi-types.h: Kaldi is compiled without OpenFst.
add_compile_definitions(COMPILE_WITHOUT_OPENFST)

# Copy the Kaldi sources from speechx next to this file on first configure.
# Only the presence of `base` is checked.
if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/base")
  set(speechx_kaldi_dir
      "${CMAKE_CURRENT_SOURCE_DIR}/../../../../speechx/speechx/kaldi")
  foreach(kaldi_module IN ITEMS base feat matrix util)
    file(COPY "${speechx_kaldi_dir}/${kaldi_module}"
         DESTINATION "${CMAKE_CURRENT_LIST_DIR}")
  endforeach()
endif()

# kaldi-base
add_library(kaldi-base STATIC
  base/io-funcs.cc
  base/kaldi-error.cc
  base/kaldi-math.cc
  base/kaldi-utils.cc
  base/timer.cc
)
target_include_directories(kaldi-base PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")

# kaldi-matrix
add_library(kaldi-matrix STATIC
  matrix/compressed-matrix.cc
  matrix/matrix-functions.cc
  matrix/kaldi-matrix.cc
  matrix/kaldi-vector.cc
  matrix/optimization.cc
  matrix/packed-matrix.cc
  matrix/qr.cc
  matrix/sparse-matrix.cc
  matrix/sp-matrix.cc
  matrix/srfft.cc
  matrix/tp-matrix.cc
)
target_include_directories(kaldi-matrix PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")

# The OpenBLAS link name differs between MSVC and the other toolchains.
if(NOT MSVC)
  target_link_libraries(kaldi-matrix PUBLIC kaldi-base libopenblas)
else()
  target_link_libraries(kaldi-matrix PUBLIC kaldi-base openblas)
endif()

# kaldi-util
add_library(kaldi-util STATIC
  util/kaldi-holder.cc
  util/kaldi-io.cc
  util/kaldi-semaphore.cc
  util/kaldi-table.cc
  util/kaldi-thread.cc
  util/parse-options.cc
  util/simple-io-funcs.cc
  util/simple-options.cc
  util/text-utils.cc
)
target_include_directories(kaldi-util PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)

# kaldi-feat-common
add_library(kaldi-feat-common STATIC
  feat/cmvn.cc
  feat/feature-functions.cc
  feat/feature-window.cc
  feat/mel-computations.cc
  feat/pitch-functions.cc
  feat/resample.cc
  feat/signal.cc
  feat/wave-reader.cc
)
target_include_directories(kaldi-feat-common PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)

# kaldi-mfcc
add_library(kaldi-mfcc STATIC
  feat/feature-mfcc.cc
)
target_include_directories(kaldi-mfcc PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)

# kaldi-fbank
add_library(kaldi-fbank STATIC
  feat/feature-fbank.cc
)
target_include_directories(kaldi-fbank PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)

# Full paths of the archives built above. $<TARGET_FILE:...> follows whatever
# output directory and file name the generator actually uses, rather than
# assuming `${CMAKE_CURRENT_BINARY_DIR}/lib<name>.a`.
set(KALDI_LIBRARIES
  $<TARGET_FILE:kaldi-base>
  $<TARGET_FILE:kaldi-matrix>
  $<TARGET_FILE:kaldi-util>
  $<TARGET_FILE:kaldi-feat-common>
  $<TARGET_FILE:kaldi-mfcc>
  $<TARGET_FILE:kaldi-fbank>
)

# Umbrella target that consumers link against.
add_library(libkaldi INTERFACE)
# The archives are linked by path on non-MSVC platforms, so the build order has
# to be stated explicitly.
add_dependencies(libkaldi
  kaldi-base
  kaldi-matrix
  kaldi-util
  kaldi-feat-common
  kaldi-mfcc
  kaldi-fbank
)
target_include_directories(libkaldi INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}")

if(APPLE)
  target_link_libraries(libkaldi INTERFACE
    ${KALDI_LIBRARIES}
    libopenblas
    ${GFORTRAN_LIBRARIES_DIR}/libgfortran.a
    ${GFORTRAN_LIBRARIES_DIR}/libquadmath.a
    ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib
  )
elseif(MSVC)
  target_link_libraries(libkaldi INTERFACE
    kaldi-base
    kaldi-matrix
    kaldi-util
    kaldi-feat-common
    kaldi-mfcc
    kaldi-fbank
    openblas
  )
else()
  # The archives are wrapped in a linker group and linked as whole archives.
  target_link_libraries(libkaldi INTERFACE
    -Wl,--start-group
    -Wl,--whole-archive
    ${KALDI_LIBRARIES}
    libopenblas.a
    gfortran
    -Wl,--no-whole-archive
    -Wl,--end-group
  )
endif()

target_compile_definitions(libkaldi INTERFACE COMPILE_WITHOUT_OPENFST)
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,86 @@
|
|||||||
|
See the following for the origin of this patch
|
||||||
|
http://www.linuxfromscratch.org/blfs/view/svn/multimedia/libmad.html
|
||||||
|
http://www.linuxfromscratch.org/patches/blfs/svn/libmad-0.15.1b-fixes-1.patch
|
||||||
|
--- src/libmad/configure 2004-02-05 09:34:07.000000000 +0000
|
||||||
|
+++ src/libmad/configure.new 2020-06-30 21:10:28.528018931 +0000
|
||||||
|
@@ -19083,71 +19083,7 @@
|
||||||
|
|
||||||
|
if test "$GCC" = yes
|
||||||
|
then
|
||||||
|
- if test -z "$arch"
|
||||||
|
- then
|
||||||
|
- case "$host" in
|
||||||
|
- i386-*) ;;
|
||||||
|
- i?86-*) arch="-march=i486" ;;
|
||||||
|
- arm*-empeg-*) arch="-march=armv4 -mtune=strongarm1100" ;;
|
||||||
|
- armv4*-*) arch="-march=armv4 -mtune=strongarm" ;;
|
||||||
|
- powerpc-*) ;;
|
||||||
|
- mips*-agenda-*) arch="-mcpu=vr4100" ;;
|
||||||
|
- mips*-luxsonor-*) arch="-mips1 -mcpu=r3000 -Wa,-m4010" ;;
|
||||||
|
- esac
|
||||||
|
- fi
|
||||||
|
-
|
||||||
|
- case "$optimize" in
|
||||||
|
- -O|"-O "*)
|
||||||
|
- optimize="-O"
|
||||||
|
- optimize="$optimize -fforce-mem"
|
||||||
|
- optimize="$optimize -fforce-addr"
|
||||||
|
- : #x optimize="$optimize -finline-functions"
|
||||||
|
- : #- optimize="$optimize -fstrength-reduce"
|
||||||
|
- optimize="$optimize -fthread-jumps"
|
||||||
|
- optimize="$optimize -fcse-follow-jumps"
|
||||||
|
- optimize="$optimize -fcse-skip-blocks"
|
||||||
|
- : #x optimize="$optimize -frerun-cse-after-loop"
|
||||||
|
- : #x optimize="$optimize -frerun-loop-opt"
|
||||||
|
- : #x optimize="$optimize -fgcse"
|
||||||
|
- optimize="$optimize -fexpensive-optimizations"
|
||||||
|
- optimize="$optimize -fregmove"
|
||||||
|
- : #* optimize="$optimize -fdelayed-branch"
|
||||||
|
- : #x optimize="$optimize -fschedule-insns"
|
||||||
|
- optimize="$optimize -fschedule-insns2"
|
||||||
|
- : #? optimize="$optimize -ffunction-sections"
|
||||||
|
- : #? optimize="$optimize -fcaller-saves"
|
||||||
|
- : #> optimize="$optimize -funroll-loops"
|
||||||
|
- : #> optimize="$optimize -funroll-all-loops"
|
||||||
|
- : #x optimize="$optimize -fmove-all-movables"
|
||||||
|
- : #x optimize="$optimize -freduce-all-givs"
|
||||||
|
- : #? optimize="$optimize -fstrict-aliasing"
|
||||||
|
- : #* optimize="$optimize -fstructure-noalias"
|
||||||
|
-
|
||||||
|
- case "$host" in
|
||||||
|
- arm*-*)
|
||||||
|
- optimize="$optimize -fstrength-reduce"
|
||||||
|
- ;;
|
||||||
|
- mips*-*)
|
||||||
|
- optimize="$optimize -fstrength-reduce"
|
||||||
|
- optimize="$optimize -finline-functions"
|
||||||
|
- ;;
|
||||||
|
- i?86-*)
|
||||||
|
- optimize="$optimize -fstrength-reduce"
|
||||||
|
- ;;
|
||||||
|
- powerpc-apple-*)
|
||||||
|
- # this triggers an internal compiler error with gcc2
|
||||||
|
- : #optimize="$optimize -fstrength-reduce"
|
||||||
|
-
|
||||||
|
- # this is really only beneficial with gcc3
|
||||||
|
- : #optimize="$optimize -finline-functions"
|
||||||
|
- ;;
|
||||||
|
- *)
|
||||||
|
- # this sometimes provokes bugs in gcc 2.95.2
|
||||||
|
- : #optimize="$optimize -fstrength-reduce"
|
||||||
|
- ;;
|
||||||
|
- esac
|
||||||
|
- ;;
|
||||||
|
- esac
|
||||||
|
+ optimize="-O2"
|
||||||
|
fi
|
||||||
|
|
||||||
|
case "$host" in
|
||||||
|
@@ -21497,6 +21433,7 @@
|
||||||
|
then
|
||||||
|
case "$host" in
|
||||||
|
i?86-*) FPM="INTEL" ;;
|
||||||
|
+ x86_64*) FPM="64BIT" ;;
|
||||||
|
arm*-*) FPM="ARM" ;;
|
||||||
|
mips*-*) FPM="MIPS" ;;
|
||||||
|
sparc*-*) FPM="SPARC" ;;
|
@ -0,0 +1,16 @@
|
|||||||
|
See https://github.com/pytorch/audio/pull/1297
|
||||||
|
diff -ru sox/src/formats.c sox/src/formats.c
|
||||||
|
--- sox/src/formats.c 2014-10-26 19:55:50.000000000 -0700
|
||||||
|
+++ sox/src/formats.c 2021-02-22 16:01:02.833144070 -0800
|
||||||
|
@@ -333,6 +333,10 @@
|
||||||
|
assert(ft);
|
||||||
|
if (!ft->fp)
|
||||||
|
return sox_false;
|
||||||
|
- fstat(fileno((FILE*)ft->fp), &st);
|
||||||
|
+ int fd = fileno((FILE*)ft->fp);
|
||||||
|
+ if (fd < 0)
|
||||||
|
+ return sox_false;
|
||||||
|
+ if (fstat(fd, &st) < 0)
|
||||||
|
+ return sox_false;
|
||||||
|
return ((st.st_mode & S_IFMT) == S_IFREG);
|
||||||
|
}
|
@ -0,0 +1,254 @@
|
|||||||
|
# The codec libraries needed by libsox are downloaded and built through
# ExternalProject; pkg-config is a hard requirement of this file.
find_package(PkgConfig REQUIRED)
include(ExternalProject)

# Directories shared by every external project declared below.
set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
set(patch_dir ${CMAKE_CURRENT_SOURCE_DIR}/../patches)

# Arguments handed to every `configure` script.
set(COMMON_ARGS
  --quiet
  --disable-shared
  --enable-static
  --prefix=${INSTALL_DIR}
  --with-pic
  --disable-dependency-tracking
  --disable-debug
  --disable-examples
  --disable-doc
)

# To pass custom environment variables to an ExternalProject_Add command,
# the command has to be run as `${CMAKE_COMMAND} -E env ${envs} <COMMAND>`.
# https://stackoverflow.com/a/62437353
# The custom environment variables are constructed here.
set(envs
  "PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig"
  "LDFLAGS=-L${INSTALL_DIR}/lib $ENV{LDFLAGS}"
  "CFLAGS=-I${INSTALL_DIR}/include -fvisibility=hidden $ENV{CFLAGS}"
)
|
||||||
|
|
||||||
|
# libmad (MP3 decoding) is only built when BUILD_MAD is enabled.
if(BUILD_MAD)
  ExternalProject_Add(mad
    # Source archive
    URL https://downloads.sourceforge.net/project/mad/libmad/0.15.1b/libmad-0.15.1b.tar.gz
    URL_HASH SHA256=bbfac3ed6bfbc2823d3775ebb931087371e142bb0e9bb1bee51a76a6e0078690
    DOWNLOAD_DIR ${ARCHIVE_DIR}
    DOWNLOAD_NO_PROGRESS ON
    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
    # Apply the local libmad patch, then overwrite the bundled
    # config.guess/config.sub with the copies kept in ${patch_dir}.
    PATCH_COMMAND
      patch < ${patch_dir}/libmad.patch
      && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/mad/
    CONFIGURE_COMMAND
      ${CMAKE_COMMAND} -E env ${envs}
      ${CMAKE_CURRENT_BINARY_DIR}/src/mad/configure ${COMMON_ARGS}
    # Logging
    LOG_DOWNLOAD ON
    LOG_UPDATE ON
    LOG_CONFIGURE ON
    LOG_BUILD ON
    LOG_INSTALL ON
    LOG_MERGED_STDOUTERR ON
    LOG_OUTPUT_ON_FAILURE ON
  )
endif()
|
||||||
|
|
||||||
|
# opencore-amr: provides the AMR-NB / AMR-WB static libraries linked into libsox.
ExternalProject_Add(amr
  # Source archive
  URL https://sourceforge.net/projects/opencore-amr/files/opencore-amr/opencore-amr-0.1.5.tar.gz
  URL_HASH SHA256=2c006cb9d5f651bfb5e60156dbff6af3c9d35c7bbcc9015308c0aff1e14cd341
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  DOWNLOAD_NO_PROGRESS ON
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  # Overwrite the bundled config.guess/config.sub with the copies in ${patch_dir}.
  PATCH_COMMAND
    cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/amr/
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/amr/configure ${COMMON_ARGS}
  # Logging
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)
|
||||||
|
|
||||||
|
# LAME: MP3 encoder library (libmp3lame) linked into libsox.
ExternalProject_Add(lame
  # Source archive
  URL https://downloads.sourceforge.net/project/lame/lame/3.99/lame-3.99.5.tar.gz
  URL_HASH SHA256=24346b4158e4af3bd9f2e194bb23eb473c75fb7377011523353196b19b9a23ff
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  DOWNLOAD_NO_PROGRESS ON
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  # Overwrite the bundled config.guess/config.sub with the copies in ${patch_dir}.
  PATCH_COMMAND
    cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/lame/
  # Same as the common configuration, plus --enable-nasm.
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/lame/configure ${COMMON_ARGS} --enable-nasm
  # Logging
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)
|
||||||
|
|
||||||
|
# libogg: container library that flac, vorbis and opus below depend on.
ExternalProject_Add(ogg
  # Source archive
  URL https://ftp.osuosl.org/pub/xiph/releases/ogg/libogg-1.3.3.tar.gz
  URL_HASH SHA256=c2e8a485110b97550f453226ec644ebac6cb29d1caef2902c007edab4308d985
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  DOWNLOAD_NO_PROGRESS ON
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  # Overwrite the bundled config.guess/config.sub with the copies in ${patch_dir}.
  PATCH_COMMAND
    cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/configure ${COMMON_ARGS}
  # Logging
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)
|
||||||
|
|
||||||
|
# FLAC: built after ogg, with Ogg support enabled and the C++ libraries disabled.
ExternalProject_Add(flac
  DEPENDS ogg
  # Source archive
  URL https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz
  URL_HASH SHA256=91cfc3ed61dc40f47f050a109b08610667d73477af6ef36dcad31c31a4a8d53f
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  DOWNLOAD_NO_PROGRESS ON
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  # Overwrite the bundled config.guess/config.sub with the copies in ${patch_dir}.
  PATCH_COMMAND
    cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/flac/
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/flac/configure ${COMMON_ARGS} --with-ogg --disable-cpplibs
  # Logging
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)
|
||||||
|
|
||||||
|
# libvorbis: built after ogg and configured with Ogg support.
ExternalProject_Add(vorbis
  DEPENDS ogg
  # Source archive
  URL https://ftp.osuosl.org/pub/xiph/releases/vorbis/libvorbis-1.3.6.tar.gz
  URL_HASH SHA256=6ed40e0241089a42c48604dc00e362beee00036af2d8b3f46338031c9e0351cb
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  DOWNLOAD_NO_PROGRESS ON
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  # Overwrite the bundled config.guess/config.sub with the copies in ${patch_dir}.
  PATCH_COMMAND
    cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/configure ${COMMON_ARGS} --with-ogg
  # Logging
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)
|
||||||
|
|
||||||
|
# Opus codec: built after ogg and configured with Ogg support.
ExternalProject_Add(opus
  DEPENDS ogg
  # Source archive
  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opus-1.3.1.tar.gz
  URL_HASH SHA256=65b58e1e25b2a114157014736a3d9dfeaad8d41be1c8179866f144a2fb44ff9d
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  DOWNLOAD_NO_PROGRESS ON
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  # Overwrite the bundled config.guess/config.sub with the copies in ${patch_dir}.
  PATCH_COMMAND
    cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opus/
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/opus/configure ${COMMON_ARGS} --with-ogg
  # Logging
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)
|
||||||
|
|
||||||
|
# opusfile: built after opus, with its HTTP support disabled.
ExternalProject_Add(opusfile
  DEPENDS opus
  # Source archive
  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opusfile-0.12.tar.gz
  URL_HASH SHA256=118d8601c12dd6a44f52423e68ca9083cc9f2bfe72da7a8c1acb22a80ae3550b
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  DOWNLOAD_NO_PROGRESS ON
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  # Overwrite the bundled config.guess/config.sub with the copies in ${patch_dir}.
  PATCH_COMMAND
    cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/configure ${COMMON_ARGS} --disable-http
  # Logging
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)
|
||||||
|
|
||||||
|
# OpenMP is disabled for libsox: by default it would be compiled against GNU
# OpenMP, which conflicts with the version of OpenMP that PyTorch uses.
# See https://github.com/pytorch/audio/pull/1026
# TODO: Add flags like https://github.com/suphoff/pytorch_parallel_extension_cpp/blob/master/setup.py

# Feature switches passed to sox's configure script.
set(SOX_OPTIONS
  # threading
  --disable-openmp
  # codecs built above
  --with-amrnb
  --with-amrwb
  --with-flac
  --with-lame
  --with-oggvorbis
  --with-opus
  # everything else is switched off
  --without-alsa
  --without-ao
  --without-coreaudio
  --without-oss
  --without-id3tag
  --without-ladspa
  --without-magic
  --without-png
  --without-pulseaudio
  --without-sndfile
  --without-sndio
  --without-sunaudio
  --without-waveaudio
  --without-wavpack
  --without-twolame
)

# Static archives consumers of libsox must link, in link order.
set(SOX_LIBRARIES
  ${INSTALL_DIR}/lib/libsox.a
  ${INSTALL_DIR}/lib/libopencore-amrnb.a
  ${INSTALL_DIR}/lib/libopencore-amrwb.a
  ${INSTALL_DIR}/lib/libmp3lame.a
  ${INSTALL_DIR}/lib/libFLAC.a
  ${INSTALL_DIR}/lib/libopusfile.a
  ${INSTALL_DIR}/lib/libopus.a
  ${INSTALL_DIR}/lib/libvorbisenc.a
  ${INSTALL_DIR}/lib/libvorbisfile.a
  ${INSTALL_DIR}/lib/libvorbis.a
  ${INSTALL_DIR}/lib/libogg.a
)

# External projects that have to be built before sox itself.
set(sox_depends ogg flac vorbis opusfile lame amr)

# libmad is optional: wire it into all three lists only when requested.
if(BUILD_MAD)
  list(APPEND SOX_OPTIONS --with-mad)
  list(APPEND SOX_LIBRARIES ${INSTALL_DIR}/lib/libmad.a)
  list(APPEND sox_depends mad)
else()
  list(APPEND SOX_OPTIONS --without-mad)
endif()
|
||||||
|
|
||||||
|
# libsox itself: built last, after every codec listed in ${sox_depends}.
ExternalProject_Add(sox
  DEPENDS ${sox_depends}
  # Source archive
  URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2
  URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  DOWNLOAD_NO_PROGRESS ON
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  # Apply the local sox patch, then overwrite the bundled
  # config.guess/config.sub with the copies kept in ${patch_dir}.
  PATCH_COMMAND
    patch -p1 < ${patch_dir}/sox.patch
    && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/sox/
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/sox/configure ${COMMON_ARGS} ${SOX_OPTIONS}
  # Declare the produced archives so the generator knows where they come from.
  BUILD_BYPRODUCTS ${SOX_LIBRARIES}
  # Logging
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)
|
||||||
|
|
||||||
|
# `libsox` is the target the rest of the build links against. It carries the
# include directory and static archives installed by the `sox` external
# project, and depends on that project so it is built first.
add_library(libsox INTERFACE)
target_include_directories(libsox INTERFACE ${INSTALL_DIR}/include)
target_link_libraries(libsox INTERFACE ${SOX_LIBRARIES})
add_dependencies(libsox sox)
|
@ -0,0 +1,27 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .download import decompress
|
||||||
|
from .download import download_and_decompress
|
||||||
|
from .download import load_state_dict_from_url
|
||||||
|
from .env import DATA_HOME
|
||||||
|
from .env import MODEL_HOME
|
||||||
|
from .env import PPAUDIO_HOME
|
||||||
|
from .env import USER_HOME
|
||||||
|
from .error import ParameterError
|
||||||
|
from .log import Logger
|
||||||
|
from .log import logger
|
||||||
|
from .numeric import depth_convert
|
||||||
|
from .numeric import pcm16to32
|
||||||
|
from .time import seconds_to_hms
|
||||||
|
from .time import Timer
|
@ -0,0 +1,64 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from paddle.framework import load as load_state_dict
|
||||||
|
from paddle.utils import download
|
||||||
|
|
||||||
|
from .log import logger
|
||||||
|
|
||||||
|
download.logger = logger
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'decompress',
|
||||||
|
'download_and_decompress',
|
||||||
|
'load_state_dict_from_url',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def decompress(file: str):
    """
    Unpack every member of the compressed archive located at `file`.

    Args:
        file (str): Path of the archive; it must be an existing regular file.

    Raises:
        AssertionError: If `file` does not point to an existing file.
    """
    archive_present = os.path.isfile(file)
    assert archive_present, "File: {} not exists.".format(file)
    download._decompress(file)
|
||||||
|
|
||||||
|
|
||||||
|
def download_and_decompress(archives: List[Dict[str, str]],
                            path: str,
                            decompress: bool=True):
    """
    Download archives and decompress them into a specific path.

    Args:
        archives (List[Dict[str, str]]): One dict per archive; each must
            contain the keys "url" and "md5".
        path (str): Target directory. It is created when missing.
        decompress (bool): Forwarded to `download.get_path_from_url`.

    Raises:
        AssertionError: If an archive entry lacks the "url" or "md5" key.
    """
    # exist_ok avoids a race between the existence check and the creation.
    os.makedirs(path, exist_ok=True)

    for archive in archives:
        # The message must be an f-string (and name `archive`, not the
        # misspelt `archieve`) so the offending keys are actually reported.
        assert 'url' in archive and 'md5' in archive, \
            f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
        download.get_path_from_url(
            archive['url'], path, archive['md5'], decompress=decompress)
|
||||||
|
|
||||||
|
|
||||||
|
def load_state_dict_from_url(url: str, path: str, md5: str=None):
    """
    Fetch the file behind `url` into `path` and load it as a state dict.

    Args:
        url (str): Location of the remote file.
        path (str): Local directory for the download; created when missing.
        md5 (str): Optional checksum forwarded to the downloader.

    Returns:
        The object returned by `load_state_dict` for the downloaded file.
    """
    directory_exists = os.path.isdir(path)
    if not directory_exists:
        os.makedirs(path)

    download.get_path_from_url(url, path, md5)

    # The local file is looked up under the basename of the URL.
    local_file = os.path.join(path, os.path.basename(url))
    return load_state_dict(local_file)
|
@ -0,0 +1,60 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
'''
|
||||||
|
This module is used to store environmental variables in PaddleAudio.
|
||||||
|
PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
|
||||||
|
├ default value through the PPAUDIO_HOME environment variable.
|
||||||
|
├─ MODEL_HOME --> Store model files.
|
||||||
|
└─ DATA_HOME --> Store automatically downloaded datasets.
|
||||||
|
'''
|
||||||
|
import os
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'USER_HOME',
|
||||||
|
'PPAUDIO_HOME',
|
||||||
|
'MODEL_HOME',
|
||||||
|
'DATA_HOME',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_user_home():
|
||||||
|
return os.path.expanduser('~')
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ppaudio_home():
|
||||||
|
if 'PPAUDIO_HOME' in os.environ:
|
||||||
|
home_path = os.environ['PPAUDIO_HOME']
|
||||||
|
if os.path.exists(home_path):
|
||||||
|
if os.path.isdir(home_path):
|
||||||
|
return home_path
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
'The environment variable PPAUDIO_HOME {} is not a directory.'.
|
||||||
|
format(home_path))
|
||||||
|
else:
|
||||||
|
return home_path
|
||||||
|
return os.path.join(_get_user_home(), '.paddleaudio')
|
||||||
|
|
||||||
|
|
||||||
|
def _get_sub_home(directory):
    """
    Return the path of `directory` under the PaddleAudio root, creating it
    on disk when nothing exists at that path yet.
    """
    sub_home = os.path.join(_get_ppaudio_home(), directory)
    if os.path.exists(sub_home):
        return sub_home
    os.makedirs(sub_home)
    return sub_home
|
||||||
|
|
||||||
|
|
||||||
|
USER_HOME = _get_user_home()
|
||||||
|
PPAUDIO_HOME = _get_ppaudio_home()
|
||||||
|
MODEL_HOME = _get_sub_home('models')
|
||||||
|
DATA_HOME = _get_sub_home('datasets')
|
@ -0,0 +1,139 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import contextlib
|
||||||
|
import functools
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
|
import colorlog
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'Logger',
|
||||||
|
'logger',
|
||||||
|
]
|
||||||
|
|
||||||
|
log_config = {
|
||||||
|
'DEBUG': {
|
||||||
|
'level': 10,
|
||||||
|
'color': 'purple'
|
||||||
|
},
|
||||||
|
'INFO': {
|
||||||
|
'level': 20,
|
||||||
|
'color': 'green'
|
||||||
|
},
|
||||||
|
'TRAIN': {
|
||||||
|
'level': 21,
|
||||||
|
'color': 'cyan'
|
||||||
|
},
|
||||||
|
'EVAL': {
|
||||||
|
'level': 22,
|
||||||
|
'color': 'blue'
|
||||||
|
},
|
||||||
|
'WARNING': {
|
||||||
|
'level': 30,
|
||||||
|
'color': 'yellow'
|
||||||
|
},
|
||||||
|
'ERROR': {
|
||||||
|
'level': 40,
|
||||||
|
'color': 'red'
|
||||||
|
},
|
||||||
|
'CRITICAL': {
|
||||||
|
'level': 50,
|
||||||
|
'color': 'bold_red'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class Logger(object):
    '''
    Default logger in PaddleAudio.

    Wraps a `logging.Logger` with a coloured stream handler and exposes one
    callable attribute per entry of `log_config`, in upper and lower case
    (e.g. `logger.INFO(msg)` and `logger.info(msg)`).

    Args:
        name(str) : Logger name, default is 'PaddleAudio'
    '''

    def __init__(self, name: str=None):
        name = 'PaddleAudio' if not name else name
        self.logger = logging.getLogger(name)

        # Register every configured level with `logging` and bind a shortcut
        # for it on this instance.
        for key, conf in log_config.items():
            logging.addLevelName(conf['level'], key)
            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
            self.__dict__[key.lower()] = functools.partial(self.__call__,
                                                           conf['level'])

        self.format = colorlog.ColoredFormatter(
            '%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
            log_colors={key: conf['color']
                        for key, conf in log_config.items()})

        self.handler = logging.StreamHandler()
        self.handler.setFormatter(self.format)

        self.logger.addHandler(self.handler)
        self.logLevel = 'DEBUG'
        self.logger.setLevel(logging.DEBUG)
        # Records are handled by our own handler only, not by ancestors.
        self.logger.propagate = False
        self._is_enable = True

    def disable(self):
        '''Silence this logger until `enable` is called.'''
        self._is_enable = False

    def enable(self):
        '''Re-enable output after a call to `disable`.'''
        self._is_enable = True

    @property
    def is_enable(self) -> bool:
        '''Whether messages are currently emitted.'''
        return self._is_enable

    def __call__(self, log_level: str, msg: str):
        '''
        Emit `msg` at `log_level` unless the logger is disabled.

        Args:
            log_level: Level handed to `logging.Logger.log`; the shortcuts
                bound in `__init__` pass the numeric level from `log_config`.
            msg(str): Message to be logged.
        '''
        if not self.is_enable:
            return

        self.logger.log(log_level, msg)

    @contextlib.contextmanager
    def use_terminator(self, terminator: str):
        '''
        Temporarily replace the handler's line terminator.

        The previous terminator is restored when the block exits, including
        when it exits through an exception.
        '''
        old_terminator = self.handler.terminator
        self.handler.terminator = terminator
        try:
            yield
        finally:
            self.handler.terminator = old_terminator

    @contextlib.contextmanager
    def processing(self, msg: str, interval: float=0.1):
        '''
        Continuously print a progress bar with rotating special effects.

        A background thread prints the spinner while the `with` body runs.
        The thread is stopped and joined when the body finishes, whether
        normally or through an exception.

        Args:
            msg(str): Message to be printed.
            interval(float): Rotation interval. Default to 0.1.
        '''
        end = False

        def _printer():
            index = 0
            flags = ['\\', '|', '/', '-']
            # `end` is shared with the enclosing function through its closure.
            while not end:
                flag = flags[index % len(flags)]
                with self.use_terminator('\r'):
                    self.info('{}: {}'.format(msg, flag))
                time.sleep(interval)
                index += 1

        t = threading.Thread(target=_printer)
        t.start()
        try:
            yield
        finally:
            # Always stop the spinner; otherwise an exception in the body
            # would leave the thread looping forever.
            end = True
            t.join()
|
||||||
|
|
||||||
|
|
||||||
|
logger = Logger()
|
@ -0,0 +1,107 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
__all__ = ["pcm16to32", "depth_convert"]
|
||||||
|
|
||||||
|
|
||||||
|
def pcm16to32(audio: np.ndarray) -> np.ndarray:
    """Scale an int16 PCM waveform to float32.

    Input of any other dtype is returned untouched.

    Args:
        audio (np.ndarray): Waveform with dtype of int16.

    Returns:
        np.ndarray: Waveform with dtype of float32.
    """
    if audio.dtype != np.int16:
        return audio

    # Divide by 2**15, the magnitude of the most negative int16 value.
    full_scale = 2**(np.iinfo(np.int16).bits - 1)
    return audio.astype("float32") / full_scale
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
|
||||||
|
"""Data type casting in a safe way, i.e., prevent overflow or underflow.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
y (np.ndarray): Input waveform array in 1D or 2D.
|
||||||
|
dtype (Union[type, str]): Data type of waveform.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
np.ndarray: `y` after safe casting.
|
||||||
|
"""
|
||||||
|
if 'float' in str(y.dtype):
|
||||||
|
return np.clip(y, np.finfo(dtype).min,
|
||||||
|
np.finfo(dtype).max).astype(dtype)
|
||||||
|
else:
|
||||||
|
return np.clip(y, np.iinfo(dtype).min,
|
||||||
|
np.iinfo(dtype).max).astype(dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Convert audio array to target dtype safely.

    This function converts an audio waveform to a target dtype, with the
    additional steps of preventing overflow/underflow and preserving the
    audio range.

    Supported dtypes, for both `y` and `dtype`, are int8, int16, float32 and
    float64.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Data type of waveform. Names such as
            'int16' and types such as `np.int16` are both accepted.

    Returns:
        np.ndarray: `y` after safe casting.

    Raises:
        ParameterError: If the dtype of `y` or the target dtype is not
            supported.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    FLOAT_DTYPE = ('float32', 'float64')

    # Normalise the target to its canonical name so that 'int16' and np.int16
    # are treated alike; anything numpy cannot parse falls through to the
    # "unsupported" error below.
    try:
        dtype = np.dtype(dtype).name
    except TypeError:
        dtype = str(dtype)
    src_dtype = y.dtype.name

    if src_dtype not in SUPPORT_DTYPE:
        # Imported here because this module does not import it at the top.
        from .error import ParameterError
        raise ParameterError(
            'Unsupported audio dtype, '
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if dtype not in SUPPORT_DTYPE:
        from .error import ParameterError
        raise ParameterError(
            'Unsupported audio dtype, '
            f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if dtype == src_dtype:
        return y

    if dtype in FLOAT_DTYPE:
        if src_dtype in FLOAT_DTYPE:
            # float <-> float: only the representable range changes.
            return _safe_cast(y, dtype)
        # int -> float: scale by the largest positive integer value.
        return y.astype(dtype) / np.iinfo(src_dtype).max

    # From here on the target is an integer type.
    target = np.iinfo(dtype)

    if src_dtype in FLOAT_DTYPE:
        # float -> int: scale to full range and clip before casting.
        return np.clip(y * target.max, target.min, target.max).astype(dtype)

    if dtype == 'int16':
        # int8 -> int16. -128 scales to below the int16 minimum, so the
        # result is clipped before the cast instead of relying on an epsilon.
        factor = np.iinfo('int16').max / np.iinfo('int8').max
        widened = y.astype('float32') * factor
        return np.clip(widened, target.min, target.max).astype('int16')

    # int16 -> int8; widen to int32 first so the multiplication cannot wrap.
    narrowed = y.astype('int32') * np.iinfo('int8').max / \
        np.iinfo('int16').max
    return narrowed.astype('int8')
|
@ -0,0 +1,103 @@
|
|||||||
|
from typing import Dict
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import paddleaudio
|
||||||
|
from paddleaudio._internal import module_utils as _mod_utils
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def set_seed(seed: int):
    """Seed the pseudo-random number generator of libsox.

    Args:
        seed (int): seed value. valid range is int32.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    backend = paddleaudio._paddleaudio
    backend.sox_utils_set_seed(seed)
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def set_verbosity(verbosity: int):
    """Choose how much libsox reports.

    Args:
        verbosity (int): Set verbosity level of libsox.

            * ``1`` failure messages
            * ``2`` warnings
            * ``3`` details of processing
            * ``4``-``6`` increasing levels of debug messages

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    backend = paddleaudio._paddleaudio
    backend.sox_utils_set_verbosity(verbosity)
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def set_buffer_size(buffer_size: int):
    """Configure the buffer size of the sox effect chain.

    Args:
        buffer_size (int): Set the size in bytes of the buffers used for processing audio.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    backend = paddleaudio._paddleaudio
    backend.sox_utils_set_buffer_size(buffer_size)
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def set_use_threads(use_threads: bool):
    """Toggle the multithread option of the sox effect chain.

    Args:
        use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing.
            To use multithread, the underlying ``libsox`` has to be compiled with OpenMP support.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    backend = paddleaudio._paddleaudio
    backend.sox_utils_set_use_threads(use_threads)
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def list_effects() -> Dict[str, str]:
    """Enumerate the sox effects that are available.

    Returns:
        Dict[str, str]: Mapping from ``effect name`` to ``usage``
    """
    effect_entries = paddleaudio._paddleaudio.sox_utils_list_effects()
    return dict(effect_entries)
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def list_read_formats() -> List[str]:
    """Enumerate the audio formats that can be read.

    Returns:
        List[str]: List of supported audio formats
    """
    readable_formats = paddleaudio._paddleaudio.sox_utils_list_read_formats()
    return readable_formats
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def list_write_formats() -> List[str]:
    """Enumerate the audio formats that can be written.

    Returns:
        List[str]: List of supported audio formats
    """
    writable_formats = paddleaudio._paddleaudio.sox_utils_list_write_formats()
    return writable_formats
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def get_buffer_size() -> int:
    """Report the buffer size of the sox effect chain.

    Returns:
        int: size in bytes of buffers used for processing audio.
    """
    current_size = paddleaudio._paddleaudio.sox_utils_get_buffer_size()
    return current_size
|
@ -0,0 +1,192 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Utility functions for Transformer."""
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from .log import Logger
|
||||||
|
|
||||||
|
__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy", "has_tensor"]
|
||||||
|
|
||||||
|
logger = Logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def has_tensor(val):
    """Tell whether `val` is, or contains, a paddle Tensor.

    Lists, tuples and dicts are searched recursively (for dicts only the
    values are inspected); any other object is checked with
    `paddle.is_tensor`.

    Args:
        val: Arbitrary object, possibly a nested list/tuple/dict structure.

    Returns:
        bool: True if a Tensor is found, False otherwise. Containers without
        a Tensor yield False rather than an implicit None.
    """
    if isinstance(val, (list, tuple)):
        return any(has_tensor(item) for item in val)
    if isinstance(val, dict):
        # The stray debug print of every key was removed here.
        return any(has_tensor(v) for v in val.values())
    return paddle.is_tensor(val)
|
||||||
|
|
||||||
|
|
||||||
|
def pad_sequence(sequences: List[paddle.Tensor],
                 batch_first: bool=False,
                 padding_value: float=0.0) -> paddle.Tensor:
    r"""Pad a list of variable length Tensors with ``padding_value``

    ``pad_sequence`` stacks a list of Tensors along a new dimension,
    and pads them to equal length. For example, if the input is a list of
    sequences with size ``L x *``, the output has size ``T x B x *`` if
    ``batch_first`` is False, and ``B x T x *`` otherwise.

    `B` is batch size. It is equal to the number of elements in ``sequences``.
    `T` is length of the longest sequence.
    `L` is length of the sequence.
    `*` is any number of trailing dimensions, including none.

    Example:
        >>> a = paddle.ones([25, 300])
        >>> b = paddle.ones([22, 300])
        >>> c = paddle.ones([15, 300])
        >>> pad_sequence([a, b, c]).shape
        [25, 3, 300]

    Note:
        This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
        where `T` is the length of the longest sequence. This function assumes
        trailing dimensions and type of all the Tensors in sequences are same.
        ``sequences`` must be non-empty: the dtype and trailing dimensions
        are read from ``sequences[0]``.

    Args:
        sequences (list[Tensor]): list of variable length sequences.
        batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
            ``T x B x *`` otherwise
        padding_value (float, optional): value for padded elements. Default: 0.

    Returns:
        Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
        Tensor of size ``B x T x *`` otherwise
    """

    # assuming trailing dimensions and type of all the Tensors
    # in sequences are same and fetching those from sequences[0]
    max_size = paddle.shape(sequences[0])
    # (TODO Hui Zhang): slice does not support `end==start`
    # trailing_dims = max_size[1:]
    # For 1-D inputs there are no trailing dims, so the slice is skipped.
    trailing_dims = tuple(
        max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
    max_len = max([s.shape[0] for s in sequences])
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims
    # Start from a tensor filled with the padding value, then copy each
    # sequence into its leading `length` positions.
    out_tensor = paddle.full(out_dims, padding_value, sequences[0].dtype)
    for i, tensor in enumerate(sequences):
        length = tensor.shape[0]
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            # TODO (Hui Zhang): set_value op does not support `end==start`
            # TODO (Hui Zhang): set_value op does not support int16
            # TODO (Hui Zhang): set_varbase 2 rank does not support [0,0,...]
            # out_tensor[i, :length, ...] = tensor
            if length != 0:
                out_tensor[i, :length] = tensor
            else:
                # NOTE(review): zero-length sequence; this assigns the empty
                # tensor at index 0 instead of an empty slice -- confirm
                # paddle accepts this assignment.
                out_tensor[i, length] = tensor
        else:
            # TODO (Hui Zhang): set_value op does not support `end==start`
            # out_tensor[:length, i, ...] = tensor
            if length != 0:
                out_tensor[:length, i] = tensor
            else:
                # NOTE(review): same zero-length workaround as above.
                out_tensor[length, i] = tensor

    return out_tensor
|
||||||
|
|
||||||
|
|
||||||
|
def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
                ignore_id: int) -> Tuple[paddle.Tensor, paddle.Tensor]:
    """Add <sos> and <eos> labels.

    Args:
        ys_pad (paddle.Tensor): batch of padded target sequences (B, Lmax)
        sos (int): index of <sos>
        eos (int): index of <eos>
        ignore_id (int): index of padding
    Returns:
        ys_in (paddle.Tensor) : (B, Lmax + 1), <sos> prepended, padding
            replaced by ``eos``
        ys_out (paddle.Tensor) : (B, Lmax + 1), <eos> appended after the last
            real token, remaining padding kept as ``ignore_id``
    Examples:
        >>> sos_id = 10
        >>> eos_id = 11
        >>> ignore_id = -1
        >>> ys_pad
        tensor([[ 1,  2,  3,  4,  5],
                [ 4,  5,  6, -1, -1],
                [ 7,  8,  9, -1, -1]], dtype=paddle.int32)
        >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id)
        >>> ys_in
        tensor([[10,  1,  2,  3,  4,  5],
                [10,  4,  5,  6, 11, 11],
                [10,  7,  8,  9, 11, 11]])
        >>> ys_out
        tensor([[ 1,  2,  3,  4,  5, 11],
                [ 4,  5,  6, 11, -1, -1],
                [ 7,  8,  9, 11, -1, -1]])
    """
    # TODO(Hui Zhang): using comment code,
    #_sos = paddle.to_tensor(
    #    [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
    #_eos = paddle.to_tensor(
    #    [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
    #ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
    #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
    #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
    #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
    # NOTE(review): `paddle.cat(..., dim=)` and `Tensor.masked_fill` are
    # presumably provided by a compatibility patch elsewhere -- confirm they
    # are available in this package.
    B = ys_pad.shape[0]
    _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
    _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
    ys_in = paddle.cat([_sos, ys_pad], dim=1)
    # mask_pad marks padding in ys_in, i.e. shifted right by one relative to
    # ys_pad because of the prepended <sos>.
    mask_pad = (ys_in == ignore_id)
    ys_in = ys_in.masked_fill(mask_pad, eos)

    ys_out = paddle.cat([ys_pad, _eos], dim=1)
    # Temporarily overwrite the shifted padding positions so that the only
    # remaining `ignore_id` in each row is the first padding slot ...
    ys_out = ys_out.masked_fill(mask_pad, eos)
    # ... which is exactly where <eos> must go.
    mask_eos = (ys_out == ignore_id)
    ys_out = ys_out.masked_fill(mask_eos, eos)
    # Finally restore `ignore_id` on the shifted padding positions.
    ys_out = ys_out.masked_fill(mask_pad, ignore_id)
    return ys_in, ys_out
|
||||||
|
|
||||||
|
|
||||||
|
def th_accuracy(pad_outputs: paddle.Tensor,
                pad_targets: paddle.Tensor,
                ignore_label: int) -> float:
    """Calculate accuracy.

    Args:
        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
        pad_targets (LongTensor): Target label tensors (B, Lmax); it is
            compared element-wise with the argmax over D of ``pad_outputs``.
        ignore_label (int): Ignore label id.
    Returns:
        float: Accuracy value (0.0 - 1.0).
    """
    # Reshape predictions to (B, Lmax, D) and take the best class per step.
    # NOTE(review): `Tensor.view` / `Tensor.type_as` are presumably provided
    # by a compatibility patch elsewhere -- confirm.
    pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
                                pad_outputs.shape[1]).argmax(2)
    mask = pad_targets != ignore_label
    #TODO(Hui Zhang): sum not support bool type
    # numerator = paddle.sum(
    #     pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
    numerator = (
        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
    numerator = paddle.sum(numerator.type_as(pad_targets))
    #TODO(Hui Zhang): sum not support bool type
    # denominator = paddle.sum(mask)
    denominator = paddle.sum(mask.type_as(pad_targets))
    # NOTE(review): if every target equals `ignore_label` the denominator is
    # zero and this raises ZeroDivisionError.
    return float(numerator) / float(denominator)
|
@ -0,0 +1,72 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import math
|
||||||
|
import time
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'Timer',
|
||||||
|
'seconds_to_hms',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class Timer(object):
    '''Calculate running speed and estimated time of arrival (ETA).

    Typical use: create with the total number of steps, call ``start()``
    once, call ``count()`` after every step, and read ``timing`` / ``eta``
    whenever progress is logged.
    '''

    def __init__(self, total_step: int):
        self.total_step = total_step
        # Step index at the previous `timing` read; used to get a step delta.
        self.last_start_step = 0
        self.current_step = 0
        self._is_running = True

    def start(self):
        '''Record the start time; must be called before `timing` or `eta`.'''
        self.last_time = time.time()
        self.start_time = time.time()

    def stop(self):
        '''Mark the timer as stopped and record the end time.'''
        self._is_running = False
        self.end_time = time.time()

    def count(self) -> int:
        '''Advance by one step (capped at `total_step`) and return the step.'''
        if self.current_step < self.total_step:
            self.current_step += 1
        return self.current_step

    @property
    def timing(self) -> float:
        '''Steps per second since the previous read of this property.

        Reading the property resets the measuring window. Returns 0.0 when
        no measurable time has elapsed, instead of dividing by zero.
        '''
        run_steps = self.current_step - self.last_start_step
        self.last_start_step = self.current_step
        now = time.time()
        time_used = now - self.last_time
        self.last_time = now
        if time_used <= 0:
            return 0.0
        return run_steps / time_used

    @property
    def is_running(self) -> bool:
        return self._is_running

    @property
    def eta(self) -> str:
        '''Estimated remaining time as ``hh:mm:ss``.

        Returns ``'00:00:00'`` once stopped and ``'--:--:--'`` while no step
        has been counted yet (no rate can be estimated).
        '''
        if not self.is_running:
            return '00:00:00'
        if self.current_step <= 0:
            return '--:--:--'
        elapsed = time.time() - self.start_time
        # Average time per finished step, scaled by the steps still to run.
        remaining_steps = max(self.total_step - self.current_step, 0)
        remaining_time = elapsed * remaining_steps / self.current_step
        return seconds_to_hms(remaining_time)


def seconds_to_hms(seconds: int) -> str:
    '''Convert the number of seconds to hh:mm:ss'''
    h = math.floor(seconds / 3600)
    m = math.floor((seconds - h * 3600) / 60)
    s = int(seconds - h * 3600 - m * 60)
    hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s)
    return hms_str
|
@ -0,0 +1,293 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import contextlib
|
||||||
|
import inspect
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import subprocess as sp
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import distutils.command.clean
|
||||||
|
from setuptools import Command
|
||||||
|
from setuptools import find_packages
|
||||||
|
from setuptools import setup
|
||||||
|
from setuptools.command.develop import develop
|
||||||
|
from setuptools.command.test import test
|
||||||
|
|
||||||
|
from tools import setup_helpers
|
||||||
|
|
||||||
|
ROOT_DIR = Path(__file__).parent.resolve()
|
||||||
|
|
||||||
|
VERSION = '1.1.0'
|
||||||
|
COMMITID = 'none'
|
||||||
|
|
||||||
|
base = [
|
||||||
|
"kaldiio",
|
||||||
|
"librosa==0.8.1",
|
||||||
|
"scipy>=1.0.0",
|
||||||
|
"soundfile~=0.10",
|
||||||
|
"colorlog",
|
||||||
|
"pathos == 0.2.8",
|
||||||
|
"pybind11",
|
||||||
|
"parameterized",
|
||||||
|
"tqdm"
|
||||||
|
]
|
||||||
|
|
||||||
|
requirements = {
|
||||||
|
"install":
|
||||||
|
base,
|
||||||
|
"develop": [
|
||||||
|
"sox",
|
||||||
|
"soxbindings",
|
||||||
|
"pre-commit",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
def check_call(cmd: str, shell=False, executable=None):
    """Run ``cmd`` and raise if it exits with a non-zero status.

    Args:
        cmd (str): command line. Without ``shell`` it is split on whitespace
            into an argument vector; with ``shell`` it is handed to the shell
            unchanged so that the whole line is interpreted.
        shell (bool): run the command through ``/bin/bash``.
        executable (str, optional): program to execute when ``shell`` is
            false; ignored when ``shell`` is true.

    Raises:
        subprocess.CalledProcessError: when the command fails. The failure is
            also reported on stderr.
    """
    try:
        if shell:
            # A list combined with shell=True would run only its first item
            # as the command, so the string is passed through intact.
            sp.check_call(cmd, shell=shell, executable="/bin/bash")
        else:
            sp.check_call(cmd.split(), shell=shell, executable=executable)
    except sp.CalledProcessError as e:
        # check_call does not capture output, so report the exit status.
        print(
            f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:",
            f"exit status {e.returncode}",
            file=sys.stderr)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
def check_output(cmd: Union[str, List[str], Tuple[str]], shell=False):
    """Run ``cmd`` and return its stripped, UTF-8 decoded stdout.

    A string command is split on whitespace; a list or tuple is used as the
    argument vector directly. If the command fails, the failure is reported
    on stderr and whatever output was produced before the error is returned.
    The ``shell`` argument is accepted but not used.
    """
    argv = cmd if isinstance(cmd, (list, tuple)) else cmd.split()
    try:
        raw = sp.check_output(argv)
    except sp.CalledProcessError as err:
        raw = err.output  # output generated before the error
        print(
            f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:",
            raw,
            file=sys.stderr)
    return raw.strip().decode('utf8')
|
||||||
|
|
||||||
|
def _run_cmd(cmd):
    """Run ``cmd`` in ``ROOT_DIR`` and return its stripped ASCII stdout.

    Best-effort helper for the git queries: stderr is discarded and any
    failure (missing executable, non-zero exit, decode error) yields ``None``.
    """
    try:
        # The module is imported as `sp`; referring to `subprocess` raised a
        # NameError that the handler below silently turned into None.
        return sp.check_output(
            cmd, cwd=ROOT_DIR, stderr=sp.DEVNULL).decode("ascii").strip()
    except Exception:
        return None
|
||||||
|
|
||||||
|
@contextlib.contextmanager
def pushd(new_dir):
    """Temporarily change the working directory to ``new_dir``.

    The directory entered and the directory restored are both printed. The
    previous working directory is restored even if the ``with`` body raises.
    """
    old_dir = os.getcwd()
    os.chdir(new_dir)
    print(new_dir)
    try:
        yield
    finally:
        os.chdir(old_dir)
        print(old_dir)
|
||||||
|
|
||||||
|
def read(*names, **kwargs):
    """Return the text of a file located relative to this script.

    ``names`` are path components joined onto the directory of this file;
    the optional ``encoding`` keyword defaults to ``"utf8"``.
    """
    target = os.path.join(os.path.dirname(__file__), *names)
    encoding = kwargs.get("encoding", "utf8")
    with io.open(target, encoding=encoding) as handle:
        return handle.read()
|
||||||
|
|
||||||
|
def _remove(files: List[Path]):
    # Delete every entry by calling its `unlink()` method.
    # NOTE(review): entries are presumably `pathlib.Path` objects -- confirm
    # against callers.
    for f in files:
        f.unlink()
|
||||||
|
|
||||||
|
################################# Install ##################################
|
||||||
|
|
||||||
|
|
||||||
|
def _post_install(install_lib_dir):
    # Post-install hook run by DevelopCommand; currently a no-op.
    pass
|
||||||
|
|
||||||
|
class DevelopCommand(develop):
    # `develop` command that runs the post-install hook after the standard
    # develop step.
    def run(self):
        develop.run(self)
        # Must run after develop.run, otherwise packages installed from the
        # shell will not be visible to the hook.
        self.execute(_post_install, (self.install_lib, ), msg="Post Install...")
|
||||||
|
|
||||||
|
|
||||||
|
class TestCommand(test):
    """`test` command that runs the suite under ``tests/`` with nose."""

    def finalize_options(self):
        test.finalize_options(self)
        self.test_args = []
        self.test_suite = True

    def run_tests(self):
        # Run nose ensuring that argv simulates running nosetests directly
        import nose
        nose.run_exit(argv=['nosetests', '-w', 'tests'])

    def run_benchmark(self):
        """Run every benchmark script under ``tests/benchmark`` with pytest."""
        # Imported locally: this file does not import `glob` at module level,
        # so the previous bare reference raised NameError.
        import glob
        for benchmark_item in glob.glob('tests/benchmark/*py'):
            os.system(f'pytest {benchmark_item}')
|
||||||
|
|
||||||
|
|
||||||
|
# cmd: python setup.py upload
|
||||||
|
class UploadCommand(Command):
    """Build a source distribution and upload it with twine.

    Usage: ``python setup.py upload``. The process exits when the upload
    finishes.
    """
    description = "Build and publish the package."
    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        # Imported locally: this file does not import `shutil` at module
        # level, so the previous bare reference raised NameError (which the
        # `except OSError` below does not catch).
        import shutil
        try:
            print("Removing previous dist/ ...")
            shutil.rmtree(str(ROOT_DIR / "dist"))
        except OSError:
            # No previous dist/ directory (or it cannot be removed): ignore.
            pass
        print("Building source distribution...")
        sp.check_call([sys.executable, "setup.py", "sdist"])
        print("Uploading package to PyPi...")
        sp.check_call(["twine", "upload", "dist/*"])
        sys.exit()
|
||||||
|
|
||||||
|
|
||||||
|
################################# Version ##################################
|
||||||
|
def _get_version(sha):
    """Pick the version string for this build.

    A non-empty ``BUILD_VERSION`` environment variable wins; otherwise the
    base ``VERSION`` is used, suffixed with ``+<first 7 chars of sha>`` when
    a commit id is available.
    """
    if os.getenv("BUILD_VERSION"):
        return os.getenv("BUILD_VERSION")
    if sha is None:
        return VERSION
    return VERSION + "+" + sha[:7]
|
||||||
|
|
||||||
|
|
||||||
|
def _make_version_file(version, sha):
    # Append a `__version__ = '<version>'` line to paddleaudio/__init__.py.
    # NOTE(review): `sha` is normalised here but never written anywhere.
    sha = "Unknown" if sha is None else sha
    version_path = ROOT_DIR / "paddleaudio" / "__init__.py"
    # Opened in append mode, so existing content is kept.
    with open(version_path, "a") as f:
        f.write(f"__version__ = '{version}'\n")
|
||||||
|
|
||||||
|
def _rm_version():
    """Drop every line mentioning ``__version__`` from paddleaudio/__init__.py."""
    init_path = ROOT_DIR / "paddleaudio" / "__init__.py"
    with open(init_path, "r") as src:
        kept = [text for text in src.readlines() if "__version__" not in text]
    with open(init_path, "w") as dst:
        dst.writelines(kept)
|
||||||
|
|
||||||
|
|
||||||
|
################################# Setup ##################################
|
||||||
|
class clean(distutils.command.clean.clean):
    """`clean` command that also removes built extensions and ``build/``."""

    def run(self):
        # Imported locally: this file does not import `shutil` at module
        # level, so the previous bare reference raised NameError.
        import shutil

        # Run default behavior first
        distutils.command.clean.clean.run(self)

        # Remove paddleaudio extension
        for path in (ROOT_DIR / "paddleaudio").glob("**/*.so"):
            print(f"removing '{path}'")
            path.unlink()
        # Remove build directory
        build_dirs = [
            ROOT_DIR / "build",
        ]
        for path in build_dirs:
            if path.exists():
                print(f"removing '{path}' (and everything under it)")
                shutil.rmtree(str(path), ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Collect git metadata, stamp ``__version__`` and run ``setup()``.

    A ``__version__`` line is appended to ``paddleaudio/__init__.py`` for the
    duration of the build and removed again afterwards, even when the build
    fails.
    """
    sha = _run_cmd(["git", "rev-parse", "HEAD"])  # commit id
    branch = _run_cmd(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    tag = _run_cmd(["git", "describe", "--tags", "--exact-match", "@"])
    print("-- Git branch:", branch)
    print("-- Git SHA:", sha)
    print("-- Git tag:", tag)
    version = _get_version(sha)
    print("-- Building version", version)
    # Remove any stale stamp left by an earlier, interrupted run.
    _rm_version()

    _make_version_file(version, sha)
    try:
        lib_package_data = {}
        if platform.system() != 'Windows' and platform.system() != 'Linux':
            lib_package_data = {'paddleaudio': ['lib/libgcc_s.1.1.dylib']}

        if platform.system() == 'Linux':
            lib_package_data = {'paddleaudio': ['lib/lib*']}

        setup_info = dict(
            # Metadata
            name='paddleaudio',
            # NOTE(review): the package metadata uses the base VERSION; the
            # computed `version` is only written to __init__.py -- confirm
            # this is intended.
            version=VERSION,
            author='PaddlePaddle Speech and Language Team',
            author_email='paddlesl@baidu.com',
            url='https://github.com/PaddlePaddle/PaddleSpeech/audio',
            license='Apache 2.0',
            description='Speech audio tools based on Paddlepaddle',
            keywords=[
                # A missing comma used to fuse these two entries into one.
                "audio process",
                "paddlepaddle",
            ],
            python_requires='>=3.7',
            install_requires=requirements["install"],
            extras_require={
                'develop': requirements["develop"],
            },
            cmdclass={
                "build_ext": setup_helpers.CMakeBuild,
                'develop': DevelopCommand,
                'test': TestCommand,
                'upload': UploadCommand,
                "clean": clean,
            },

            # Package info
            # `include` must be a sequence of patterns; a bare string is
            # iterated character by character and its '*' matches everything.
            packages=find_packages(include=('paddleaudio*', )),
            package_data=lib_package_data,
            ext_modules=setup_helpers.get_ext_modules(),
            zip_safe=True,
            classifiers=[
                'Development Status :: 5 - Production/Stable',
                'Intended Audience :: Developers',
                'Intended Audience :: Science/Research',
                'Topic :: Scientific/Engineering :: Artificial Intelligence',
                'License :: OSI Approved :: Apache Software License',
                'Programming Language :: Python',
                'Programming Language :: Python :: 3',
                # 3.6 is not listed: python_requires above is '>=3.7'.
                'Programming Language :: Python :: 3.7',
                'Programming Language :: Python :: 3.8',
                'Programming Language :: Python :: 3.9',
                'Programming Language :: Python :: 3.10',
            ], )

        setup(**setup_info)
    finally:
        # Always restore __init__.py, including when the build raises.
        _rm_version()
|
||||||
|
_rm_version()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
@ -0,0 +1,32 @@
|
|||||||
|
|
||||||
|
def get_encoding(ext, dtype):
    """Return the expected encoding name for a file extension and dtype.

    For ``mp3``, ``flac`` and ``vorbis`` the encoding is the upper-cased
    extension; for any other extension it is the PCM family of ``dtype``.
    """
    compressed = {"mp3", "flac", "vorbis"}
    if ext in compressed:
        return ext.upper()
    pcm_family = {
        "float32": "PCM_F",
        "int32": "PCM_S",
        "int16": "PCM_S",
        "uint8": "PCM_U",
    }
    return pcm_family[dtype]
|
||||||
|
|
||||||
|
|
||||||
|
def get_bit_depth(dtype):
    """Return the number of bits per sample for a supported dtype name."""
    widths = dict(float32=32, int32=32, int16=16, uint8=8)
    return widths[dtype]
|
||||||
|
|
||||||
|
def get_bits_per_sample(ext, dtype):
    """Return the expected bits-per-sample for an extension and dtype.

    ``flac`` is fixed at 24 and ``mp3`` / ``vorbis`` at 0; every other
    extension uses the bit depth of ``dtype``. The dtype lookup is always
    performed, so an unsupported dtype raises ``KeyError`` for any extension.
    """
    fixed = dict(flac=24, mp3=0, vorbis=0)
    from_dtype = get_bit_depth(dtype)
    if ext in fixed:
        return fixed[ext]
    return from_dtype
|
@ -0,0 +1,34 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
mono_channel_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'
|
||||||
|
multi_channels_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav'
|
||||||
|
|
||||||
|
|
||||||
|
class BackendTest(unittest.TestCase):
    """Base test case that makes the sample wav files available locally."""

    def setUp(self):
        self.initWavInput()

    def initWavInput(self):
        """Download the sample wavs into the working directory if missing.

        The local file names are collected in ``self.files``.
        """
        self.files = []
        for url in (mono_channel_wav, multi_channels_wav):
            local_name = os.path.basename(url)
            if not os.path.isfile(local_name):
                urllib.request.urlretrieve(url, local_name)
            self.files.append(local_name)

    def initParmas(self):
        """Hook to be overridden by subclasses."""
        raise NotImplementedError
|
@ -0,0 +1,89 @@
|
|||||||
|
import itertools
|
||||||
|
from unittest import skipIf
|
||||||
|
|
||||||
|
from paddleaudio._internal.module_utils import is_module_available
|
||||||
|
from parameterized import parameterized
|
||||||
|
|
||||||
|
|
||||||
|
def name_func(func, _, params):
    """Build a test name: the function name followed by its joined arguments."""
    suffix = "_".join(map(str, params.args))
    return f'{func.__name__}_{suffix}'
|
||||||
|
|
||||||
|
|
||||||
|
def dtype2subtype(dtype):
    """Map a dtype name to the matching soundfile subtype name."""
    subtype_of = dict(
        float64="DOUBLE",
        float32="FLOAT",
        int32="PCM_32",
        int16="PCM_16",
        uint8="PCM_U8",
        int8="PCM_S8", )
    return subtype_of[dtype]
|
||||||
|
|
||||||
|
|
||||||
|
def skipIfFormatNotSupported(fmt):
    """Decorator that skips a test when soundfile cannot handle ``fmt``.

    The test is also skipped when the ``soundfile`` module is unavailable.
    """
    if not is_module_available("soundfile"):
        return skipIf(True, '"soundfile" not available.')

    import soundfile

    supported = soundfile.available_formats()
    return skipIf(fmt not in supported,
                  f'"{fmt}" is not supported by soundfile')
|
||||||
|
|
||||||
|
|
||||||
|
def parameterize(*params):
    """Expand a test over the Cartesian product of the given value lists."""
    combos = list(itertools.product(*params))
    return parameterized.expand(combos, name_func=name_func)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_wav_subtype(dtype, encoding, bits_per_sample):
    """Resolve the wav subtype for an (encoding, bits_per_sample) pair.

    When both are ``None`` the subtype is derived from ``dtype``. The dtype
    lookup is always performed, so an unknown dtype raises ``KeyError``.

    Raises:
        ValueError: if the combination is not supported for wav.
    """
    from_dtype = dtype2subtype(dtype)
    table = {
        (None, None): from_dtype,
        (None, 8): "PCM_U8",
        ("PCM_U", None): "PCM_U8",
        ("PCM_U", 8): "PCM_U8",
        ("PCM_S", None): "PCM_32",
        ("PCM_S", 16): "PCM_16",
        ("PCM_S", 32): "PCM_32",
        ("PCM_F", None): "FLOAT",
        ("PCM_F", 32): "FLOAT",
        ("PCM_F", 64): "DOUBLE",
        ("ULAW", None): "ULAW",
        ("ULAW", 8): "ULAW",
        ("ALAW", None): "ALAW",
        ("ALAW", 8): "ALAW",
    }
    subtype = table.get((encoding, bits_per_sample))
    if not subtype:
        raise ValueError(
            f"wav does not support ({encoding}, {bits_per_sample}).")
    return subtype
|
||||||
|
|
||||||
|
def get_encoding(ext, dtype):
    """Return the expected encoding name for a file extension and dtype.

    For ``mp3``, ``flac`` and ``vorbis`` the encoding is the upper-cased
    extension; for any other extension it is the PCM family of ``dtype``.
    """
    compressed = {"mp3", "flac", "vorbis"}
    if ext in compressed:
        return ext.upper()
    pcm_family = {
        "float32": "PCM_F",
        "int32": "PCM_S",
        "int16": "PCM_S",
        "uint8": "PCM_U",
    }
    return pcm_family[dtype]
|
||||||
|
|
||||||
|
|
||||||
|
def get_bit_depth(dtype):
    """Return the number of bits per sample for a supported dtype name."""
    widths = dict(float32=32, int32=32, int16=16, uint8=8)
    return widths[dtype]
|
||||||
|
|
||||||
|
def get_bits_per_sample(ext, dtype):
    """Return the expected bits-per-sample for an extension and dtype.

    ``flac`` is fixed at 24 and ``mp3`` / ``vorbis`` at 0; every other
    extension uses the bit depth of ``dtype``. The dtype lookup is always
    performed, so an unsupported dtype raises ``KeyError`` for any extension.
    """
    fixed = dict(flac=24, mp3=0, vorbis=0)
    from_dtype = get_bit_depth(dtype)
    if ext in fixed:
        return fixed[ext]
    return from_dtype
|
@ -0,0 +1 @@
|
|||||||
|
../../common_utils
|
@ -0,0 +1,199 @@
|
|||||||
|
#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/info_test.py
|
||||||
|
import tarfile
|
||||||
|
import unittest
|
||||||
|
import warnings
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import soundfile
|
||||||
|
from common import get_bits_per_sample
|
||||||
|
from common import get_encoding
|
||||||
|
from common import parameterize
|
||||||
|
from common import skipIfFormatNotSupported
|
||||||
|
from common_utils import get_wav_data
|
||||||
|
from common_utils import nested_params
|
||||||
|
from common_utils import save_wav
|
||||||
|
from common_utils import TempDirMixin
|
||||||
|
from paddleaudio.backends import soundfile_backend
|
||||||
|
|
||||||
|
|
||||||
|
class TestInfo(TempDirMixin, unittest.TestCase):
    # Checks the metadata reported by `soundfile_backend.info` for files
    # written in several formats.
    @parameterize(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2], )
    def test_wav(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.info` can check wav file correctly"""
        duration = 1
        path = self.get_temp_path("data.wav")
        data = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=duration * sample_rate)
        save_wav(path, data, sample_rate)
        info = soundfile_backend.info(path)
        assert info.sample_rate == sample_rate
        assert info.num_frames == sample_rate * duration
        assert info.num_channels == num_channels
        assert info.bits_per_sample == get_bits_per_sample("wav", dtype)
        assert info.encoding == get_encoding("wav", dtype)

    @parameterize([8000, 16000], [1, 2])
    @skipIfFormatNotSupported("FLAC")
    def test_flac(self, sample_rate, num_channels):
        """`soundfile_backend.info` can check flac file correctly"""
        duration = 1
        num_frames = sample_rate * duration
        # torchaudio original: data = torch.randn(num_frames, num_channels).numpy()
        data = paddle.randn(shape=[num_frames, num_channels]).numpy()

        path = self.get_temp_path("data.flac")
        soundfile.write(path, data, sample_rate)

        info = soundfile_backend.info(path)
        assert info.sample_rate == sample_rate
        assert info.num_frames == num_frames
        assert info.num_channels == num_channels
        # No subtype is passed to soundfile.write above; the test expects
        # the resulting flac file to be reported as 16 bits per sample.
        assert info.bits_per_sample == 16
        assert info.encoding == "FLAC"

    # Disabled OGG test, kept for reference. The num_frames check was
    # already commented out in the original.
    #@parameterize([8000, 16000], [1, 2])
    #@skipIfFormatNotSupported("OGG")
    #def test_ogg(self, sample_rate, num_channels):
    #    """`soundfile_backend.info` can check ogg file correctly"""
    #    duration = 1
    #    num_frames = sample_rate * duration
    #    data = paddle.randn(shape=[num_frames, num_channels]).numpy()
    #    path = self.get_temp_path("data.ogg")
    #    soundfile.write(path, data, sample_rate)
    #
    #    info = soundfile_backend.info(path)
    #    assert info.sample_rate == sample_rate
    #    ##assert info.num_frames == sample_rate * duration
    #    assert info.num_channels == num_channels
    #    assert info.bits_per_sample == 0
    #    assert info.encoding == "VORBIS"

    @nested_params(
        [8000, 16000],
        [1, 2],
        [("PCM_24", 24), ("PCM_32", 32)], )
    @skipIfFormatNotSupported("NIST")
    def test_sphere(self, sample_rate, num_channels, subtype_and_bit_depth):
        """`soundfile_backend.info` can check sph file correctly"""
        duration = 1
        num_frames = sample_rate * duration
        # torchaudio original: data = torch.randn(num_frames, num_channels).numpy()
        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
        path = self.get_temp_path("data.nist")
        subtype, bits_per_sample = subtype_and_bit_depth
        soundfile.write(path, data, sample_rate, subtype=subtype)

        info = soundfile_backend.info(path)
        assert info.sample_rate == sample_rate
        assert info.num_frames == sample_rate * duration
        assert info.num_channels == num_channels
        assert info.bits_per_sample == bits_per_sample
        assert info.encoding == "PCM_S"

    def test_unknown_subtype_warning(self):
        """soundfile_backend.info issues a warning when the subtype is unknown

        This will happen if a new subtype is supported in SoundFile: the _SUBTYPE_TO_BITS_PER_SAMPLE
        dict should be updated.
        """

        def _mock_info_func(_):
            # Stand-in for `soundfile.info` that reports an unknown subtype.
            class MockSoundFileInfo:
                samplerate = 8000
                frames = 356
                channels = 2
                subtype = "UNSEEN_SUBTYPE"
                format = "UNKNOWN"

            return MockSoundFileInfo()

        with patch("soundfile.info", _mock_info_func):
            # NOTE(review): no `warnings.simplefilter("always")` is set here,
            # so recording depends on the active warning filters -- confirm
            # this is reliable under the test runner in use.
            with warnings.catch_warnings(record=True) as w:
                info = soundfile_backend.info("foo")
                assert len(w) == 1
                assert "UNSEEN_SUBTYPE subtype is unknown to PaddleAudio" in str(
                    w[-1].message)
                assert info.bits_per_sample == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestFileObject(TempDirMixin, unittest.TestCase):
    """`soundfile_backend.info` works on file-like objects."""

    def _test_fileobj(self, ext, subtype, bits_per_sample):
        """Query audio via file-like object works"""
        duration = 2
        sample_rate = 16000
        num_channels = 2
        num_frames = sample_rate * duration
        path = self.get_temp_path(f"test.{ext}")

        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
        soundfile.write(path, data, sample_rate, subtype=subtype)

        with open(path, "rb") as fileobj:
            info = soundfile_backend.info(fileobj)
        assert info.sample_rate == sample_rate
        assert info.num_frames == num_frames
        assert info.num_channels == num_channels
        assert info.bits_per_sample == bits_per_sample
        # Compute the expectation first: `assert a == "FLAC" if c else "PCM_S"`
        # parses as `(a == "FLAC") if c else "PCM_S"` and so never checked the
        # non-flac case.
        expected_encoding = "FLAC" if ext == "flac" else "PCM_S"
        assert info.encoding == expected_encoding

    def test_fileobj_wav(self):
        """Querying wav audio via file-like object works"""
        self._test_fileobj("wav", "PCM_16", 16)

    @skipIfFormatNotSupported("FLAC")
    def test_fileobj_flac(self):
        """Querying flac audio via file-like object works"""
        self._test_fileobj("flac", "PCM_16", 16)

    def _test_tarobj(self, ext, subtype, bits_per_sample):
        """Query compressed audio via file-like object works"""
        duration = 2
        sample_rate = 16000
        num_channels = 2
        num_frames = sample_rate * duration
        audio_file = f"test.{ext}"
        audio_path = self.get_temp_path(audio_file)
        archive_path = self.get_temp_path("archive.tar.gz")

        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
        soundfile.write(audio_path, data, sample_rate, subtype=subtype)

        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(audio_path, arcname=audio_file)
        with tarfile.TarFile(archive_path, "r") as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            info = soundfile_backend.info(fileobj)
        assert info.sample_rate == sample_rate
        assert info.num_frames == num_frames
        assert info.num_channels == num_channels
        assert info.bits_per_sample == bits_per_sample
        # See _test_fileobj: the expectation must be computed before asserting.
        expected_encoding = "FLAC" if ext == "flac" else "PCM_S"
        assert info.encoding == expected_encoding

    def test_tarobj_wav(self):
        """Query compressed audio via file-like object works"""
        self._test_tarobj("wav", "PCM_16", 16)

    @skipIfFormatNotSupported("FLAC")
    def test_tarobj_flac(self):
        """Query compressed audio via file-like object works"""
        self._test_tarobj("flac", "PCM_16", 16)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
@ -0,0 +1,363 @@
|
|||||||
|
#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/load_test.py
|
||||||
|
import os
|
||||||
|
import tarfile
|
||||||
|
import unittest
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
import soundfile
|
||||||
|
from common import dtype2subtype
|
||||||
|
from common import parameterize
|
||||||
|
from common import skipIfFormatNotSupported
|
||||||
|
from common_utils import get_wav_data
|
||||||
|
from common_utils import load_wav
|
||||||
|
from common_utils import normalize_wav
|
||||||
|
from common_utils import save_wav
|
||||||
|
from common_utils import TempDirMixin
|
||||||
|
from paddleaudio.backends import soundfile_backend
|
||||||
|
from parameterized import parameterized
|
||||||
|
|
||||||
|
|
||||||
|
def _get_mock_path(
|
||||||
|
ext: str,
|
||||||
|
dtype: str,
|
||||||
|
sample_rate: int,
|
||||||
|
num_channels: int,
|
||||||
|
num_frames: int, ):
|
||||||
|
return f"{dtype}_{sample_rate}_{num_channels}_{num_frames}.{ext}"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_mock_params(path: str):
|
||||||
|
filename, ext = path.split(".")
|
||||||
|
parts = filename.split("_")
|
||||||
|
return {
|
||||||
|
"ext": ext,
|
||||||
|
"dtype": parts[0],
|
||||||
|
"sample_rate": int(parts[1]),
|
||||||
|
"num_channels": int(parts[2]),
|
||||||
|
"num_frames": int(parts[3]),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class SoundFileMock:
    """Replacement for `soundfile.SoundFile` that never touches the disk.

    The audio parameters are decoded from ``path`` with `_get_mock_params`
    and the samples returned by `read` are generated with `get_wav_data`.
    """

    def __init__(self, path, mode):
        # Only read mode is emulated.
        assert mode == "r"
        self.path = path
        self._params = _get_mock_params(path)
        # Set by _prepare_read; marks where the returned slice starts.
        self._start = None

    @property
    def samplerate(self):
        """Sample rate decoded from the mock path."""
        return self._params["sample_rate"]

    @property
    def format(self):
        """Container name for the extension, or None when it is not recognised."""
        format_by_ext = {
            "wav": "WAV",
            "flac": "FLAC",
            "ogg": "OGG",
            "sph": "NIST",
            "nis": "NIST",
            "nist": "NIST",
        }
        return format_by_ext.get(self._params["ext"])

    @property
    def subtype(self):
        """"VORBIS" for ogg, otherwise the subtype mapped from the dtype."""
        params = self._params
        if params["ext"] != "ogg":
            return dtype2subtype(params["dtype"])
        return "VORBIS"

    def _prepare_read(self, start, stop, frames):
        """Remember ``start`` for the next `read` and echo ``frames`` back."""
        assert stop is None
        self._start = start
        return frames

    def read(self, frames, dtype, always_2d):
        """Return ``frames`` generated samples beginning at the prepared start."""
        assert always_2d
        params = self._params
        samples = get_wav_data(
            dtype,
            params["num_channels"],
            normalize=False,
            num_frames=params["num_frames"],
            channels_first=False, ).numpy()
        begin = self._start
        return samples[begin:begin + frames]

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        # Nothing to release.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
class MockedLoadTest(unittest.TestCase):
    """Dtype checks for `soundfile_backend.load` with `soundfile.SoundFile` mocked."""

    def assert_dtype(self, ext, dtype, sample_rate, num_channels, normalize,
                     channels_first):
        """Load a mock file and check the resulting dtype and sample rate.

        The native dtype is expected only when ``normalize`` is false and
        ``ext`` is "wav" or "nist"; every other combination expects float32.
        """
        mock_path = _get_mock_path(ext, dtype, sample_rate, num_channels,
                                   3 * sample_rate)
        if normalize or ext not in ["wav", "nist"]:
            wanted_dtype = paddle.float32
        else:
            wanted_dtype = getattr(paddle, dtype)
        with patch("soundfile.SoundFile", SoundFileMock):
            loaded, rate = soundfile_backend.load(
                mock_path, normalize=normalize, channels_first=channels_first)
            assert loaded.dtype == wanted_dtype
            assert sample_rate == rate

    @parameterize(
        ["int32", "float32", "float64"],
        [8000, 16000],
        [1, 2],
        [True, False],
        [True, False], )
    def test_wav(self, dtype, sample_rate, num_channels, normalize,
                 channels_first):
        """wav: native dtype when normalize=False, float32 otherwise"""
        self.assert_dtype("wav", dtype, sample_rate, num_channels, normalize,
                          channels_first)

    @parameterize(
        ["int32"],
        [8000, 16000],
        [1, 2],
        [True, False],
        [True, False], )
    def test_sphere(self, dtype, sample_rate, num_channels, normalize,
                    channels_first):
        """sph: float32 is expected for every parameter combination"""
        self.assert_dtype("sph", dtype, sample_rate, num_channels, normalize,
                          channels_first)

    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
    def test_ogg(self, sample_rate, num_channels, normalize, channels_first):
        """ogg: float32 is expected for every parameter combination"""
        self.assert_dtype("ogg", "int16", sample_rate, num_channels, normalize,
                          channels_first)

    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
    def test_flac(self, sample_rate, num_channels, normalize, channels_first):
        """flac: float32 is expected for every parameter combination"""
        self.assert_dtype("flac", "int16", sample_rate, num_channels, normalize,
                          channels_first)
|
||||||
|
|
||||||
|
|
||||||
|
class LoadTestBase(TempDirMixin, unittest.TestCase):
    """Shared round-trip assertions for `soundfile_backend.load`."""

    def assert_wav(
            self,
            dtype,
            sample_rate,
            num_channels,
            normalize,
            channels_first=True,
            duration=1, ):
        """`soundfile_backend.load` reads a wav file correctly.

        A wav written with `save_wav` is read back with both `load_wav`
        (the reference) and the soundfile backend; the two must agree.
        """
        wav_path = self.get_temp_path("reference.wav")
        source = get_wav_data(
            dtype,
            num_channels,
            normalize=normalize,
            num_frames=duration * sample_rate,
            channels_first=channels_first, )
        save_wav(wav_path, source, sample_rate, channels_first=channels_first)
        reference, _ = load_wav(
            wav_path, normalize=normalize, channels_first=channels_first)[:2]
        loaded, rate = soundfile_backend.load(
            wav_path, normalize=normalize, channels_first=channels_first)
        assert rate == sample_rate
        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())

    def assert_sphere(
            self,
            dtype,
            sample_rate,
            num_channels,
            channels_first=True,
            duration=1, ):
        """`soundfile_backend.load` reads a SPHERE (NIST) file correctly."""
        sph_path = self.get_temp_path("reference.sph")
        raw = get_wav_data(
            dtype,
            num_channels,
            num_frames=duration * sample_rate,
            normalize=False,
            channels_first=False, )
        soundfile.write(
            sph_path,
            raw,
            sample_rate,
            subtype=dtype2subtype(dtype),
            format="NIST")
        # `raw` is generated channels-last; transpose it when the load is channels-first.
        oriented = raw.t() if channels_first else raw
        reference = normalize_wav(oriented)
        loaded, rate = soundfile_backend.load(
            sph_path, channels_first=channels_first)
        assert rate == sample_rate
        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())

    def assert_flac(
            self,
            dtype,
            sample_rate,
            num_channels,
            channels_first=True,
            duration=1, ):
        """`soundfile_backend.load` reads a FLAC file correctly."""
        flac_path = self.get_temp_path("reference.flac")
        raw = get_wav_data(
            dtype,
            num_channels,
            num_frames=duration * sample_rate,
            normalize=False,
            channels_first=False, )
        soundfile.write(flac_path, raw, sample_rate)
        # `raw` is generated channels-last; transpose it when the load is channels-first.
        oriented = raw.t() if channels_first else raw
        reference = normalize_wav(oriented)
        loaded, rate = soundfile_backend.load(
            flac_path, channels_first=channels_first)
        assert rate == sample_rate
        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoad(LoadTestBase):
    """Correctness of `soundfile_backend.load` across formats and shapes."""

    @parameterize(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [False, True], )
    def test_wav(self, dtype, sample_rate, num_channels, normalize,
                 channels_first):
        """A wav file is loaded correctly for each parameter combination."""
        self.assert_wav(dtype, sample_rate, num_channels, normalize,
                        channels_first)

    @parameterize(
        ["int32"],
        [16000],
        [2],
        [False], )
    def test_wav_large(self, dtype, sample_rate, num_channels, normalize):
        """A two-hour wav file is loaded correctly."""
        seconds_in_two_hours = 2 * 60 * 60
        self.assert_wav(
            dtype,
            sample_rate,
            num_channels,
            normalize,
            duration=seconds_in_two_hours)

    @parameterize(["float32", "int32"], [4, 8, 16, 32], [False, True])
    def test_multiple_channels(self, dtype, num_channels, channels_first):
        """A wav file with more than 2 channels is loaded correctly."""
        self.assert_wav(dtype, 8000, num_channels, False, channels_first)

    # The sphere and flac load tests below are disabled in the original source.
    #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True])
    #@skipIfFormatNotSupported("NIST")
    #def test_sphere(self, dtype, sample_rate, num_channels, channels_first):
    #"""`soundfile_backend.load` can load sphere format correctly."""
    #self.assert_sphere(dtype, sample_rate, num_channels, channels_first)

    #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True])
    #@skipIfFormatNotSupported("FLAC")
    #def test_flac(self, dtype, sample_rate, num_channels, channels_first):
    #"""`soundfile_backend.load` can load flac format correctly."""
    #self.assert_flac(dtype, sample_rate, num_channels, channels_first)
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoadFormat(TempDirMixin, unittest.TestCase):
    """`soundfile_backend.load` can read audio files whose extension was removed."""

    original = None
    path = None

    def _make_file(self, format_):
        """Write ``test.<format_>``, strip its extension, return (path, expected data)."""
        named_path = self.get_temp_path(f"test.{format_}")
        samples = get_wav_data("float32", num_channels=2).numpy().T
        soundfile.write(named_path, samples, 8000)
        # Read back through soundfile so the expectation reflects what was stored.
        expected, _ = soundfile.read(named_path, dtype="float32")
        expected = expected.T
        bare_path, _ = os.path.splitext(named_path)
        os.rename(named_path, bare_path)
        return bare_path, expected

    def _test_format(self, format_):
        """The extension-less file loads and matches the data soundfile read back.

        NOTE(review): no ``format`` argument is passed to `load` here.
        """
        bare_path, expected = self._make_file(format_)
        found = soundfile_backend.load(bare_path)[0]
        np.testing.assert_array_almost_equal(found, expected)

    @parameterized.expand([
        ("WAV", ),
        ("wav", ),
    ])
    def test_wav(self, format_):
        """wav, with upper- and lower-case extension spelling"""
        self._test_format(format_)

    @parameterized.expand([
        ("FLAC", ),
        ("flac", ),
    ])
    @skipIfFormatNotSupported("FLAC")
    def test_flac(self, format_):
        """flac, with upper- and lower-case extension spelling"""
        self._test_format(format_)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFileObject(TempDirMixin, unittest.TestCase):
    """`soundfile_backend.load` accepts file-like objects (open files and tar members)."""

    def _test_fileobj(self, ext):
        """Loading audio via file-like object works"""
        sample_rate = 16000
        path = self.get_temp_path(f"test.{ext}")

        data = get_wav_data("float32", num_channels=2).numpy().T
        soundfile.write(path, data, sample_rate)
        # Read back through soundfile so the expectation reflects what was stored.
        expected = soundfile.read(path, dtype="float32")[0].T

        with open(path, "rb") as fileobj:
            found, sr = soundfile_backend.load(fileobj)
        assert sr == sample_rate
        # Convert explicitly, as _test_tarfile does, rather than relying on
        # numpy to coerce the loaded tensor implicitly.
        np.testing.assert_array_almost_equal(found.numpy(), expected)

    def test_fileobj_wav(self):
        """Loading wav via file-like object works"""
        self._test_fileobj("wav")

    # Skip instead of failing when the soundfile build lacks FLAC, matching
    # the guard used by the other FLAC tests in this file.
    @skipIfFormatNotSupported("FLAC")
    def test_fileobj_flac(self):
        """Loading flac via file-like object works"""
        self._test_fileobj("flac")

    def _test_tarfile(self, ext):
        """Loading audio from a tar archive member works"""
        sample_rate = 16000
        audio_file = f"test.{ext}"
        audio_path = self.get_temp_path(audio_file)
        archive_path = self.get_temp_path("archive.tar.gz")

        data = get_wav_data("float32", num_channels=2).numpy().T
        soundfile.write(audio_path, data, sample_rate)
        expected = soundfile.read(audio_path, dtype="float32")[0].T

        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(audio_path, arcname=audio_file)
        with tarfile.TarFile(archive_path, "r") as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            found, sr = soundfile_backend.load(fileobj)

        assert sr == sample_rate
        np.testing.assert_array_almost_equal(found.numpy(), expected)

    def test_tarfile_wav(self):
        """Loading wav from a tar archive member works"""
        self._test_tarfile("wav")

    @skipIfFormatNotSupported("FLAC")
    def test_tarfile_flac(self):
        """Loading flac from a tar archive member works"""
        self._test_tarfile("flac")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
@ -0,0 +1,323 @@
|
|||||||
|
import io
|
||||||
|
import unittest
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
import soundfile
|
||||||
|
from common import fetch_wav_subtype
|
||||||
|
from common import parameterize
|
||||||
|
from common import skipIfFormatNotSupported
|
||||||
|
from common_utils import get_wav_data
|
||||||
|
from common_utils import load_wav
|
||||||
|
from common_utils import nested_params
|
||||||
|
from common_utils import TempDirMixin
|
||||||
|
from paddleaudio.backends import soundfile_backend
|
||||||
|
|
||||||
|
|
||||||
|
class MockedSaveTest(unittest.TestCase):
    """Checks the arguments `soundfile_backend.save` passes to a mocked `soundfile.write`."""

    @nested_params(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [
            (None, None),
            ("PCM_U", None),
            ("PCM_U", 8),
            ("PCM_S", None),
            ("PCM_S", 16),
            ("PCM_S", 32),
            ("PCM_F", None),
            ("PCM_F", 32),
            ("PCM_F", 64),
            ("ULAW", None),
            ("ULAW", 8),
            ("ALAW", None),
            ("ALAW", 8),
        ], )
    @patch("soundfile.write")
    def test_wav(self, dtype, sample_rate, num_channels, channels_first,
                 enc_params, mocked_write):
        """For wav, `save` forwards the subtype given by `fetch_wav_subtype` and format None."""
        target = "foo.wav"
        encoding, bits_per_sample = enc_params
        waveform = get_wav_data(
            dtype,
            num_channels,
            num_frames=3 * sample_rate,
            normalize=(dtype == "float32"),
            channels_first=channels_first, )
        waveform = paddle.transpose(waveform, [1, 0])

        soundfile_backend.save(
            target,
            waveform,
            sample_rate,
            channels_first=channels_first,
            encoding=encoding,
            bits_per_sample=bits_per_sample, )

        # call_args[1] holds the keyword arguments of the last call
        # (call_args.kwargs is the more descriptive spelling on Python 3.8+).
        write_kwargs = mocked_write.call_args[1]
        assert write_kwargs["file"] == target
        assert write_kwargs["samplerate"] == sample_rate
        assert write_kwargs["subtype"] == fetch_wav_subtype(dtype, encoding,
                                                            bits_per_sample)
        assert write_kwargs["format"] is None
        if channels_first:
            wanted = paddle.transpose(waveform, [1, 0])
        else:
            wanted = waveform
        np.testing.assert_array_almost_equal(write_kwargs["data"].numpy(),
                                             wanted.numpy())

    @patch("soundfile.write")
    def assert_non_wav(
            self,
            fmt,
            dtype,
            sample_rate,
            num_channels,
            channels_first,
            mocked_write,
            encoding=None,
            bits_per_sample=None, ):
        """For non-wav targets, check the file, samplerate, format and data passed on.

        The format is expected to be "NIST" for the sph/nist/nis extensions
        and None for anything else.
        """
        target = f"foo.{fmt}"
        waveform = get_wav_data(
            dtype,
            num_channels,
            num_frames=3 * sample_rate,
            normalize=False,
            channels_first=channels_first, )
        waveform = paddle.transpose(waveform, [1, 0])

        if channels_first:
            wanted = paddle.transpose(waveform, [1, 0])
        else:
            wanted = waveform

        soundfile_backend.save(
            target,
            waveform,
            sample_rate,
            channels_first,
            encoding=encoding,
            bits_per_sample=bits_per_sample, )

        # call_args[1] holds the keyword arguments of the last call
        # (call_args.kwargs is the more descriptive spelling on Python 3.8+).
        write_kwargs = mocked_write.call_args[1]
        assert write_kwargs["file"] == target
        assert write_kwargs["samplerate"] == sample_rate
        if fmt not in ["sph", "nist", "nis"]:
            assert write_kwargs["format"] is None
        else:
            assert write_kwargs["format"] == "NIST"
        np.testing.assert_array_almost_equal(write_kwargs["data"].numpy(),
                                             wanted.numpy())

    @nested_params(
        ["sph", "nist", "nis"],
        ["int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [
            ("PCM_S", 8),
            ("PCM_S", 16),
            ("PCM_S", 24),
            ("PCM_S", 32),
            ("ULAW", 8),
            ("ALAW", 8),
            ("ALAW", 16),
            ("ALAW", 24),
            ("ALAW", 32),
        ], )
    def test_sph(self, fmt, dtype, sample_rate, num_channels, channels_first,
                 enc_params):
        """sph/nist/nis: `save` passes format "NIST" to soundfile.write"""
        encoding, bits_per_sample = enc_params
        self.assert_non_wav(
            fmt,
            dtype,
            sample_rate,
            num_channels,
            channels_first,
            encoding=encoding,
            bits_per_sample=bits_per_sample)

    @parameterize(
        ["int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [8, 16, 24], )
    def test_flac(self, dtype, sample_rate, num_channels, channels_first,
                  bits_per_sample):
        """flac: `save` passes format None to soundfile.write"""
        self.assert_non_wav(
            "flac",
            dtype,
            sample_rate,
            num_channels,
            channels_first,
            bits_per_sample=bits_per_sample)

    @parameterize(
        ["int32"],
        [8000, 16000],
        [1, 2],
        [False, True], )
    def test_ogg(self, dtype, sample_rate, num_channels, channels_first):
        """ogg: `save` passes format None to soundfile.write"""
        self.assert_non_wav("ogg", dtype, sample_rate, num_channels,
                            channels_first)
|
||||||
|
|
||||||
|
|
||||||
|
class SaveTestBase(TempDirMixin, unittest.TestCase):
    """Shared assertions for files written by `soundfile_backend.save`."""

    def assert_wav(self, dtype, sample_rate, num_channels, num_frames):
        """A wav written by `save` reads back identically through `load_wav`."""
        wav_path = self.get_temp_path("data.wav")
        original = get_wav_data(
            dtype, num_channels, num_frames=num_frames, normalize=False)
        soundfile_backend.save(wav_path, original, sample_rate)
        reloaded, rate = load_wav(wav_path, normalize=False)
        assert sample_rate == rate
        np.testing.assert_array_almost_equal(reloaded.numpy(),
                                             original.numpy())

    def _assert_non_wav(self, fmt, dtype, sample_rate, num_channels):
        """A non-wav file written by `save` carries the expected metadata.

        Because of precision mismatch, and the lack of an alternative way to
        decode the resulting files without soundfile, only metadata (format,
        channel count, sample rate) is validated.
        """
        out_path = self.get_temp_path(f"data.{fmt}")
        original = get_wav_data(
            dtype, num_channels, num_frames=sample_rate * 3, normalize=False)
        soundfile_backend.save(out_path, original, sample_rate)
        meta = soundfile.info(out_path)
        assert meta.format == fmt.upper()
        # The frame count is not compared: that assertion is disabled in the
        # original source with the remark that it goes wrong.
        assert meta.channels == num_channels
        assert meta.samplerate == sample_rate

    def assert_flac(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.save` writes flac with the expected metadata."""
        self._assert_non_wav("flac", dtype, sample_rate, num_channels)

    def assert_sphere(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.save` writes sphere (".nist" extension) with the expected metadata."""
        self._assert_non_wav("nist", dtype, sample_rate, num_channels)

    def assert_ogg(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.save` writes ogg with the expected metadata.

        OGG is lossy, so the sample data cannot be inspected; only metadata is checked.
        """
        self._assert_non_wav("ogg", dtype, sample_rate, num_channels)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSave(SaveTestBase):
    """End-to-end checks of `soundfile_backend.save` for each supported format."""

    @parameterize(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2], )
    def test_wav(self, dtype, sample_rate, num_channels):
        """wav files are saved correctly."""
        self.assert_wav(dtype, sample_rate, num_channels, num_frames=None)

    @parameterize(
        ["float32", "int32"],
        [4, 8, 16, 32], )
    def test_multiple_channels(self, dtype, num_channels):
        """wav files with more than 2 channels are saved correctly."""
        self.assert_wav(dtype, 8000, num_channels, num_frames=None)

    @parameterize(
        ["int32"],
        [8000, 16000],
        [1, 2], )
    @skipIfFormatNotSupported("NIST")
    def test_sphere(self, dtype, sample_rate, num_channels):
        """sph files are saved with the expected metadata."""
        self.assert_sphere(dtype, sample_rate, num_channels)

    @parameterize(
        [8000, 16000],
        [1, 2], )
    @skipIfFormatNotSupported("FLAC")
    def test_flac(self, sample_rate, num_channels):
        """flac files are saved with the expected metadata."""
        self.assert_flac("float32", sample_rate, num_channels)

    @parameterize(
        [8000, 16000],
        [1, 2], )
    @skipIfFormatNotSupported("OGG")
    def test_ogg(self, sample_rate, num_channels):
        """ogg/vorbis files are saved with the expected metadata."""
        self.assert_ogg("float32", sample_rate, num_channels)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSaveParams(TempDirMixin, unittest.TestCase):
    """Optional parameters of `soundfile_backend.save` behave as expected."""

    @parameterize([True, False])
    def test_channels_first(self, channels_first):
        """channels_first swaps axes"""
        wav_path = self.get_temp_path("data.wav")
        samples = get_wav_data("int32", 2, channels_first=channels_first)
        soundfile_backend.save(
            wav_path, samples, 8000, channels_first=channels_first)
        reloaded = load_wav(wav_path)[0]
        # The reloaded data is compared in the channels-first layout, so
        # channels-last input is transposed before the comparison.
        if channels_first:
            wanted = samples
        else:
            wanted = samples.transpose([1, 0])
        np.testing.assert_array_almost_equal(reloaded.numpy(), wanted.numpy())
|
||||||
|
|
||||||
|
|
||||||
|
class TestFileObject(TempDirMixin, unittest.TestCase):
    """`soundfile_backend.save` can write into an in-memory file-like object."""

    def _test_fileobj(self, ext):
        """Saving audio to file-like object works"""
        sample_rate = 16000
        reference_path = self.get_temp_path(f"test.{ext}")
        waveform = get_wav_data("float32", num_channels=2)

        # Reference: write with soundfile directly and read the result back.
        if ext == "wav":
            subtype = "FLOAT"
        else:
            subtype = None
        soundfile.write(
            reference_path, waveform.numpy().T, sample_rate, subtype=subtype)
        expected, _ = soundfile.read(reference_path, dtype="float32")

        # Under test: save into a BytesIO, rewind it and decode with soundfile.
        buffer = io.BytesIO()
        soundfile_backend.save(buffer, waveform, sample_rate, format=ext)
        buffer.seek(0)
        found, rate = soundfile.read(buffer, dtype="float32")

        assert rate == sample_rate
        np.testing.assert_array_almost_equal(found, expected)

    def test_fileobj_wav(self):
        """Saving wav via file-like object works"""
        self._test_fileobj("wav")

    @skipIfFormatNotSupported("FLAC")
    def test_fileobj_flac(self):
        """Saving flac via file-like object works"""
        self._test_fileobj("flac")

    @skipIfFormatNotSupported("NIST")
    def test_fileobj_nist(self):
        """Saving NIST via file-like object works"""
        self._test_fileobj("NIST")

    @skipIfFormatNotSupported("OGG")
    def test_fileobj_ogg(self):
        """Saving OGG via file-like object works"""
        self._test_fileobj("OGG")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
@ -0,0 +1,89 @@
|
|||||||
|
import itertools
|
||||||
|
from unittest import skipIf
|
||||||
|
|
||||||
|
from paddleaudio._internal.module_utils import is_module_available
|
||||||
|
from parameterized import parameterized
|
||||||
|
|
||||||
|
|
||||||
|
def name_func(func, _, params):
    """Build a test name from the function name and its positional parameters.

    The result is ``<func.__name__>_<arg1>_<arg2>...`` where each entry of
    ``params.args`` is rendered with ``str``. The second argument is unused.
    """
    rendered_args = "_".join(map(str, params.args))
    return "{}_{}".format(func.__name__, rendered_args)
|
||||||
|
|
||||||
|
|
||||||
|
def dtype2subtype(dtype):
    """Map a dtype name to the matching soundfile subtype string.

    Raises KeyError for a dtype name that is not listed.
    """
    subtype_by_dtype = dict(
        float64="DOUBLE",
        float32="FLOAT",
        int32="PCM_32",
        int16="PCM_16",
        uint8="PCM_U8",
        int8="PCM_S8", )
    return subtype_by_dtype[dtype]
|
||||||
|
|
||||||
|
|
||||||
|
def skipIfFormatNotSupported(fmt):
    """Return a skip decorator for tests that need soundfile support for ``fmt``.

    The test is skipped when the soundfile module is unavailable, or when
    ``fmt`` is not among `soundfile.available_formats()`.
    """
    if not is_module_available("soundfile"):
        return skipIf(True, '"soundfile" not available.')

    # Imported here so this module can still be imported without soundfile.
    import soundfile

    supported = soundfile.available_formats()
    unsupported = fmt not in supported
    return skipIf(unsupported, f'"{fmt}" is not supported by soundfile')
|
||||||
|
|
||||||
|
|
||||||
|
def parameterize(*params):
    """Expand a test over the Cartesian product of the given value lists.

    Each generated test case is named with `name_func`.
    """
    combinations = list(itertools.product(*params))
    return parameterized.expand(combinations, name_func=name_func)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_wav_subtype(dtype, encoding, bits_per_sample):
    """Return the wav subtype expected for an (encoding, bits_per_sample) pair.

    With neither value given, the subtype follows from ``dtype``.
    Raises ValueError when the pair is not listed.
    """
    # Looked up first, for every call, so an unlisted dtype raises KeyError
    # regardless of the encoding requested.
    dtype_default = dtype2subtype(dtype)
    subtype_by_request = {
        (None, None): dtype_default,
        (None, 8): "PCM_U8",
        ("PCM_U", None): "PCM_U8",
        ("PCM_U", 8): "PCM_U8",
        ("PCM_S", None): "PCM_32",
        ("PCM_S", 16): "PCM_16",
        ("PCM_S", 32): "PCM_32",
        ("PCM_F", None): "FLOAT",
        ("PCM_F", 32): "FLOAT",
        ("PCM_F", 64): "DOUBLE",
        ("ULAW", None): "ULAW",
        ("ULAW", 8): "ULAW",
        ("ALAW", None): "ALAW",
        ("ALAW", 8): "ALAW",
    }
    request = (encoding, bits_per_sample)
    resolved = subtype_by_request.get(request)
    if not resolved:
        raise ValueError(
            f"wav does not support ({encoding}, {bits_per_sample}).")
    return resolved
|
||||||
|
|
||||||
|
def get_encoding(ext, dtype):
    """Return the encoding name for an extension / dtype pair.

    For "mp3", "flac" and "vorbis" the upper-cased extension is returned;
    otherwise the encoding is looked up from ``dtype`` (KeyError if unlisted).
    """
    if ext in {"mp3", "flac", "vorbis"}:
        return ext.upper()
    encoding_by_dtype = dict(
        float32="PCM_F",
        int32="PCM_S",
        int16="PCM_S",
        uint8="PCM_U", )
    return encoding_by_dtype[dtype]
|
||||||
|
|
||||||
|
|
||||||
|
def get_bit_depth(dtype):
    """Return the number of bits per sample for a dtype name.

    Raises KeyError for a dtype name that is not listed.
    """
    depth_by_dtype = dict(float32=32, int32=32, int16=16, uint8=8)
    return depth_by_dtype[dtype]
|
||||||
|
|
||||||
|
def get_bits_per_sample(ext, dtype):
    """Return the bits-per-sample value expected for an extension / dtype pair.

    "flac" maps to 24 and "mp3" / "vorbis" map to 0, independent of ``dtype``.
    Any other extension falls back to `get_bit_depth(dtype)`, which raises
    KeyError for a dtype it does not list.
    """
    fixed_by_ext = {
        "flac": 24,
        "mp3": 0,
        "vorbis": 0,
    }
    if ext in fixed_by_ext:
        return fixed_by_ext[ext]
    # Consult the dtype only when the extension does not decide the answer.
    # Passing it as a `dict.get` default would evaluate it eagerly and raise
    # KeyError for an unlisted dtype even when ``ext`` is "flac", "mp3" or "vorbis".
    return get_bit_depth(dtype)
|
@ -0,0 +1 @@
|
|||||||
|
../../common_utils
|
@ -0,0 +1,322 @@
|
|||||||
|
import io
|
||||||
|
import itertools
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import tarfile
|
||||||
|
import unittest
|
||||||
|
from contextlib import contextmanager
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
import warnings
|
||||||
|
warnings.warn("sox io not support in Windows, please skip test.")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
from parameterized import parameterized
|
||||||
|
from common import get_bits_per_sample, get_encoding
|
||||||
|
|
||||||
|
from paddleaudio.backends import sox_io_backend
|
||||||
|
|
||||||
|
from common_utils import (
|
||||||
|
get_wav_data,
|
||||||
|
save_wav,
|
||||||
|
TempDirMixin,
|
||||||
|
sox_utils, )
|
||||||
|
|
||||||
|
#code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/info_test.py
|
||||||
|
|
||||||
|
|
||||||
|
class TestInfo(TempDirMixin, unittest.TestCase):
    """Checks the metadata `sox_io_backend.info` reports for files on disk."""

    def _query_generated(self, sox_encoding, sample_rate, num_channels,
                         duration):
        """Generate an 8-bit "data.wav" via `sox_utils.gen_audio_file` and query it."""
        path = self.get_temp_path("data.wav")
        sox_utils.gen_audio_file(
            path,
            sample_rate=sample_rate,
            num_channels=num_channels,
            bit_depth=8,
            encoding=sox_encoding,
            duration=duration)
        return sox_io_backend.info(path)

    @parameterized.expand(
        list(itertools.product(
            ["float32", "int32"],
            [8000, 16000],
            [1, 2], )), )
    def test_wav(self, dtype, sample_rate, num_channels):
        """`sox_io_backend.info` can check wav file correctly"""
        duration = 1
        path = self.get_temp_path("data.wav")
        waveform = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=duration * sample_rate)
        save_wav(path, waveform, sample_rate)

        info = sox_io_backend.info(path)

        assert info.sample_rate == sample_rate
        assert info.num_frames == sample_rate * duration
        assert info.num_channels == num_channels
        assert info.bits_per_sample == sox_utils.get_bit_depth(dtype)
        assert info.encoding == get_encoding("wav", dtype)

    @parameterized.expand(
        list(itertools.product(
            ["float32", "int32"],
            [8000, 16000],
            [4, 8, 16, 32], )), )
    def test_wav_multiple_channels(self, dtype, sample_rate, num_channels):
        """`sox_io_backend.info` can check wav file with channels more than 2 correctly"""
        duration = 1
        path = self.get_temp_path("data.wav")
        waveform = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=duration * sample_rate)
        save_wav(path, waveform, sample_rate)

        info = sox_io_backend.info(path)

        # The encoding is intentionally not checked in this test.
        assert info.sample_rate == sample_rate
        assert info.num_frames == sample_rate * duration
        assert info.num_channels == num_channels
        assert info.bits_per_sample == sox_utils.get_bit_depth(dtype)

    def test_ulaw(self):
        """`sox_io_backend.info` can check ulaw file correctly"""
        duration, num_channels, sample_rate = 1, 1, 8000

        info = self._query_generated("u-law", sample_rate, num_channels,
                                     duration)

        assert info.sample_rate == sample_rate
        assert info.num_frames == sample_rate * duration
        assert info.num_channels == num_channels
        assert info.bits_per_sample == 8
        assert info.encoding == "ULAW"

    def test_alaw(self):
        """`sox_io_backend.info` can check alaw file correctly"""
        duration, num_channels, sample_rate = 1, 1, 8000

        info = self._query_generated("a-law", sample_rate, num_channels,
                                     duration)

        assert info.sample_rate == sample_rate
        assert info.num_frames == sample_rate * duration
        assert info.num_channels == num_channels
        assert info.bits_per_sample == 8
        assert info.encoding == "ALAW"
|
||||||
|
|
||||||
|
|
||||||
|
#class TestInfoOpus(unittest.TestCase):
|
||||||
|
#@parameterized.expand(
|
||||||
|
#list(
|
||||||
|
#itertools.product(
|
||||||
|
#["96k"],
|
||||||
|
#[1, 2],
|
||||||
|
#[0, 5, 10],
|
||||||
|
#)
|
||||||
|
#),
|
||||||
|
#)
|
||||||
|
#def test_opus(self, bitrate, num_channels, compression_level):
|
||||||
|
#"""`sox_io_backend.info` can check opus file correcty"""
|
||||||
|
#path = data_utils.get_asset_path("io", f"{bitrate}_{compression_level}_{num_channels}ch.opus")
|
||||||
|
#info = sox_io_backend.info(path)
|
||||||
|
#assert info.sample_rate == 48000
|
||||||
|
#assert info.num_frames == 32768
|
||||||
|
#assert info.num_channels == num_channels
|
||||||
|
#assert info.bits_per_sample == 0 # bit_per_sample is irrelevant for compressed formats
|
||||||
|
#assert info.encoding == "OPUS"
|
||||||
|
|
||||||
|
|
||||||
|
class FileObjTestBase(TempDirMixin):
    """Mixin that generates audio fixture files through `sox_utils`."""

    def _gen_file(self,
                  ext,
                  dtype,
                  sample_rate,
                  num_channels,
                  num_frames,
                  *,
                  comments=None):
        """Generate ``test.<ext>`` in the temp directory and return its path.

        Args:
            ext: File extension, used to build the file name.
            dtype: Dtype name; determines bit depth and encoding via
                ``sox_utils.get_bit_depth`` / ``sox_utils.get_encoding``.
            sample_rate: Sample rate passed to the generator.
            num_channels: Channel count passed to the generator.
            num_frames: Frame count; converted to a duration by dividing by
                ``sample_rate``.
            comments: Optional lines written to a comment file that is
                handed to the generator. Falsy values mean no comment file.
        """
        path = self.get_temp_path(f"test.{ext}")
        bit_depth = sox_utils.get_bit_depth(dtype)
        duration = num_frames / sample_rate

        if comments:
            comment_file = self._gen_comment_file(comments)
        else:
            comment_file = None

        sox_utils.gen_audio_file(
            path,
            sample_rate,
            num_channels=num_channels,
            encoding=sox_utils.get_encoding(dtype),
            bit_depth=bit_depth,
            duration=duration,
            comment_file=comment_file, )
        return path

    def _gen_comment_file(self, comments):
        """Write ``comments`` to ``comment.txt`` in the temp directory; return its path."""
        comment_path = self.get_temp_path("comment.txt")
        with open(comment_path, "w") as handle:
            handle.writelines(comments)
        return comment_path
|
||||||
|
|
||||||
|
|
||||||
|
class Unseekable:
    """Wrapper that exposes only ``read`` of the wrapped file object."""

    def __init__(self, fileobj):
        # Keep a reference to the underlying object; nothing else is forwarded.
        self.fileobj = fileobj

    def read(self, n):
        """Read up to ``n`` units from the wrapped file object."""
        chunk = self.fileobj.read(n)
        return chunk
|
||||||
|
|
||||||
|
|
||||||
|
class TestFileObject(FileObjTestBase, unittest.TestCase):
    """Checks `sox_io_backend.info` when the input is a file-like object."""

    @staticmethod
    def _format_hint(ext):
        """Return the explicit format passed to `info`: only "mp3" gets one."""
        return ext if ext in ["mp3"] else None

    def _query_fileobj(self,
                       ext,
                       dtype,
                       sample_rate,
                       num_channels,
                       num_frames,
                       *,
                       comments=None):
        """Generate a file and query it through a handle opened in "rb" mode."""
        path = self._gen_file(
            ext,
            dtype,
            sample_rate,
            num_channels,
            num_frames,
            comments=comments)
        format_ = self._format_hint(ext)
        with open(path, "rb") as fileobj:
            return sox_io_backend.info(fileobj, format_)

    def _query_bytesio(self, ext, dtype, sample_rate, num_channels, num_frames):
        """Generate a file and query its bytes through an in-memory `io.BytesIO`."""
        path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames)
        format_ = self._format_hint(ext)
        with open(path, "rb") as file_:
            fileobj = io.BytesIO(file_.read())
        return sox_io_backend.info(fileobj, format_)

    def _query_tarfile(self, ext, dtype, sample_rate, num_channels, num_frames):
        """Generate a file, pack it into a tar archive and query the extracted member."""
        audio_path = self._gen_file(ext, dtype, sample_rate, num_channels,
                                    num_frames)
        audio_file = os.path.basename(audio_path)
        # `tarfile.TarFile` is used for both writing and reading, so the
        # archive is a plain tar despite the ".tar.gz" file name.
        archive_path = self.get_temp_path("archive.tar.gz")
        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(audio_path, arcname=audio_file)
        format_ = self._format_hint(ext)
        with tarfile.TarFile(archive_path, "r") as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            return sox_io_backend.info(fileobj, format_)

    @contextmanager
    def _set_buffer_size(self, buffer_size):
        """Temporarily set the buffer size, restoring the previous value on exit.

        The previous value is read before entering ``try`` so that a failure
        while reading it is reported as-is, instead of being masked by an
        unbound-variable error in the ``finally`` clause.
        """
        # NOTE(review): `get_buffer_size` / `set_buffer_size` are not imported
        # in the visible part of this module - confirm where they come from
        # before using this helper.
        original_buffer_size = get_buffer_size()
        try:
            set_buffer_size(buffer_size)
            yield
        finally:
            set_buffer_size(original_buffer_size)

    @parameterized.expand([
        ("wav", "float32"),
        ("wav", "int32"),
        ("wav", "int16"),
        ("wav", "uint8"),
    ])
    def test_fileobj(self, ext, dtype):
        """Querying audio via file object works"""
        sample_rate = 16000
        num_frames = 3 * sample_rate
        num_channels = 2
        sinfo = self._query_fileobj(ext, dtype, sample_rate, num_channels,
                                    num_frames)

        bits_per_sample = get_bits_per_sample(ext, dtype)
        num_frames = 0 if ext in ["mp3", "vorbis"] else num_frames

        assert sinfo.sample_rate == sample_rate
        assert sinfo.num_channels == num_channels
        assert sinfo.num_frames == num_frames
        assert sinfo.bits_per_sample == bits_per_sample
        assert sinfo.encoding == get_encoding(ext, dtype)

    @parameterized.expand([
        ("wav", "float32"),
        ("wav", "int32"),
        ("wav", "int16"),
        ("wav", "uint8"),
    ])
    def test_bytesio(self, ext, dtype):
        """Querying audio via BytesIO object works"""
        sample_rate = 16000
        num_frames = 3 * sample_rate
        num_channels = 2
        sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels,
                                    num_frames)

        bits_per_sample = get_bits_per_sample(ext, dtype)
        num_frames = 0 if ext in ["mp3", "vorbis"] else num_frames

        assert sinfo.sample_rate == sample_rate
        assert sinfo.num_channels == num_channels
        assert sinfo.num_frames == num_frames
        assert sinfo.bits_per_sample == bits_per_sample
        assert sinfo.encoding == get_encoding(ext, dtype)

    @parameterized.expand([
        ("wav", "float32"),
        ("wav", "int32"),
        ("wav", "int16"),
        ("wav", "uint8"),
    ])
    def test_bytesio_tiny(self, ext, dtype):
        """Querying audio via BytesIO object works for tiny data (4 frames)"""
        sample_rate = 8000
        num_frames = 4
        num_channels = 2
        sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels,
                                    num_frames)

        bits_per_sample = get_bits_per_sample(ext, dtype)
        num_frames = 0 if ext in ["mp3", "vorbis"] else num_frames

        assert sinfo.sample_rate == sample_rate
        assert sinfo.num_channels == num_channels
        assert sinfo.num_frames == num_frames
        assert sinfo.bits_per_sample == bits_per_sample
        assert sinfo.encoding == get_encoding(ext, dtype)

    @parameterized.expand([
        ("wav", "float32"),
        ("wav", "int32"),
        ("wav", "int16"),
        ("wav", "uint8"),
        ("flac", "float32"),
        ("vorbis", "float32"),
        ("amb", "int16"),
    ])
    def test_tarfile(self, ext, dtype):
        """Querying audio via a file-like object extracted from a tar archive works"""
        sample_rate = 16000
        # Integer frame count, consistent with the other tests; the derived
        # duration (num_frames / sample_rate) is unchanged.
        num_frames = 3 * sample_rate
        num_channels = 2
        sinfo = self._query_tarfile(ext, dtype, sample_rate, num_channels,
                                    num_frames)

        bits_per_sample = get_bits_per_sample(ext, dtype)
        num_frames = 0 if ext in ["vorbis"] else num_frames

        assert sinfo.sample_rate == sample_rate
        assert sinfo.num_channels == num_channels
        assert sinfo.num_frames == num_frames
        assert sinfo.bits_per_sample == bits_per_sample
        assert sinfo.encoding == get_encoding(ext, dtype)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
@ -0,0 +1,56 @@
|
|||||||
|
import itertools
|
||||||
|
import platform
|
||||||
|
import unittest
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
import warnings
|
||||||
|
warnings.warn("sox io not support in Windows, please skip test.")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
from parameterized import parameterized
|
||||||
|
import numpy as np
|
||||||
|
from paddleaudio.backends import sox_io_backend
|
||||||
|
|
||||||
|
from common_utils import (
|
||||||
|
get_wav_data,
|
||||||
|
load_wav,
|
||||||
|
save_wav, )
|
||||||
|
|
||||||
|
#code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/load_test.py
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoad(unittest.TestCase):
    """Checks that `sox_io_backend.load` reads wav files correctly."""

    def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration):
        """`sox_io_backend.load` can load wav format correctly.

        Wav data loaded with the sox_io backend should match the reference
        returned by `load_wav` for the same file (to 4 decimal places), and
        the reported sample rate should equal the one used when saving.

        The file is written into a fresh temporary directory instead of a
        path relative to the current working directory, so the test does not
        depend on a pre-existing folder and leaves nothing behind.
        """
        # Local imports keep this block self-contained.
        import os
        import tempfile

        data = get_wav_data(
            dtype,
            num_channels,
            normalize=normalize,
            num_frames=duration * sample_rate)
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "reference.wav")
            save_wav(path, data, sample_rate)
            expected = load_wav(path, normalize=normalize)[0]
            data, sr = sox_io_backend.load(path, normalize=normalize)
            assert sr == sample_rate
            np.testing.assert_array_almost_equal(data, expected, decimal=4)

    @parameterized.expand(
        list(
            itertools.product(
                [
                    "float64",
                    "float32",
                    "int32",
                ],
                [8000, 16000],
                [1, 2],
                [False, True], )), )
    def test_wav(self, dtype, sample_rate, num_channels, normalize):
        """`sox_io_backend.load` can load wav format correctly."""
        self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
@ -0,0 +1,188 @@
|
|||||||
|
import io
|
||||||
|
import platform
|
||||||
|
import unittest
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
import warnings
|
||||||
|
warnings.warn("sox io not support in Windows, please skip test.")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from paddleaudio.backends import sox_io_backend
|
||||||
|
|
||||||
|
from common_utils import (get_wav_data, load_wav, save_wav, nested_params,
|
||||||
|
TempDirMixin, sox_utils)
|
||||||
|
|
||||||
|
#code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/save_test.py
|
||||||
|
|
||||||
|
|
||||||
|
def _get_sox_encoding(encoding):
|
||||||
|
encodings = {
|
||||||
|
"PCM_F": "floating-point",
|
||||||
|
"PCM_S": "signed-integer",
|
||||||
|
"PCM_U": "unsigned-integer",
|
||||||
|
"ULAW": "u-law",
|
||||||
|
"ALAW": "a-law",
|
||||||
|
}
|
||||||
|
return encodings.get(encoding)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSaveBase(TempDirMixin):
    """Shared machinery for comparing `sox_io_backend.save` with the `sox` tooling."""

    def assert_save_consistency(
            self,
            format: str,
            *,
            compression: float=None,
            encoding: str=None,
            bits_per_sample: int=None,
            sample_rate: float=8000,
            num_channels: int=2,
            num_frames: float=3 * 8000,
            src_dtype: str="int32",
            test_mode: str="path", ):
        """`save` produces a file that is comparable with the one from `sox`.

        Both outputs are converted back to 32-bit floating-point wav through
        `sox_utils.convert_audio_file` and loaded with `load_wav`, so the
        comparison does not require reading the target format directly.

        Steps:
            1.   Generate the source wav (`get_wav_data` + `save_wav`).
            2.1. Load it with `load_wav` and save it in the target format
                 with `sox_io_backend.save` (to a path, an open file, or a
                 `BytesIO`, depending on ``test_mode``).
            2.2. Convert that file to wav with `sox_utils`.
            2.3. Load the result -> ``found``.
            3.1. Convert the source wav to the target format with `sox_utils`.
            3.2. Convert that file to wav with `sox_utils`.
            3.3. Load the result -> ``expected``.
            Finally, ``found`` and ``expected`` must be almost equal.

        Raises:
            ValueError: If ``test_mode`` is not "path", "fileobj" or "bytesio".
        """
        cmp_encoding = "floating-point"
        cmp_bit_depth = 32

        src_path = self.get_temp_path("1.source.wav")
        tgt_path = self.get_temp_path(f"2.1.paddleaudio.{format}")
        tst_path = self.get_temp_path("2.2.result.wav")
        sox_path = self.get_temp_path(f"3.1.sox.{format}")
        ref_path = self.get_temp_path("3.2.ref.wav")

        # 1. Generate original wav
        source = get_wav_data(
            src_dtype, num_channels, normalize=False, num_frames=num_frames)
        save_wav(src_path, source, sample_rate)

        # 2.1. Convert the original wav to target format with paddleaudio
        data = load_wav(src_path, normalize=False)[0]
        save_options = dict(
            compression=compression,
            encoding=encoding,
            bits_per_sample=bits_per_sample)
        if test_mode == "path":
            # The format is not passed explicitly when saving to a path.
            sox_io_backend.save(tgt_path, data, sample_rate, **save_options)
        elif test_mode == "fileobj":
            with open(tgt_path, "bw") as sink:
                sox_io_backend.save(
                    sink, data, sample_rate, format=format, **save_options)
        elif test_mode == "bytesio":
            buffer = io.BytesIO()
            sox_io_backend.save(
                buffer, data, sample_rate, format=format, **save_options)
            with open(tgt_path, "bw") as sink:
                sink.write(buffer.getvalue())
        else:
            raise ValueError(f"Unexpected test mode: {test_mode}")

        # 2.2. Convert the target format to wav with sox
        sox_utils.convert_audio_file(
            tgt_path, tst_path, encoding=cmp_encoding, bit_depth=cmp_bit_depth)
        # 2.3. Load the converted result
        found = load_wav(tst_path, normalize=False)[0]

        # 3.1. Convert the original wav to target format with sox
        sox_utils.convert_audio_file(
            src_path,
            sox_path,
            compression=compression,
            encoding=_get_sox_encoding(encoding),
            bit_depth=bits_per_sample)
        # 3.2. Convert the target format to wav with sox
        sox_utils.convert_audio_file(
            sox_path, ref_path, encoding=cmp_encoding, bit_depth=cmp_bit_depth)
        # 3.3. Load the reference
        expected = load_wav(ref_path, normalize=False)[0]

        np.testing.assert_array_almost_equal(found, expected)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSave(TestSaveBase, unittest.TestCase):
    """Consistency checks for saving wav files through `sox_io_backend.save`."""

    @nested_params(
        ["path"],
        [
            ("PCM_U", 8),
            ("PCM_S", 16),
            ("PCM_S", 32),
            ("PCM_F", 32),
            ("PCM_F", 64),
            ("ULAW", 8),
            ("ALAW", 8),
        ], )
    def test_save_wav(self, test_mode, enc_params):
        """Saving wav with each (encoding, bits_per_sample) pair is consistent."""
        encoding, bits_per_sample = enc_params
        self.assert_save_consistency(
            "wav",
            encoding=encoding,
            bits_per_sample=bits_per_sample,
            test_mode=test_mode)

    @nested_params(
        ["path"],
        [
            ("float32", ),
            ("int32", ),
        ], )
    def test_save_wav_dtype(self, test_mode, params):
        """Saving wav from each source dtype is consistent."""
        [dtype] = params
        self.assert_save_consistency(
            "wav", src_dtype=dtype, test_mode=test_mode)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
@ -0,0 +1,189 @@
|
|||||||
|
import io
|
||||||
|
import itertools
|
||||||
|
import platform
|
||||||
|
import unittest
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
import warnings
|
||||||
|
warnings.warn("sox io not support in Windows, please skip test.")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
from parameterized import parameterized
|
||||||
|
from paddleaudio.backends import sox_io_backend
|
||||||
|
from common_utils import (get_wav_data, TempDirMixin, name_func)
|
||||||
|
|
||||||
|
|
||||||
|
class SmokeTest(TempDirMixin, unittest.TestCase):
    """Smoke tests for save / info / load on files in a temp directory.

    These tests only verify that the sox_io_backend functions run and report
    a consistent sample rate and channel count. They need no external tools
    (such as the sox command), but for the same reason they cannot verify
    that the decoded content is correct.
    """

    def run_smoke_test(self,
                       ext,
                       sample_rate,
                       num_channels,
                       *,
                       compression=None,
                       dtype="float32"):
        """Save one second of data as ``test.<ext>``, then run info and load on it."""
        duration = 1
        num_frames = sample_rate * duration
        path = self.get_temp_path(f"test.{ext}")
        original = get_wav_data(
            dtype, num_channels, normalize=False, num_frames=num_frames)

        # 1. run save
        sox_io_backend.save(
            path, original, sample_rate, compression=compression)

        # 2. run info
        info = sox_io_backend.info(path)
        assert info.sample_rate == sample_rate
        assert info.num_channels == num_channels

        # 3. run load
        loaded, sr = sox_io_backend.load(path, normalize=False)
        assert sr == sample_rate
        assert loaded.shape[0] == num_channels

    @parameterized.expand(
        list(itertools.product(
            ["float32", "int32"],
            [8000, 16000],
            [1, 2], )),
        name_func=name_func, )
    def test_wav(self, dtype, sample_rate, num_channels):
        """Run smoke test on wav format"""
        self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype)

    # The mp3 and vorbis smoke tests (and the "int16" / "uint8" wav variants)
    # are disabled in this suite.

    @parameterized.expand(
        list(itertools.product(
            [8000, 16000],
            [1, 2],
            list(range(9)), )),
        name_func=name_func, )
    def test_flac(self, sample_rate, num_channels, compression_level):
        """Run smoke test on flac format"""
        self.run_smoke_test(
            "flac", sample_rate, num_channels, compression=compression_level)
|
||||||
|
|
||||||
|
|
||||||
|
class SmokeTestFileObj(unittest.TestCase):
    """Smoke tests for save / info / load on an in-memory file object.

    These tests only verify that the sox_io_backend functions run and report
    a consistent sample rate and channel count. They need no external tools
    (such as the sox command), but for the same reason they cannot verify
    that the decoded content is correct.
    """

    def run_smoke_test(self,
                       ext,
                       sample_rate,
                       num_channels,
                       *,
                       compression=None,
                       dtype="float32"):
        """Save one second of data into a `BytesIO`, then run info and load on it."""
        duration = 1
        num_frames = sample_rate * duration
        original = get_wav_data(
            dtype, num_channels, normalize=False, num_frames=num_frames)

        buffer = io.BytesIO()

        # 1. run save
        sox_io_backend.save(
            buffer, original, sample_rate, compression=compression, format=ext)

        # 2. run info (rewind first so reading starts at the beginning)
        buffer.seek(0)
        info = sox_io_backend.info(buffer, format=ext)
        assert info.sample_rate == sample_rate
        assert info.num_channels == num_channels

        # 3. run load (rewind again after info consumed the stream)
        buffer.seek(0)
        loaded, sr = sox_io_backend.load(buffer, normalize=False, format=ext)
        assert sr == sample_rate
        assert loaded.shape[0] == num_channels

    @parameterized.expand(
        list(itertools.product(
            ["float32", "int32"],
            [8000, 16000],
            [1, 2], )),
        name_func=name_func, )
    def test_wav(self, dtype, sample_rate, num_channels):
        """Run smoke test on wav format"""
        self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype)

    # The mp3 and vorbis smoke tests are disabled: not supported yet.

    @parameterized.expand(
        list(itertools.product(
            [8000, 16000],
            [1, 2],
            list(range(9)), )),
        name_func=name_func, )
    def test_flac(self, sample_rate, num_channels, compression_level):
        """Run smoke test on flac format"""
        self.run_smoke_test(
            "flac", sample_rate, num_channels, compression=compression_level)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
#test_func()
|
||||||
|
unittest.main()
|
@ -0,0 +1,364 @@
|
|||||||
|
#code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/sox_effect/sox_effect_test.py
|
||||||
|
import io
|
||||||
|
import itertools
|
||||||
|
import platform
|
||||||
|
import tarfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
import warnings
|
||||||
|
warnings.warn("sox io not support in Windows, please skip test.")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
from parameterized import parameterized
|
||||||
|
from paddleaudio import sox_effects
|
||||||
|
from common_utils import (get_sinusoid, get_wav_data, load_wav, save_wav,
|
||||||
|
sox_utils, TempDirMixin, load_effects_params)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSoxEffects(unittest.TestCase):
    """Basic checks for the `sox_effects` module."""

    def test_init(self):
        """Calling init_sox_effects multiple times does not crash"""
        repetitions = 3
        for _attempt in range(repetitions):
            sox_effects.init_sox_effects()
|
||||||
|
|
||||||
|
|
||||||
|
class TestSoxEffectsTensor(TempDirMixin, unittest.TestCase):
    """Test suite for `apply_effects_tensor` function"""

    @parameterized.expand(
        list(
            itertools.product(
                ["float32", "int32"],
                [8000, 16000],
                [1, 2, 4, 8],
                [True, False], )), )
    def test_apply_no_effect(self, dtype, sample_rate, num_channels,
                             channels_first):
        """`apply_effects_tensor` without effects should return identical data as input"""
        original = get_wav_data(
            dtype, num_channels, channels_first=channels_first)
        # Work on a copy so the untouched original can be compared afterwards.
        expected = original.clone()

        result = sox_effects.apply_effects_tensor(expected, sample_rate, [],
                                                  channels_first)
        found, output_sample_rate = result

        assert (output_sample_rate == sample_rate)
        # SoxEffect should not alter the input Tensor object
        np.testing.assert_array_almost_equal(original.numpy(), expected.numpy())

        # SoxEffect should not return the same Tensor object
        assert expected is not found
        # Returned Tensor should equal to the input Tensor
        np.testing.assert_array_almost_equal(expected.numpy(), found.numpy())

    @parameterized.expand(
        load_effects_params("sox_effect_test_args.jsonl"),
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects(self, args):
        """`apply_effects_tensor` should return identical data as sox command"""
        effects = args["effects"]
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        output_sr = args.get("output_sample_rate")

        input_path = self.get_temp_path("input.wav")
        reference_path = self.get_temp_path("reference.wav")

        original = get_sinusoid(
            frequency=800,
            sample_rate=input_sr,
            n_channels=num_channels,
            dtype="float32")
        save_wav(input_path, original, input_sr)
        # Reference output comes from running the same effects via sox_utils.
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_sample_rate=output_sr)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_tensor(original, input_sr,
                                                     effects)

        assert sr == expected_sr
        np.testing.assert_array_almost_equal(expected.numpy(), found.numpy())
|
||||||
|
|
||||||
|
|
||||||
|
class TestSoxEffectsFile(TempDirMixin, unittest.TestCase):
    """Checks for `sox_effects.apply_effects_file`.

    Every test writes a wav file into the temporary directory, runs
    `apply_effects_file` on it, and compares the result either with the
    data that was written or with a reference file produced by
    `sox_utils.run_sox_effect`.
    """

    def _assert_file_matches_sox(self, effects, num_channels, input_sr,
                                 output_sr, as_path_object):
        """Run one file-vs-sox comparison on int32, channels-first data.

        Args:
            effects: Effect chain handed to both `run_sox_effect` and
                `apply_effects_file`.
            num_channels: Channel count of the generated input.
            input_sr: Sample rate used when saving the input file.
            output_sr: Forwarded to `run_sox_effect` as
                ``output_sample_rate``.
            as_path_object: When true the input is passed to
                `apply_effects_file` as a `Path`, otherwise as the value
                returned by `get_temp_path`.
        """
        sample_dtype = "int32"
        ch_first = True

        src_wav = self.get_temp_path("input.wav")
        ref_wav = self.get_temp_path("reference.wav")
        samples = get_wav_data(
            sample_dtype, num_channels, channels_first=ch_first)
        save_wav(src_wav, samples, input_sr, channels_first=ch_first)
        sox_utils.run_sox_effect(
            src_wav, ref_wav, effects, output_sample_rate=output_sr)

        reference, reference_sr = load_wav(ref_wav)
        source = Path(src_wav) if as_path_object else src_wav
        actual, actual_sr = sox_effects.apply_effects_file(
            source, effects, normalize=False, channels_first=ch_first)

        assert actual_sr == reference_sr
        np.testing.assert_array_almost_equal(reference.numpy(),
                                             actual.numpy())

    @parameterized.expand(
        list(
            itertools.product(
                ["float32", "int32"],  # dtype
                [8000, 16000],  # sample_rate
                [1, 2, 4, 8],  # num_channels
                [False, True],  # channels_first
            )))
    def test_apply_no_effect(self, dtype, sample_rate, num_channels,
                             channels_first):
        """An empty effect chain must hand back the data that was saved."""
        wav_path = self.get_temp_path("input.wav")
        written = get_wav_data(
            dtype, num_channels, channels_first=channels_first)
        save_wav(wav_path, written, sample_rate, channels_first=channels_first)

        loaded, loaded_sr = sox_effects.apply_effects_file(
            wav_path, [], normalize=False, channels_first=channels_first)

        assert loaded_sr == sample_rate
        np.testing.assert_array_almost_equal(written.numpy(), loaded.numpy())

    @parameterized.expand(
        load_effects_params("sox_effect_test_args.jsonl"))
    def test_apply_effects_str(self, args):
        """Each effect set from the jsonl file must match the sox reference.

        ``args`` has the same keys and defaults as the tensor variant of
        this test: ``"effects"`` is required, ``"num_channels"`` defaults
        to 2, ``"input_sample_rate"`` to 8000 and ``"output_sample_rate"``
        to ``None``.
        """
        self._assert_file_matches_sox(
            effects=args["effects"],
            num_channels=args.get("num_channels", 2),
            input_sr=args.get("input_sample_rate", 8000),
            output_sr=args.get("output_sample_rate"),
            as_path_object=False)

    def test_apply_effects_path(self):
        """A `Path` input must give the same result as the sox reference."""
        self._assert_file_matches_sox(
            effects=[["hilbert"]],
            num_channels=2,
            input_sr=8000,
            output_sr=8000,
            as_path_object=True)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFileFormats(TempDirMixin, unittest.TestCase):
    """Compares `apply_effects_file` with the sox reference per file format."""

    @parameterized.expand(
        list(
            itertools.product(
                ["float32", "int32"],  # dtype
                [8000, 16000],  # sample_rate
                [1, 2],  # num_channels
            )))
    def test_wav(self, dtype, sample_rate, num_channels):
        """A ``band`` effect on a wav input must match the sox reference."""
        ch_first = True
        band_effect = [["band", "300", "10"]]

        src_wav = self.get_temp_path("input.wav")
        ref_wav = self.get_temp_path("reference.wav")
        samples = get_wav_data(dtype, num_channels, channels_first=ch_first)
        save_wav(src_wav, samples, sample_rate, channels_first=ch_first)
        # No output sample rate is passed here, unlike the jsonl-driven tests.
        sox_utils.run_sox_effect(src_wav, ref_wav, band_effect)

        reference, reference_sr = load_wav(ref_wav)
        actual, actual_sr = sox_effects.apply_effects_file(
            src_wav, band_effect, normalize=False, channels_first=ch_first)

        assert actual_sr == reference_sr
        np.testing.assert_array_almost_equal(actual.numpy(),
                                             reference.numpy())
|
||||||
|
|
||||||
|
# FLAC and Vorbis inputs are not supported yet, so the tests below stay disabled.
|
||||||
|
#@parameterized.expand(
|
||||||
|
#list(
|
||||||
|
#itertools.product(
|
||||||
|
#[8000, 16000],
|
||||||
|
#[1, 2],
|
||||||
|
#)
|
||||||
|
#),
|
||||||
|
##name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}',
|
||||||
|
#)
|
||||||
|
#def test_flac(self, sample_rate, num_channels):
|
||||||
|
#"""`apply_effects_file` works on various flac format"""
|
||||||
|
#channels_first = True
|
||||||
|
#effects = [["band", "300", "10"]]
|
||||||
|
|
||||||
|
#input_path = self.get_temp_path("input.flac")
|
||||||
|
#reference_path = self.get_temp_path("reference.wav")
|
||||||
|
#sox_utils.gen_audio_file(input_path, sample_rate, num_channels)
|
||||||
|
#sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32)
|
||||||
|
|
||||||
|
#expected, expected_sr = load_wav(reference_path)
|
||||||
|
#found, sr = sox_effects.apply_effects_file(input_path, effects, channels_first=channels_first)
|
||||||
|
#save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first)
|
||||||
|
|
||||||
|
#assert sr == expected_sr
|
||||||
|
##self.assertEqual(found, expected)
|
||||||
|
#np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())
|
||||||
|
|
||||||
|
#@parameterized.expand(
|
||||||
|
#list(
|
||||||
|
#itertools.product(
|
||||||
|
#[8000, 16000],
|
||||||
|
#[1, 2],
|
||||||
|
#)
|
||||||
|
#),
|
||||||
|
##name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}',
|
||||||
|
#)
|
||||||
|
#def test_vorbis(self, sample_rate, num_channels):
|
||||||
|
#"""`apply_effects_file` works on various vorbis format"""
|
||||||
|
#channels_first = True
|
||||||
|
#effects = [["band", "300", "10"]]
|
||||||
|
|
||||||
|
#input_path = self.get_temp_path("input.vorbis")
|
||||||
|
#reference_path = self.get_temp_path("reference.wav")
|
||||||
|
#sox_utils.gen_audio_file(input_path, sample_rate, num_channels)
|
||||||
|
#sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32)
|
||||||
|
|
||||||
|
#expected, expected_sr = load_wav(reference_path)
|
||||||
|
#found, sr = sox_effects.apply_effects_file(input_path, effects, channels_first=channels_first)
|
||||||
|
#save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first)
|
||||||
|
|
||||||
|
#assert sr == expected_sr
|
||||||
|
##self.assertEqual(found, expected)
|
||||||
|
#np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())
|
||||||
|
|
||||||
|
|
||||||
|
#@skipIfNoExec("sox")
|
||||||
|
#@skipIfNoSox
|
||||||
|
class TestFileObject(TempDirMixin, unittest.TestCase):
    """Checks that `apply_effects_file` accepts file-like objects.

    Each test saves a 2-channel int32 wav at 16 kHz, builds a reference with
    `sox_utils.run_sox_effect`, and then feeds the same audio to
    `apply_effects_file` through a different kind of file-like object.

    The ``ext`` / ``compression`` pair comes from `parameterized.expand`.
    Only ``("wav", None)`` is enabled, and ``compression`` is not used by
    any test body.
    """

    @parameterized.expand([
        ("wav", None),
    ])
    def test_fileobj(self, ext, compression):
        """Applying effects via file object works"""
        sample_rate = 16000
        channels_first = True
        effects = [["band", "300", "10"]]
        input_path = self.get_temp_path(f"input.{ext}")
        reference_path = self.get_temp_path("reference.wav")

        data = get_wav_data("int32", 2, channels_first=channels_first)
        save_wav(input_path, data, sample_rate, channels_first=channels_first)

        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)

        # The effect is applied while the binary file handle is still open.
        with open(input_path, "rb") as fileobj:
            found, sr = sox_effects.apply_effects_file(
                fileobj, effects, channels_first=channels_first)
        # The result is also written next to the reference in the temp dir.
        save_wav(
            self.get_temp_path("result.wav"),
            found,
            sr,
            channels_first=channels_first)
        assert sr == expected_sr
        np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())

    @parameterized.expand([
        ("wav", None),
    ])
    def test_bytesio(self, ext, compression):
        """Applying effects via BytesIO object works"""
        sample_rate = 16000
        channels_first = True
        effects = [["band", "300", "10"]]
        input_path = self.get_temp_path(f"input.{ext}")
        reference_path = self.get_temp_path("reference.wav")

        data = get_wav_data("int32", 2, channels_first=channels_first)
        save_wav(input_path, data, sample_rate, channels_first=channels_first)
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)

        # Read the whole file into memory; the effect runs on the in-memory
        # buffer after the on-disk handle has been closed.
        with open(input_path, "rb") as file_:
            fileobj = io.BytesIO(file_.read())
        found, sr = sox_effects.apply_effects_file(
            fileobj, effects, channels_first=channels_first)
        save_wav(
            self.get_temp_path("result.wav"),
            found,
            sr,
            channels_first=channels_first)
        assert sr == expected_sr
        # On mismatch the numpy assertion already reports the differing
        # values, so no tensors are printed here.
        np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())

    @parameterized.expand([
        ("wav", None),
    ])
    def test_tarfile(self, ext, compression):
        """Applying effects to an audio file read from a tar archive works"""
        sample_rate = 16000
        channels_first = True
        effects = [["band", "300", "10"]]
        audio_file = f"input.{ext}"

        input_path = self.get_temp_path(audio_file)
        reference_path = self.get_temp_path("reference.wav")
        # NOTE(review): despite the ".tar.gz" name the archive is written
        # with `tarfile.TarFile(..., "w")`, i.e. without gzip compression.
        archive_path = self.get_temp_path("archive.tar.gz")
        data = get_wav_data("int32", 2, channels_first=channels_first)
        save_wav(input_path, data, sample_rate, channels_first=channels_first)

        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_bitdepth=32)

        expected, expected_sr = load_wav(reference_path)

        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(input_path, arcname=audio_file)
        # The member must be consumed while the archive is still open.
        with tarfile.TarFile(archive_path, "r") as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            found, sr = sox_effects.apply_effects_file(
                fileobj, effects, channels_first=channels_first)
        save_wav(
            self.get_temp_path("result.wav"),
            found,
            sr,
            channels_first=channels_first)
        assert sr == expected_sr
        np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())
|
||||||
|
|
||||||
|
|
||||||
|
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|
@ -0,0 +1,77 @@
|
|||||||
|
{"effects": [["allpass", "300", "10"]]}
|
||||||
|
{"effects": [["band", "300", "10"]]}
|
||||||
|
{"effects": [["bandpass", "300", "10"]]}
|
||||||
|
{"effects": [["bandreject", "300", "10"]]}
|
||||||
|
{"effects": [["bass", "-10"]]}
|
||||||
|
{"effects": [["biquad", "0.4", "0.2", "0.9", "0.7", "0.2", "0.6"]]}
|
||||||
|
{"effects": [["chorus", "0.7", "0.9", "55", "0.4", "0.25", "2", "-t"]]}
|
||||||
|
{"effects": [["chorus", "0.6", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "1.3", "-s"]]}
|
||||||
|
{"effects": [["chorus", "0.5", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "2.3", "-t", "40", "0.3", "0.3", "1.3", "-s"]]}
|
||||||
|
{"effects": [["channels", "1"]]}
|
||||||
|
{"effects": [["channels", "2"]]}
|
||||||
|
{"effects": [["channels", "3"]]}
|
||||||
|
{"effects": [["compand", "0.3,1", "6:-70,-60,-20", "-5", "-90", "0.2"]]}
|
||||||
|
{"effects": [["compand", ".1,.2", "-inf,-50.1,-inf,-50,-50", "0", "-90", ".1"]]}
|
||||||
|
{"effects": [["compand", ".1,.1", "-45.1,-45,-inf,0,-inf", "45", "-90", ".1"]]}
|
||||||
|
{"effects": [["contrast", "0"]]}
|
||||||
|
{"effects": [["contrast", "25"]]}
|
||||||
|
{"effects": [["contrast", "50"]]}
|
||||||
|
{"effects": [["contrast", "75"]]}
|
||||||
|
{"effects": [["contrast", "100"]]}
|
||||||
|
{"effects": [["dcshift", "1.0"]]}
|
||||||
|
{"effects": [["dcshift", "-1.0"]]}
|
||||||
|
{"effects": [["deemph"]], "input_sample_rate": 44100}
|
||||||
|
{"effects": [["dither", "-s"]]}
|
||||||
|
{"effects": [["dither", "-S"]]}
|
||||||
|
{"effects": [["divide"]]}
|
||||||
|
{"effects": [["downsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 4000}
|
||||||
|
{"effects": [["earwax"]], "input_sample_rate": 44100}
|
||||||
|
{"effects": [["echo", "0.8", "0.88", "60", "0.4"]]}
|
||||||
|
{"effects": [["echo", "0.8", "0.88", "6", "0.4"]]}
|
||||||
|
{"effects": [["echo", "0.8", "0.9", "1000", "0.3"]]}
|
||||||
|
{"effects": [["echo", "0.8", "0.9", "1000", "0.3", "1800", "0.25"]]}
|
||||||
|
{"effects": [["echos", "0.8", "0.7", "700", "0.25", "700", "0.3"]]}
|
||||||
|
{"effects": [["echos", "0.8", "0.7", "700", "0.25", "900", "0.3"]]}
|
||||||
|
{"effects": [["echos", "0.8", "0.7", "40", "0.25", "63", "0.3"]]}
|
||||||
|
{"effects": [["equalizer", "300", "10", "5"]]}
|
||||||
|
{"effects": [["fade", "q", "3"]]}
|
||||||
|
{"effects": [["fade", "h", "3"]]}
|
||||||
|
{"effects": [["fade", "t", "3"]]}
|
||||||
|
{"effects": [["fade", "l", "3"]]}
|
||||||
|
{"effects": [["fade", "p", "3"]]}
|
||||||
|
{"effects": [["fir", "0.0195", "-0.082", "0.234", "0.891", "-0.145", "0.043"]]}
|
||||||
|
{"effects": [["flanger"]]}
|
||||||
|
{"effects": [["gain", "-l", "-6"]]}
|
||||||
|
{"effects": [["highpass", "-1", "300"]]}
|
||||||
|
{"effects": [["highpass", "-2", "300"]]}
|
||||||
|
{"effects": [["hilbert"]]}
|
||||||
|
{"effects": [["loudness"]]}
|
||||||
|
{"effects": [["lowpass", "-1", "300"]]}
|
||||||
|
{"effects": [["lowpass", "-2", "300"]]}
|
||||||
|
{"effects": [["mcompand", "0.005,0.1 -47,-40,-34,-34,-17,-33", "100", "0.003,0.05 -47,-40,-34,-34,-17,-33", "400", "0.000625,0.0125 -47,-40,-34,-34,-15,-33", "1600", "0.0001,0.025 -47,-40,-34,-34,-31,-31,-0,-30", "6400", "0,0.025 -38,-31,-28,-28,-0,-25"]], "input_sample_rate": 44100}
|
||||||
|
{"effects": [["oops"]]}
|
||||||
|
{"effects": [["overdrive"]]}
|
||||||
|
{"effects": [["pad"]]}
|
||||||
|
{"effects": [["phaser"]]}
|
||||||
|
{"effects": [["remix", "6", "7", "8", "0"]], "num_channels": 8}
|
||||||
|
{"effects": [["remix", "1-3,7", "3"]], "num_channels": 8}
|
||||||
|
{"effects": [["repeat"]]}
|
||||||
|
{"effects": [["reverb"]]}
|
||||||
|
{"effects": [["reverse"]]}
|
||||||
|
{"effects": [["riaa"]], "input_sample_rate": 44100}
|
||||||
|
{"effects": [["silence", "0"]]}
|
||||||
|
{"effects": [["speed", "1.3"]], "input_sample_rate": 4000, "output_sample_rate": 5200}
|
||||||
|
{"effects": [["speed", "0.7"]], "input_sample_rate": 4000, "output_sample_rate": 2800}
|
||||||
|
{"effects": [["stat"]]}
|
||||||
|
{"effects": [["stats"]]}
|
||||||
|
{"effects": [["stretch"]]}
|
||||||
|
{"effects": [["swap"]]}
|
||||||
|
{"effects": [["synth"]]}
|
||||||
|
{"effects": [["tempo", "0.9"]]}
|
||||||
|
{"effects": [["tempo", "1.1"]]}
|
||||||
|
{"effects": [["treble", "3"]]}
|
||||||
|
{"effects": [["tremolo", "300", "40"]]}
|
||||||
|
{"effects": [["tremolo", "300", "50"]]}
|
||||||
|
{"effects": [["trim", "0", "0.1"]]}
|
||||||
|
{"effects": [["upsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 16000}
|
||||||
|
{"effects": [["vol", "3"]]}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue