From 98300b86e5343117f0608491ab6fe69fcec2edf5 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Wed, 27 Jul 2022 18:18:40 +0800 Subject: [PATCH 01/11] add sox load_audio&&effets --- cmake/external/pybind.cmake | 4 +- paddlespeech/audio/_internal/module_utils.py | 2 +- paddlespeech/audio/backends/sox_io_backend.py | 4 +- paddlespeech/audio/src/CMakeLists.txt | 11 +- paddlespeech/audio/src/pybind/pybind.cpp | 10 + paddlespeech/audio/src/pybind/sox/effects.cpp | 121 +++++ paddlespeech/audio/src/pybind/sox/effects.h | 18 + .../audio/src/pybind/sox/effects_chain.cpp | 236 +++++++++ .../audio/src/pybind/sox/effects_chain.h | 25 + paddlespeech/audio/src/pybind/sox/io.cpp | 146 ++++++ paddlespeech/audio/src/pybind/sox/io.h | 26 +- paddlespeech/audio/src/pybind/sox/utils.cpp | 30 +- paddlespeech/audio/src/pybind/sox/utils.h | 29 +- paddlespeech/audio/src/sox/effects.cpp | 147 ++++++ paddlespeech/audio/src/sox/effects.h | 29 ++ paddlespeech/audio/src/sox/effects_chain.cpp | 342 ++++++++++++ paddlespeech/audio/src/sox/effects_chain.h | 62 +++ paddlespeech/audio/src/sox/io.cpp | 46 +- paddlespeech/audio/src/sox/io.h | 13 +- paddlespeech/audio/src/sox/types.cpp | 143 +++++ paddlespeech/audio/src/sox/types.h | 58 +++ paddlespeech/audio/src/sox/utils.cpp | 488 ++++++++++++++++++ paddlespeech/audio/src/sox/utils.h | 120 +++++ setup.py | 2 +- 24 files changed, 2039 insertions(+), 73 deletions(-) create mode 100644 paddlespeech/audio/src/pybind/sox/effects.cpp create mode 100644 paddlespeech/audio/src/pybind/sox/effects.h create mode 100644 paddlespeech/audio/src/pybind/sox/effects_chain.cpp create mode 100644 paddlespeech/audio/src/pybind/sox/effects_chain.h create mode 100644 paddlespeech/audio/src/sox/effects.cpp create mode 100644 paddlespeech/audio/src/sox/effects.h create mode 100644 paddlespeech/audio/src/sox/effects_chain.cpp create mode 100644 paddlespeech/audio/src/sox/effects_chain.h create mode 100644 paddlespeech/audio/src/sox/types.cpp create mode 100644 paddlespeech/audio/src/sox/types.h create mode 100644 paddlespeech/audio/src/sox/utils.cpp create mode 100644 paddlespeech/audio/src/sox/utils.h diff --git a/cmake/external/pybind.cmake b/cmake/external/pybind.cmake index 941918970..ec51c1e55 100644 --- a/cmake/external/pybind.cmake +++ b/cmake/external/pybind.cmake @@ -3,8 +3,8 @@ include(ExternalProject) FetchContent_Declare( pybind - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.0.zip - URL_HASH SHA256=1c6e0141f7092867c5bf388bc3acdb2689ed49f59c3977651394c6c87ae88232 + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.zip + URL_HASH SHA256=225df6e6dea7cea7c5754d4ed954e9ca7c43947b849b3795f87cb56437f1bd19 ) FetchContent_MakeAvailable(pybind) include_directories(${pybind_SOURCE_DIR}/include) diff --git a/paddlespeech/audio/_internal/module_utils.py b/paddlespeech/audio/_internal/module_utils.py index ca1ba4b84..d4a308fe7 100644 --- a/paddlespeech/audio/_internal/module_utils.py +++ b/paddlespeech/audio/_internal/module_utils.py @@ -145,4 +145,4 @@ def requires_sox(): return wrapped - return + return decorator diff --git a/paddlespeech/audio/backends/sox_io_backend.py b/paddlespeech/audio/backends/sox_io_backend.py index a91220042..b44ac30f8 100644 --- a/paddlespeech/audio/backends/sox_io_backend.py +++ b/paddlespeech/audio/backends/sox_io_backend.py @@ -29,7 +29,7 @@ def _fail_load( normalize: bool = True, channels_first: bool = True, format: Optional[str] = None, -) -> Tuple[paddle.Tensor, int]: +) -> Tuple[Tensor, int]: raise RuntimeError("Failed to load audio from 
{}".format(filepath)) @@ -41,6 +41,7 @@ _fallback_info_fileobj = _fail_info_fileobj _fallback_load = _fail_load _fallback_load_filebj = _fail_load_fileobj +@_mod_utils.requires_sox() def load( filepath: Union[str, Path], out: Optional[Tensor]=None, @@ -51,6 +52,7 @@ def load( filetype: Optional[str]=None, ) -> Tuple[Tensor, int]: raise RuntimeError("No audio I/O backend is available.") +@_mod_utils.requires_sox() def save(filepath: str, src: Tensor, sample_rate: int, diff --git a/paddlespeech/audio/src/CMakeLists.txt b/paddlespeech/audio/src/CMakeLists.txt index eea07f637..7448225ef 100644 --- a/paddlespeech/audio/src/CMakeLists.txt +++ b/paddlespeech/audio/src/CMakeLists.txt @@ -35,6 +35,11 @@ if(BUILD_SOX) list( APPEND LIBPADDLEAUDIO_SOURCES + sox/io.cpp + sox/utils.cpp + sox/effects.cpp + sox/effects_chain.cpp + sox/types.cpp ) list( APPEND @@ -139,8 +144,8 @@ if(BUILD_SOX) list( APPEND EXTENSION_SOURCES - # pybind/sox/effects.cpp - # pybind/sox/effects_chain.cpp + pybind/sox/effects.cpp + pybind/sox/effects_chain.cpp pybind/sox/io.cpp pybind/sox/utils.cpp ) @@ -192,4 +197,4 @@ define_extension( # "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}" # ) # endif() -endif() \ No newline at end of file +endif() diff --git a/paddlespeech/audio/src/pybind/pybind.cpp b/paddlespeech/audio/src/pybind/pybind.cpp index a8b3e5d63..791ac7879 100644 --- a/paddlespeech/audio/src/pybind/pybind.cpp +++ b/paddlespeech/audio/src/pybind/pybind.cpp @@ -3,6 +3,7 @@ #include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h" #include "paddlespeech/audio/src/pybind/sox/io.h" +#include "paddlespeech/audio/src/pybind/sox/effects.h" #include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h" PYBIND11_MODULE(_paddleaudio, m) { @@ -13,6 +14,15 @@ PYBIND11_MODULE(_paddleaudio, m) { m.def("get_info_fileobj", &paddleaudio::sox_io::get_info_fileobj, "Get metadata of audio in file object."); + m.def("load_audio_fileobj", + &paddleaudio::sox_io::load_audio_fileobj, + "Load audio from file object."); + m.def("save_audio_fileobj", + &paddleaudio::sox_io::save_audio_fileobj, + "Save audio to file obj."); + m.def("apply_effects_fileobj", + &paddleaudio::sox_effects::apply_effects_fileobj, + "Decode audio data from file-like obj and apply effects."); #endif #ifdef INCLUDE_KALDI diff --git a/paddlespeech/audio/src/pybind/sox/effects.cpp b/paddlespeech/audio/src/pybind/sox/effects.cpp new file mode 100644 index 000000000..96907a670 --- /dev/null +++ b/paddlespeech/audio/src/pybind/sox/effects.cpp @@ -0,0 +1,121 @@ +#include "paddlespeech/audio/src/pybind/sox/effects.h" +#include "paddlespeech/audio/src/pybind/sox/effects_chain.h" +#include "paddlespeech/audio/src/pybind/sox/utils.h" + +using namespace paddleaudio::sox_utils; + +namespace paddleaudio::sox_effects { + +// Streaming decoding over file-like object is tricky because libsox operates on +// FILE pointer. The folloing is what `sox` and `play` commands do +// - file input -> FILE pointer +// - URL input -> call wget in suprocess and pipe the data -> FILE pointer +// - stdin -> FILE pointer +// +// We want to, instead, fetch byte strings chunk by chunk, consume them, and +// discard. +// +// Here is the approach +// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial +// chunk of byte string +// This will perform header-based format detection, if necessary, then fill +// the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen, +// which returns FILE* which points the buffer of the provided byte string. +// 2. 
Each time sox reads a chunk from the FILE*, we update the underlying +// buffer in a way that it +// starts with unseen data, and append the new data read from the given +// fileobj. This will trick libsox as if it keeps reading from the FILE* +// continuously. +// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp +auto apply_effects_fileobj( + py::object fileobj, + const std::vector>& effects, + tl::optional normalize, + tl::optional channels_first, + tl::optional format) + -> tl::optional> { + // Prepare the buffer used throughout the lifecycle of SoxEffectChain. + // + // For certain format (such as FLAC), libsox keeps reading the content at + // the initialization unless it reaches EOF even when the header is properly + // parsed. (Making buffer size 8192, which is way bigger than the header, + // resulted in libsox consuming all the buffer content at the time it opens + // the file.) Therefore buffer has to always contain valid data, except after + // EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we + // first check if there is enough data to fill the buffer. `read_fileobj` + // repeatedly calls `read` method until it receives the requested length of + // bytes or it reaches EOF. If we get bytes shorter than requested, that means + // the whole audio data are fetched. + // + // * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`. + const auto capacity = [&]() { + // NOTE: + // Use the abstraction provided by `libpaddleaudio` to access the global + // config defined by libsox. Directly using `sox_get_globals` function will + // end up retrieving the static variable defined in `_paddleaudio`, which is + // not correct. + const auto bufsiz = get_buffer_size(); + const int64_t kDefaultCapacityInBytes = 256; + return (bufsiz > kDefaultCapacityInBytes) ? bufsiz + : kDefaultCapacityInBytes; + }(); + std::string buffer(capacity, '\0'); + auto* in_buf = const_cast(buffer.data()); + auto num_read = read_fileobj(&fileobj, capacity, in_buf); + // If the file is shorter than 256, then libsox cannot read the header. + auto in_buffer_size = (num_read > 256) ? num_read : 256; + + // Open file (this starts reading the header) + // When opening a file there are two functions that can touches FILE*. + // * `auto_detect_format` + // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43 + // * `startread` handler of detected format. + // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574 + // To see the handler of a particular format, go to + // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/.c + // For example, voribs can be found + // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158 + SoxFormat sf(sox_open_mem_read( + in_buf, + in_buffer_size, + /*signal=*/nullptr, + /*encoding=*/nullptr, + /*filetype=*/format.has_value() ? 
format.value().c_str() : nullptr)); + + // In case of streamed data, length can be 0 + if (static_cast(sf) == nullptr || + sf->encoding.encoding == SOX_ENCODING_UNKNOWN) { + return {}; + } + + // Prepare output buffer + std::vector out_buffer; + out_buffer.reserve(sf->signal.length); + + // Create and run SoxEffectsChain + const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision); + paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain( + /*input_encoding=*/sf->encoding, + /*output_encoding=*/get_tensor_encodinginfo(dtype)); + chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj); + for (const auto& effect : effects) { + chain.addEffect(effect); + } + chain.addOutputBuffer(&out_buffer); + chain.run(); + + // Create tensor from buffer + bool channels_first_ = channels_first.value_or(true); + auto tensor = convert_to_tensor( + /*buffer=*/out_buffer.data(), + /*num_samples=*/out_buffer.size(), + /*num_channels=*/chain.getOutputNumChannels(), + dtype, + normalize.value_or(true), + channels_first_); + + return std::forward_as_tuple( + tensor, static_cast(chain.getOutputSampleRate())); +} + +} // namespace paddleaudio::sox_effects diff --git a/paddlespeech/audio/src/pybind/sox/effects.h b/paddlespeech/audio/src/pybind/sox/effects.h new file mode 100644 index 000000000..5e67cb011 --- /dev/null +++ b/paddlespeech/audio/src/pybind/sox/effects.h @@ -0,0 +1,18 @@ +#include +#include + +#include "paddlespeech/audio/src/optional/optional.hpp" + +namespace py = pybind11; + +namespace paddleaudio::sox_effects { + +auto apply_effects_fileobj( + py::object fileobj, + const std::vector>& effects, + tl::optional normalize, + tl::optional channels_first, + tl::optional format) + -> tl::optional>; + +} // namespace paddleaudio::sox_effects diff --git a/paddlespeech/audio/src/pybind/sox/effects_chain.cpp b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp new file mode 100644 index 000000000..a106209d6 --- /dev/null +++ b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp @@ -0,0 +1,236 @@ +#include + +#include "paddlespeech/audio/src/pybind/sox/effects_chain.h" +#include "paddlespeech/audio/src/pybind/sox/utils.h" + +using namespace paddleaudio::sox_utils; + +namespace paddleaudio::sox_effects_chain { + +namespace { + +/// helper classes for passing file-like object to SoxEffectChain +struct FileObjInputPriv { + sox_format_t* sf; + py::object* fileobj; + bool eof_reached; + char* buffer; + uint64_t buffer_size; +}; + +struct FileObjOutputPriv { + sox_format_t* sf; + py::object* fileobj; + char** buffer; + size_t* buffer_size; +}; + +/// Callback function to feed byte string +/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278 +auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) + -> int { + auto priv = static_cast(effp->priv); + auto sf = priv->sf; + auto buffer = priv->buffer; + + // 1. Refresh the buffer + // + // NOTE: + // Since the underlying FILE* was opened with `fmemopen`, the only way + // libsox detect EOF is reaching the end of the buffer. (null byte won't + // help) Therefore we need to align the content at the end of buffer, + // otherwise, libsox will keep reading the content beyond intended length. 
+ // + // Before: + // + // |<-------consumed------>|<---remaining--->| + // |***********************|-----------------| + // ^ ftell + // + // After: + // + // |<-offset->|<---remaining--->|<-new data->| + // |**********|-----------------|++++++++++++| + // ^ ftell + + // NOTE: + // Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are + // supposed to be in sync, but there are cases (Vorbis) they are not + // in sync and `tell_off` has seemingly uninitialized value, which + // leads num_remain to be negative and cause segmentation fault + // in `memmove`. + const auto tell = ftell((FILE*)sf->fp); + if (tell < 0) { + throw std::runtime_error("Internal Error: ftell failed."); + } + const auto num_consumed = static_cast(tell); + if (num_consumed > priv->buffer_size) { + throw std::runtime_error("Internal Error: buffer overrun."); + } + + const auto num_remain = priv->buffer_size - num_consumed; + + // 1.1. Fetch the data to see if there is data to fill the buffer + size_t num_refill = 0; + std::string chunk(num_consumed, '\0'); + if (num_consumed && !priv->eof_reached) { + num_refill = read_fileobj( + priv->fileobj, num_consumed, const_cast(chunk.data())); + if (num_refill < num_consumed) { + priv->eof_reached = true; + } + } + const auto offset = num_consumed - num_refill; + + // 1.2. Move the unconsumed data towards the beginning of buffer. + if (num_remain) { + auto src = static_cast(buffer + num_consumed); + auto dst = static_cast(buffer + offset); + memmove(dst, src, num_remain); + } + + // 1.3. Refill the remaining buffer. + if (num_refill) { + auto src = static_cast(const_cast(chunk.c_str())); + auto dst = buffer + offset + num_remain; + memcpy(dst, src, num_refill); + } + + // 1.4. Set the file pointer to the new offset + sf->tell_off = offset; + fseek((FILE*)sf->fp, offset, SEEK_SET); + + // 2. Perform decoding operation + // The following part is practically same as "input" effect + // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48 + + // At this point, osamp represents the buffer size in bytes, + // but sox_read expects the maximum number of samples ready to read. + // Normally, this is fine, but in case when the samples are not 4-byte + // aligned, (e.g. sample is 24bits), the resulting signal is not correct. + // https://github.com/pytorch/audio/issues/2083 + if (sf->encoding.bits_per_sample > 0) + *osamp /= (sf->encoding.bits_per_sample / 8); + + // Ensure that it's a multiple of the number of channels + *osamp -= *osamp % effp->out_signal.channels; + + // Read up to *osamp samples into obuf; + // store the actual number read back to *osamp + *osamp = sox_read(sf, obuf, *osamp); + + // Decoding is finished when fileobject is exhausted and sox can no longer + // decode a sample. + return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS; +} + +auto fileobj_output_flow( + sox_effect_t* effp, + sox_sample_t const* ibuf, + sox_sample_t* obuf LSX_UNUSED, + size_t* isamp, + size_t* osamp) -> int { + *osamp = 0; + if (*isamp) { + auto priv = static_cast(effp->priv); + auto sf = priv->sf; + auto fp = static_cast(sf->fp); + auto fileobj = priv->fileobj; + auto buffer = priv->buffer; + + // Encode chunk + auto num_samples_written = sox_write(sf, ibuf, *isamp); + fflush(fp); + + // Copy the encoded chunk to python object. 
+ fileobj->attr("write")(py::bytes(*buffer, ftell(fp))); + + // Reset FILE* + sf->tell_off = 0; + fseek(fp, 0, SEEK_SET); + + if (num_samples_written != *isamp) { + if (sf->sox_errno) { + std::ostringstream stream; + stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " " + << sf->filename; + throw std::runtime_error(stream.str()); + } + return SOX_EOF; + } + } + return SOX_SUCCESS; +} + +auto get_fileobj_input_handler() -> sox_effect_handler_t* { + static sox_effect_handler_t handler{ + /*name=*/"input_fileobj_object", + /*usage=*/nullptr, + /*flags=*/SOX_EFF_MCHAN, + /*getopts=*/nullptr, + /*start=*/nullptr, + /*flow=*/nullptr, + /*drain=*/fileobj_input_drain, + /*stop=*/nullptr, + /*kill=*/nullptr, + /*priv_size=*/sizeof(FileObjInputPriv)}; + return &handler; +} + +auto get_fileobj_output_handler() -> sox_effect_handler_t* { + static sox_effect_handler_t handler{ + /*name=*/"output_fileobj_object", + /*usage=*/nullptr, + /*flags=*/SOX_EFF_MCHAN, + /*getopts=*/nullptr, + /*start=*/nullptr, + /*flow=*/fileobj_output_flow, + /*drain=*/nullptr, + /*stop=*/nullptr, + /*kill=*/nullptr, + /*priv_size=*/sizeof(FileObjOutputPriv)}; + return &handler; +} + +} // namespace + +void SoxEffectsChainPyBind::addInputFileObj( + sox_format_t* sf, + char* buffer, + uint64_t buffer_size, + py::object* fileobj) { + in_sig_ = sf->signal; + interm_sig_ = in_sig_; + + SoxEffect e(sox_create_effect(get_fileobj_input_handler())); + auto priv = static_cast(e->priv); + priv->sf = sf; + priv->fileobj = fileobj; + priv->eof_reached = false; + priv->buffer = buffer; + priv->buffer_size = buffer_size; + if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { + throw std::runtime_error( + "Internal Error: Failed to add effect: input fileobj"); + } +} + +void SoxEffectsChainPyBind::addOutputFileObj( + sox_format_t* sf, + char** buffer, + size_t* buffer_size, + py::object* fileobj) { + out_sig_ = sf->signal; + SoxEffect e(sox_create_effect(get_fileobj_output_handler())); + auto priv = static_cast(e->priv); + priv->sf = sf; + priv->fileobj = fileobj; + priv->buffer = buffer; + priv->buffer_size = buffer_size; + if (sox_add_effect(sec_, e, &interm_sig_, &out_sig_) != SOX_SUCCESS) { + throw std::runtime_error( + "Internal Error: Failed to add effect: output fileobj"); + } +} + +} // namespace paddleaudio::sox_effects_chain diff --git a/paddlespeech/audio/src/pybind/sox/effects_chain.h b/paddlespeech/audio/src/pybind/sox/effects_chain.h new file mode 100644 index 000000000..3de0161e3 --- /dev/null +++ b/paddlespeech/audio/src/pybind/sox/effects_chain.h @@ -0,0 +1,25 @@ +#pragma once + +#include "paddlespeech/audio/src/sox/effects_chain.h" + +namespace paddleaudio::sox_effects_chain { + +class SoxEffectsChainPyBind : public SoxEffectsChain { + using SoxEffectsChain::SoxEffectsChain; + + public: + void addInputFileObj( + sox_format_t* sf, + char* buffer, + uint64_t buffer_size, + py::object* fileobj); + + void addOutputFileObj( + sox_format_t* sf, + char** buffer, + size_t* buffer_size, + py::object* fileobj); +}; + +} // namespace paddleaudio::sox_effects_chain + diff --git a/paddlespeech/audio/src/pybind/sox/io.cpp b/paddlespeech/audio/src/pybind/sox/io.cpp index d5bd8fd65..6e3230f27 100644 --- a/paddlespeech/audio/src/pybind/sox/io.cpp +++ b/paddlespeech/audio/src/pybind/sox/io.cpp @@ -2,7 +2,14 @@ // All rights reserved. 
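Note: the fileobj handlers above (fileobj_input_drain / fileobj_output_flow) only ever call `read(num_bytes)` on the input object, expecting `bytes` back, and `write(bytes)` on the output object, so any Python object satisfying that minimal protocol can be passed through the new bindings. A rough usage sketch follows; the direct calls are shown commented out because the exact Python import path for the compiled `_paddleaudio` module and the surrounding wrapper layer are assumptions, not part of this patch:

    import io

    # Any object with read(n) -> bytes works as input; BytesIO is the simplest case.
    with open("example.wav", "rb") as f:
        fileobj = io.BytesIO(f.read())

    # Hypothetical direct calls into the new bindings; argument order follows
    # pybind/sox/io.h: (fileobj, frame_offset, num_frames, normalize,
    # channels_first, format).
    # wav, sr = _paddleaudio.load_audio_fileobj(fileobj, 0, -1, True, True, "wav")

    # Saving works the same way in reverse: the output object only needs write(bytes).
    # out = io.BytesIO()
    # _paddleaudio.save_audio_fileobj(out, wav, sr, True, None, "wav", None, None)
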
#include "paddlespeech/audio/src/pybind/sox/io.h" +#include "paddlespeech/audio/src/pybind/sox/effects.h" +#include "paddlespeech/audio/src/pybind/sox/effects_chain.h" #include "paddlespeech/audio/src/pybind/sox/utils.h" +#include "paddlespeech/audio/src/optional/optional.hpp" + +#include "paddlespeech/audio/src/sox/io.h" +#include "paddlespeech/audio/src/sox/types.h" +#include "paddlespeech/audio/src/sox/utils.h" using namespace paddleaudio::sox_utils; @@ -28,6 +35,35 @@ auto get_info_file(const std::string &path, const std::string &format) get_encoding(sf->encoding.encoding)); } +std::vector> get_effects( + const tl::optional& frame_offset, + const tl::optional& num_frames) { + const auto offset = frame_offset.value_or(0); + if (offset < 0) { + throw std::runtime_error( + "Invalid argument: frame_offset must be non-negative."); + } + const auto frames = num_frames.value_or(-1); + if (frames == 0 || frames < -1) { + throw std::runtime_error( + "Invalid argument: num_frames must be -1 or greater than 0."); + } + + std::vector> effects; + if (frames != -1) { + std::ostringstream os_offset, os_frames; + os_offset << offset << "s"; + os_frames << "+" << frames << "s"; + effects.emplace_back( + std::vector{"trim", os_offset.str(), os_frames.str()}); + } else if (offset != 0) { + std::ostringstream os_offset; + os_offset << offset << "s"; + effects.emplace_back(std::vector{"trim", os_offset.str()}); + } + return effects; +} + auto get_info_fileobj(py::object fileobj, const std::string &format) -> std::tuple { const auto capacity = [&]() { @@ -60,5 +96,115 @@ auto get_info_fileobj(py::object fileobj, const std::string &format) get_encoding(sf->encoding.encoding)); } +tl::optional> load_audio_fileobj( + py::object fileobj, + const tl::optional& frame_offset, + const tl::optional& num_frames, + tl::optional normalize, + tl::optional channels_first, + const tl::optional& format) { + auto effects = get_effects(frame_offset, num_frames); + return paddleaudio::sox_effects::apply_effects_fileobj( + std::move(fileobj), effects, normalize, channels_first, std::move(format)); +} + +namespace { +// helper class to automatically release buffer, to be used by +// save_audio_fileobj +struct AutoReleaseBuffer { + char* ptr; + size_t size; + + AutoReleaseBuffer() : ptr(nullptr), size(0) {} + AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete; + AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete; + auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete; + auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete; + ~AutoReleaseBuffer() { + if (ptr) { + free(ptr); + } + } +}; + +} // namespace + +void save_audio_fileobj( + py::object fileobj, + py::array tensor, + int64_t sample_rate, + bool channels_first, + tl::optional compression, + tl::optional format, + tl::optional encoding, + tl::optional bits_per_sample) { + + if (!format.has_value()) { + throw std::runtime_error( + "`format` is required when saving to file object."); + } + const auto filetype = format.value(); + + if (filetype == "amr-nb") { + const auto num_channels = tensor.shape(channels_first ? 0 : 1); + if (num_channels != 1) { + throw std::runtime_error( + "amr-nb format only supports single channel audio."); + } + } else if (filetype == "htk") { + const auto num_channels = tensor.shape(channels_first ? 
0 : 1); + if (num_channels != 1) { + throw std::runtime_error( + "htk format only supports single channel audio."); + } + } else if (filetype == "gsm") { + const auto num_channels = tensor.shape(channels_first ? 0 : 1); + if (num_channels != 1) { + throw std::runtime_error( + "gsm format only supports single channel audio."); + } + if (sample_rate != 8000) { + throw std::runtime_error( + "gsm format only supports a sampling rate of 8kHz."); + } + } + + const auto signal_info = + get_signalinfo(&tensor, sample_rate, filetype, channels_first); + const auto encoding_info = get_encodinginfo_for_save( + filetype, + tensor.dtype(), + compression, + std::move(encoding), + bits_per_sample); + + AutoReleaseBuffer buffer; + + SoxFormat sf(sox_open_memstream_write( + &buffer.ptr, + &buffer.size, + &signal_info, + &encoding_info, + filetype.c_str(), + /*oob=*/nullptr)); + + if (static_cast(sf) == nullptr) { + throw std::runtime_error( + "Error saving audio file: failed to open memory stream."); + } + + paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain( + /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()), + /*output_encoding=*/sf->encoding); + chain.addInputTensor(&tensor, sample_rate, channels_first); + chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj); + chain.run(); + + // Closing the sox_format_t is necessary for flushing the last chunk to the + // buffer + sf.close(); + fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size)); +} + } // namespace paddleaudio } // namespace sox_io diff --git a/paddlespeech/audio/src/pybind/sox/io.h b/paddlespeech/audio/src/pybind/sox/io.h index 13381c68d..ca03b5db3 100644 --- a/paddlespeech/audio/src/pybind/sox/io.h +++ b/paddlespeech/audio/src/pybind/sox/io.h @@ -1,11 +1,12 @@ // Copyright (c) 2017 Facebook Inc. (Soumith Chintala), // All rights reserved. -#ifndef PADDLEAUDIO_PYBIND_SOX_IO_H -#define PADDLEAUDIO_PYBIND_SOX_IO_H +#pragma once #include "paddlespeech/audio/src/pybind/sox/utils.h" +namespace py = pybind11; + namespace paddleaudio { namespace sox_io { @@ -15,7 +16,24 @@ auto get_info_file(const std::string &path, const std::string &format) auto get_info_fileobj(py::object fileobj, const std::string &format) -> std::tuple; +auto load_audio_fileobj( + py::object fileobj, + tl::optional frame_offset, + tl::optional num_frames, + tl::optional normalize, + tl::optional channels_first, + tl::optional format) + -> tl::optional>; + +void save_audio_fileobj( + py::object fileobj, + py::array tensor, + int64_t sample_rate, + bool channels_first, + tl::optional compression, + tl::optional format, + tl::optional encoding, + tl::optional bits_per_sample); + } // namespace paddleaudio } // namespace sox_io - -#endif diff --git a/paddlespeech/audio/src/pybind/sox/utils.cpp b/paddlespeech/audio/src/pybind/sox/utils.cpp index 53a3cbe41..24a2817d2 100644 --- a/paddlespeech/audio/src/pybind/sox/utils.cpp +++ b/paddlespeech/audio/src/pybind/sox/utils.cpp @@ -8,6 +8,34 @@ namespace paddleaudio { namespace sox_utils { +auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer) + -> uint64_t { + uint64_t num_read = 0; + while (num_read < size) { + auto request = size - num_read; + auto chunk = static_cast( + static_cast(fileobj->attr("read")(request))); + auto chunk_len = chunk.length(); + if (chunk_len == 0) { + break; + } + if (chunk_len > request) { + std::ostringstream message; + message + << "Requested up to " << request << " bytes but, " + << "received " << chunk_len << " bytes. 
" + << "The given object does not confirm to read protocol of file " + "object."; + throw std::runtime_error(message.str()); + } + memcpy(buffer, chunk.data(), chunk_len); + buffer += chunk_len; + num_read += chunk_len; + } + return num_read; +} + +/* SoxFormat::SoxFormat(sox_format_t *fd) noexcept : fd_(fd) {} SoxFormat::~SoxFormat() { close(); } @@ -96,6 +124,6 @@ std::string get_encoding(sox_encoding_t encoding) { return "UNKNOWN"; } } - +*/ } // namespace paddleaudio } // namespace sox_utils diff --git a/paddlespeech/audio/src/pybind/sox/utils.h b/paddlespeech/audio/src/pybind/sox/utils.h index b294b8083..fa931b1a9 100644 --- a/paddlespeech/audio/src/pybind/sox/utils.h +++ b/paddlespeech/audio/src/pybind/sox/utils.h @@ -4,39 +4,18 @@ #pragma once #include +#include #include +#include "paddlespeech/audio/src/optional/optional.hpp" +#include "paddlespeech/audio/src/sox/utils.h" +#include "paddlespeech/audio/src/sox/types.h" namespace py = pybind11; namespace paddleaudio { namespace sox_utils { -/// helper class to automatically close sox_format_t* -struct SoxFormat { - explicit SoxFormat(sox_format_t *fd) noexcept; - SoxFormat(const SoxFormat &other) = delete; - SoxFormat(SoxFormat &&other) = delete; - SoxFormat &operator=(const SoxFormat &other) = delete; - SoxFormat &operator=(SoxFormat &&other) = delete; - ~SoxFormat(); - sox_format_t *operator->() const noexcept; - operator sox_format_t *() const noexcept; - - void close(); - - private: - sox_format_t *fd_; -}; - auto read_fileobj(py::object *fileobj, uint64_t size, char *buffer) -> uint64_t; -int64_t get_buffer_size(); - -void validate_input_file(const SoxFormat &sf, const std::string &path); - -void validate_input_memfile(const SoxFormat &sf); - -std::string get_encoding(sox_encoding_t encoding); - } // namespace paddleaudio } // namespace sox_utils diff --git a/paddlespeech/audio/src/sox/effects.cpp b/paddlespeech/audio/src/sox/effects.cpp new file mode 100644 index 000000000..f2687f93f --- /dev/null +++ b/paddlespeech/audio/src/sox/effects.cpp @@ -0,0 +1,147 @@ +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp + +#include +#include + +#include "paddlespeech/audio/src/sox/effects.h" +#include "paddlespeech/audio/src/sox/effects_chain.h" +#include "paddlespeech/audio/src/sox/utils.h" + +using namespace paddleaudio::sox_utils; + +namespace paddleaudio::sox_effects { + +namespace { + +enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown }; +SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized; +std::mutex SOX_RESOUCE_STATE_MUTEX; + +} // namespace + +void initialize_sox_effects() { + const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); + + switch (SOX_RESOURCE_STATE) { + case NotInitialized: + if (sox_init() != SOX_SUCCESS) { + throw std::runtime_error("Failed to initialize sox effects."); + }; + SOX_RESOURCE_STATE = Initialized; + break; + case Initialized: + break; + case ShutDown: + throw std::runtime_error( + "SoX Effects has been shut down. Cannot initialize again."); + } +}; + +void shutdown_sox_effects() { + const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); + + switch (SOX_RESOURCE_STATE) { + case NotInitialized: + throw std::runtime_error( + "SoX Effects is not initialized. 
Cannot shutdown."); + case Initialized: + if (sox_quit() != SOX_SUCCESS) { + throw std::runtime_error("Failed to initialize sox effects."); + }; + SOX_RESOURCE_STATE = ShutDown; + break; + case ShutDown: + break; + } +} + +auto apply_effects_tensor( + py::array waveform, + int64_t sample_rate, + const std::vector>& effects, + bool channels_first) -> std::tuple { + validate_input_tensor(waveform); + + // Create SoxEffectsChain + const auto dtype = waveform.dtype(); + paddleaudio::sox_effects_chain::SoxEffectsChain chain( + /*input_encoding=*/get_tensor_encodinginfo(dtype), + /*output_encoding=*/get_tensor_encodinginfo(dtype)); + + // Prepare output buffer + std::vector out_buffer; + out_buffer.reserve(waveform.size()); + + // Build and run effects chain + chain.addInputTensor(&waveform, sample_rate, channels_first); + for (const auto& effect : effects) { + chain.addEffect(effect); + } + chain.addOutputBuffer(&out_buffer); + chain.run(); + + // Create tensor from buffer + auto out_tensor = convert_to_tensor( + /*buffer=*/out_buffer.data(), + /*num_samples=*/out_buffer.size(), + /*num_channels=*/chain.getOutputNumChannels(), + dtype, + /*normalize=*/false, + channels_first); + + return std::tuple( + out_tensor, chain.getOutputSampleRate()); +} + +auto apply_effects_file( + const std::string& path, + const std::vector>& effects, + tl::optional normalize, + tl::optional channels_first, + const tl::optional& format) + -> tl::optional> { + // Open input file + SoxFormat sf(sox_open_read( + path.c_str(), + /*signal=*/nullptr, + /*encoding=*/nullptr, + /*filetype=*/format.has_value() ? format.value().c_str() : nullptr)); + + if (static_cast(sf) == nullptr || + sf->encoding.encoding == SOX_ENCODING_UNKNOWN) { + return {}; + } + + const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision); + + // Prepare output + std::vector out_buffer; + out_buffer.reserve(sf->signal.length); + + // Create and run SoxEffectsChain + paddleaudio::sox_effects_chain::SoxEffectsChain chain( + /*input_encoding=*/sf->encoding, + /*output_encoding=*/get_tensor_encodinginfo(dtype)); + + chain.addInputFile(sf); + for (const auto& effect : effects) { + chain.addEffect(effect); + } + chain.addOutputBuffer(&out_buffer); + chain.run(); + + // Create tensor from buffer + bool channels_first_ = channels_first.value_or(true); + auto tensor = convert_to_tensor( + /*buffer=*/out_buffer.data(), + /*num_samples=*/out_buffer.size(), + /*num_channels=*/chain.getOutputNumChannels(), + dtype, + normalize.value_or(true), + channels_first_); + + return std::tuple( + tensor, chain.getOutputSampleRate()); +} + +} // namespace paddleaudio::sox_effects diff --git a/paddlespeech/audio/src/sox/effects.h b/paddlespeech/audio/src/sox/effects.h new file mode 100644 index 000000000..81db23b44 --- /dev/null +++ b/paddlespeech/audio/src/sox/effects.h @@ -0,0 +1,29 @@ +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h +#pragma once + +#include +#include "paddlespeech/audio/src/sox/utils.h" + +namespace py = pybind11; + +namespace paddleaudio::sox_effects { + +void initialize_sox_effects(); + +void shutdown_sox_effects(); + +auto apply_effects_tensor( + py::array waveform, + int64_t sample_rate, + const std::vector>& effects, + bool channels_first) -> std::tuple; + +auto apply_effects_file( + const std::string& path, + const std::vector>& effects, + tl::optional normalize, + tl::optional channels_first, + const tl::optional& format) + -> tl::optional>; + +} // namespace torchaudio::sox_effects diff 
--git a/paddlespeech/audio/src/sox/effects_chain.cpp b/paddlespeech/audio/src/sox/effects_chain.cpp new file mode 100644 index 000000000..1b13fd186 --- /dev/null +++ b/paddlespeech/audio/src/sox/effects_chain.cpp @@ -0,0 +1,342 @@ +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp + +#include "paddlespeech/audio/src/sox/effects_chain.h" +#include "paddlespeech/audio/src/sox/utils.h" + +using namespace paddleaudio::sox_utils; + +namespace paddleaudio { +namespace sox_effects_chain { + +namespace { + +/// helper classes for passing the location of input tensor and output buffer +/// +/// drain/flow callback functions require plaing C style function signature and +/// the way to pass extra data is to attach data to sox_effect_t::priv pointer. +/// The following structs will be assigned to sox_effect_t::priv pointer which +/// gives sox_effect_t an access to input Tensor and output buffer object. +struct TensorInputPriv { + size_t index; + py::array* waveform; + int64_t sample_rate; + bool channels_first; +}; + +struct TensorOutputPriv { + std::vector* buffer; +}; +struct FileOutputPriv { + sox_format_t* sf; +}; + +/// Callback function to feed Tensor data to SoxEffectChain. +int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { + // Retrieve the input Tensor and current index + auto priv = static_cast(effp->priv); + auto index = priv->index; + auto tensor = *(priv->waveform); + auto num_channels = effp->out_signal.channels; + + // Adjust the number of samples to read + const size_t num_samples = tensor.size(); + if (index + *osamp > num_samples) { + *osamp = num_samples - index; + } + // Ensure that it's a multiple of the number of channels + *osamp -= *osamp % num_channels; + + // Slice the input Tensor + // refacor this module, chunk + auto i_frame = index / num_channels; + auto num_frames = *osamp / num_channels; + py::array chunk(tensor.dtype(), {num_frames*num_channels}); + py::buffer_info ori_info = tensor.request(); + py::buffer_info info = chunk.request(); + char* ori_start_ptr = (char*)ori_info.ptr + index * chunk.itemsize() / sizeof(char); + std::memcpy(info.ptr, ori_start_ptr, chunk.nbytes()); + + py::dtype chunk_type = py::dtype("i"); // dtype int32 + py::array new_chunk = py::array(chunk_type, chunk.shape()); + py::buffer_info new_info = new_chunk.request(); + void* ptr = (void*) info.ptr; + int* new_ptr = (int*) new_info.ptr; + // Convert to sox_sample_t (int32_t) + switch (chunk.dtype().num()) { + //case c10::ScalarType::Float: { + case 11: { + // Need to convert to 64-bit precision so that + // values around INT32_MIN/MAX are handled correctly. 
+ float* ptr_f = (float*)ptr; + for (int idx = 0; idx < chunk.size(); ++idx) { + double elem = *ptr_f * 2147483648.; + // *new_ptr = std::clamp(elem, INT32_MIN, INT32_MAX); + if (elem > INT32_MAX) { + *new_ptr = INT32_MAX; + } else if (elem < INT32_MIN) { + *new_ptr = INT32_MIN; + } else { *new_ptr = elem; } + } + break; + } + //case c10::ScalarType::Int: { + case 5: { + break; + } + // case short + case 3: { + int16_t* ptr_s = (int16_t*) ptr; + for (int idx = 0; idx < chunk.size(); ++idx) { + *new_ptr = *ptr_s * 65536; + } + break; + } + // case byte + case 1: { + int8_t* ptr_b = (int8_t*) ptr; + for (int idx = 0; idx < chunk.size(); ++idx) { + *new_ptr = (*ptr_b - 128) * 16777216; + } + break; + } + default: + throw std::runtime_error("Unexpected dtype."); + } + // Write to buffer + memcpy(obuf, (int*)new_info.ptr, *osamp * 4); + priv->index += *osamp; + return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS; +} + +/// Callback function to fetch data from SoxEffectChain. +int tensor_output_flow( + sox_effect_t* effp, + sox_sample_t const* ibuf, + sox_sample_t* obuf LSX_UNUSED, + size_t* isamp, + size_t* osamp) { + *osamp = 0; + // Get output buffer + auto out_buffer = static_cast(effp->priv)->buffer; + // Append at the end + out_buffer->insert(out_buffer->end(), ibuf, ibuf + *isamp); + return SOX_SUCCESS; +} + +int file_output_flow( + sox_effect_t* effp, + sox_sample_t const* ibuf, + sox_sample_t* obuf LSX_UNUSED, + size_t* isamp, + size_t* osamp) { + *osamp = 0; + if (*isamp) { + auto sf = static_cast(effp->priv)->sf; + if (sox_write(sf, ibuf, *isamp) != *isamp) { + if (sf->sox_errno) { + std::ostringstream stream; + stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " " + << sf->filename; + throw std::runtime_error(stream.str()); + } + return SOX_EOF; + } + } + return SOX_SUCCESS; +} + +sox_effect_handler_t* get_tensor_input_handler() { + static sox_effect_handler_t handler{ + /*name=*/"input_tensor", + /*usage=*/NULL, + /*flags=*/SOX_EFF_MCHAN, + /*getopts=*/NULL, + /*start=*/NULL, + /*flow=*/NULL, + /*drain=*/tensor_input_drain, + /*stop=*/NULL, + /*kill=*/NULL, + /*priv_size=*/sizeof(TensorInputPriv)}; + return &handler; +} + +sox_effect_handler_t* get_tensor_output_handler() { + static sox_effect_handler_t handler{ + /*name=*/"output_tensor", + /*usage=*/NULL, + /*flags=*/SOX_EFF_MCHAN, + /*getopts=*/NULL, + /*start=*/NULL, + /*flow=*/tensor_output_flow, + /*drain=*/NULL, + /*stop=*/NULL, + /*kill=*/NULL, + /*priv_size=*/sizeof(TensorOutputPriv)}; + return &handler; +} + +sox_effect_handler_t* get_file_output_handler() { + static sox_effect_handler_t handler{ + /*name=*/"output_file", + /*usage=*/NULL, + /*flags=*/SOX_EFF_MCHAN, + /*getopts=*/NULL, + /*start=*/NULL, + /*flow=*/file_output_flow, + /*drain=*/NULL, + /*stop=*/NULL, + /*kill=*/NULL, + /*priv_size=*/sizeof(FileOutputPriv)}; + return &handler; +} + +} // namespace + +SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {} + +SoxEffect::~SoxEffect() { + if (se_ != nullptr) { + free(se_); + } +} + +SoxEffect::operator sox_effect_t*() const { + return se_; +} + +auto SoxEffect::operator->() noexcept -> sox_effect_t* { + return se_; +} + +SoxEffectsChain::SoxEffectsChain( + sox_encodinginfo_t input_encoding, + sox_encodinginfo_t output_encoding) + : in_enc_(input_encoding), + out_enc_(output_encoding), + in_sig_(), + interm_sig_(), + out_sig_(), + sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) { + if (!sec_) { + throw std::runtime_error("Failed to create effect chain."); + } +} + 
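For reference, the dtype dispatch in tensor_input_drain above amounts to a fixed scaling of each supported dtype into 32-bit sox samples. Below is a small NumPy restatement of that arithmetic (float32 scaled by 2^31 and clamped, int16 shifted up by 16 bits, 8-bit data centred on 128 and shifted up by 24 bits). It is only an illustration of the conversion rules, not code used by the effects chain, and reading the 8-bit branch as uint8 is an assumption based on validate_input_tensor:

    import numpy as np

    def to_sox_samples(x: np.ndarray) -> np.ndarray:
        # Illustrative mirror of the dtype handling in tensor_input_drain.
        if x.dtype == np.float32:
            # Scale [-1.0, 1.0] into the int32 range and clamp, as the float branch does.
            y = np.clip(x.astype(np.float64) * 2147483648.0, -2147483648, 2147483647)
            return y.astype(np.int32)
        if x.dtype == np.int32:
            return x  # already in sox_sample_t range; the int branch is a no-op
        if x.dtype == np.int16:
            return x.astype(np.int32) * 65536
        if x.dtype == np.uint8:
            return (x.astype(np.int32) - 128) * 16777216
        raise ValueError("Unexpected dtype.")
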
+SoxEffectsChain::~SoxEffectsChain() { + if (sec_ != nullptr) { + sox_delete_effects_chain(sec_); + } +} + +void SoxEffectsChain::run() { + sox_flow_effects(sec_, NULL, NULL); +} + +void SoxEffectsChain::addInputTensor( + py::array* waveform, + int64_t sample_rate, + bool channels_first) { + in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first); + interm_sig_ = in_sig_; + SoxEffect e(sox_create_effect(get_tensor_input_handler())); + auto priv = static_cast(e->priv); + priv->index = 0; + priv->waveform = waveform; + priv->sample_rate = sample_rate; + priv->channels_first = channels_first; + if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { + throw std::runtime_error( + "Internal Error: Failed to add effect: input_tensor"); + } +} + +void SoxEffectsChain::addOutputBuffer( + std::vector* output_buffer) { + SoxEffect e(sox_create_effect(get_tensor_output_handler())); + static_cast(e->priv)->buffer = output_buffer; + if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { + throw std::runtime_error( + "Internal Error: Failed to add effect: output_tensor"); + } +} + +void SoxEffectsChain::addInputFile(sox_format_t* sf) { + in_sig_ = sf->signal; + interm_sig_ = in_sig_; + SoxEffect e(sox_create_effect(sox_find_effect("input"))); + char* opts[] = {(char*)sf}; + sox_effect_options(e, 1, opts); + if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { + std::ostringstream stream; + stream << "Internal Error: Failed to add effect: input " << sf->filename; + throw std::runtime_error(stream.str()); + } +} + +void SoxEffectsChain::addOutputFile(sox_format_t* sf) { + out_sig_ = sf->signal; + SoxEffect e(sox_create_effect(get_file_output_handler())); + static_cast(e->priv)->sf = sf; + if (sox_add_effect(sec_, e, &interm_sig_, &out_sig_) != SOX_SUCCESS) { + std::ostringstream stream; + stream << "Internal Error: Failed to add effect: output " << sf->filename; + throw std::runtime_error(stream.str()); + } +} + +void SoxEffectsChain::addEffect(const std::vector effect) { + const auto num_args = effect.size(); + if (num_args == 0) { + throw std::runtime_error("Invalid argument: empty effect."); + } + const auto name = effect[0]; + if (UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end()) { + std::ostringstream stream; + stream << "Unsupported effect: " << name; + throw std::runtime_error(stream.str()); + } + + auto returned_effect = sox_find_effect(name.c_str()); + if (!returned_effect) { + std::ostringstream stream; + stream << "Unsupported effect: " << name; + throw std::runtime_error(stream.str()); + } + SoxEffect e(sox_create_effect(returned_effect)); + const auto num_options = num_args - 1; + + std::vector opts; + for (size_t i = 1; i < num_args; ++i) { + opts.push_back((char*)effect[i].c_str()); + } + if (sox_effect_options(e, num_options, num_options ? 
opts.data() : nullptr) != + SOX_SUCCESS) { + std::ostringstream stream; + stream << "Invalid effect option:"; + for (const auto& v : effect) { + stream << " " << v; + } + throw std::runtime_error(stream.str()); + } + + if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { + std::ostringstream stream; + stream << "Internal Error: Failed to add effect: \"" << name; + for (size_t i = 1; i < num_args; ++i) { + stream << " " << effect[i]; + } + stream << "\""; + throw std::runtime_error(stream.str()); + } +} + +int64_t SoxEffectsChain::getOutputNumChannels() { + return interm_sig_.channels; +} + +int64_t SoxEffectsChain::getOutputSampleRate() { + return interm_sig_.rate; +} + +} // namespace sox_effects_chain +} // namespace paddleaudio diff --git a/paddlespeech/audio/src/sox/effects_chain.h b/paddlespeech/audio/src/sox/effects_chain.h new file mode 100644 index 000000000..87a046975 --- /dev/null +++ b/paddlespeech/audio/src/sox/effects_chain.h @@ -0,0 +1,62 @@ +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h +#pragma once + +#include +#include "paddlespeech/audio/src/sox/utils.h" + +namespace paddleaudio { +namespace sox_effects_chain { + +// Helper struct to safely close sox_effect_t* pointer returned by +// sox_create_effect + +struct SoxEffect { + explicit SoxEffect(sox_effect_t* se) noexcept; + SoxEffect(const SoxEffect& other) = delete; + SoxEffect(const SoxEffect&& other) = delete; + auto operator=(const SoxEffect& other) -> SoxEffect& = delete; + auto operator=(SoxEffect&& other) -> SoxEffect& = delete; + ~SoxEffect(); + operator sox_effect_t*() const; + auto operator->() noexcept -> sox_effect_t*; + + private: + sox_effect_t* se_; +}; + +// Helper struct to safely close sox_effects_chain_t with handy methods +class SoxEffectsChain { + const sox_encodinginfo_t in_enc_; + const sox_encodinginfo_t out_enc_; + + protected: + sox_signalinfo_t in_sig_; + sox_signalinfo_t interm_sig_; + sox_signalinfo_t out_sig_; + sox_effects_chain_t* sec_; + + public: + explicit SoxEffectsChain( + sox_encodinginfo_t input_encoding, + sox_encodinginfo_t output_encoding); + SoxEffectsChain(const SoxEffectsChain& other) = delete; + SoxEffectsChain(const SoxEffectsChain&& other) = delete; + SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete; + SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete; + ~SoxEffectsChain(); + void run(); + void addInputTensor( + py::array* waveform, + int64_t sample_rate, + bool channels_first); + void addInputFile(sox_format_t* sf); + void addOutputBuffer(std::vector* output_buffer); + void addOutputFile(sox_format_t* sf); + void addEffect(const std::vector effect); + int64_t getOutputNumChannels(); + int64_t getOutputSampleRate(); +}; + +} // namespace sox_effects_chain +} // namespace torchaudio + diff --git a/paddlespeech/audio/src/sox/io.cpp b/paddlespeech/audio/src/sox/io.cpp index f4dcce475..5a75fc987 100644 --- a/paddlespeech/audio/src/sox/io.cpp +++ b/paddlespeech/audio/src/sox/io.cpp @@ -1,10 +1,10 @@ -// #include "sox/effects.h" -// #include "sox/effects_chain.h" -#include "sox/io.h" -#include "sox/types.h" -#include "sox/utils.h" +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp +#include "paddlespeech/audio/src/sox/effects.h" +#include "paddlespeech/audio/src/sox/effects_chain.h" +#include "paddlespeech/audio/src/sox/io.h" +#include "paddlespeech/audio/src/sox/types.h" +#include "paddlespeech/audio/src/sox/utils.h" -using namespace torch::indexing; 
using namespace paddleaudio::sox_utils; namespace paddleaudio { @@ -60,7 +60,7 @@ std::vector> get_effects( return effects; } -tl::optional> load_audio_file( +tl::optional> load_audio_file( const std::string& path, const tl::optional& frame_offset, const tl::optional& num_frames, @@ -73,7 +73,7 @@ tl::optional> load_audio_file( } void save_audio_file(const std::string& path, - torch::Tensor tensor, + py::array tensor, int64_t sample_rate, bool channels_first, tl::optional compression, @@ -88,19 +88,19 @@ void save_audio_file(const std::string& path, }(); if (filetype == "amr-nb") { - const auto num_channels = tensor.size(channels_first ? 0 : 1); - TORCH_CHECK(num_channels == 1, - "amr-nb format only supports single channel audio."); + const auto num_channels = tensor.shape(channels_first ? 0 : 1); + //TORCH_CHECK(num_channels == 1, + // "amr-nb format only supports single channel audio."); } else if (filetype == "htk") { - const auto num_channels = tensor.size(channels_first ? 0 : 1); - TORCH_CHECK(num_channels == 1, - "htk format only supports single channel audio."); + const auto num_channels = tensor.shape(channels_first ? 0 : 1); + // TORCH_CHECK(num_channels == 1, + // "htk format only supports single channel audio."); } else if (filetype == "gsm") { - const auto num_channels = tensor.size(channels_first ? 0 : 1); - TORCH_CHECK(num_channels == 1, - "gsm format only supports single channel audio."); - TORCH_CHECK(sample_rate == 8000, - "gsm format only supports a sampling rate of 8kHz."); + const auto num_channels = tensor.shape(channels_first ? 0 : 1); + //TORCH_CHECK(num_channels == 1, + // "gsm format only supports single channel audio."); + //TORCH_CHECK(sample_rate == 8000, + // "gsm format only supports a sampling rate of 8kHz."); } const auto signal_info = get_signalinfo(&tensor, sample_rate, filetype, channels_first); @@ -127,13 +127,5 @@ void save_audio_file(const std::string& path, chain.run(); } -TORCH_LIBRARY_FRAGMENT(paddleaudio, m) { - m.def("paddleaudio::sox_io_get_info", &paddleaudio::sox_io::get_info_file); - m.def("paddleaudio::sox_io_load_audio_file", - &paddleaudio::sox_io::load_audio_file); - m.def("paddleaudio::sox_io_save_audio_file", - &paddleaudio::sox_io::save_audio_file); -} - } // namespace sox_io } // namespace paddleaudio \ No newline at end of file diff --git a/paddlespeech/audio/src/sox/io.h b/paddlespeech/audio/src/sox/io.h index 4464e6655..f8001d872 100644 --- a/paddlespeech/audio/src/sox/io.h +++ b/paddlespeech/audio/src/sox/io.h @@ -2,11 +2,10 @@ // Copyright (c) 2017 Facebook Inc. (Soumith Chintala), // All rights reserved. 
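For context, both the file and fileobj loaders build their effect list the same way: get_effects turns frame_offset/num_frames into SoX "trim" arguments measured in samples (hence the trailing "s"). A minimal Python restatement of that mapping, for illustration only:

    def trim_effects(frame_offset=0, num_frames=-1):
        # Mirrors get_effects(): offsets and lengths are passed to SoX in samples
        # ("Ns"), and "+Ns" asks for N samples starting from the offset.
        if frame_offset < 0:
            raise ValueError("frame_offset must be non-negative.")
        if num_frames == 0 or num_frames < -1:
            raise ValueError("num_frames must be -1 or greater than 0.")
        if num_frames != -1:
            return [["trim", f"{frame_offset}s", f"+{num_frames}s"]]
        if frame_offset != 0:
            return [["trim", f"{frame_offset}s"]]
        return []

    # e.g. trim_effects(8000, 16000) -> [["trim", "8000s", "+16000s"]]
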
-#ifndef PADDLEAUDIO_SOX_IO_H -#define PADDLEAUDIO_SOX_IO_H +#pragma once -// #include "sox/utils.h" -#include "optional/optional.hpp" +#include "paddlespeech/audio/src/optional/optional.hpp" +#include "paddlespeech/audio/src/sox/utils.h" namespace paddleaudio { namespace sox_io { @@ -21,7 +20,7 @@ using MetaDataTuple = tl::optional get_info_file( const std::string& path, const tl::optional& format); -tl::optional> load_audio_file( +tl::optional> load_audio_file( const std::string& path, const tl::optional& frame_offset, const tl::optional& num_frames, @@ -30,7 +29,7 @@ tl::optional> load_audio_file( const tl::optional& format); void save_audio_file(const std::string& path, - torch::Tensor tensor, + py::array tensor, int64_t sample_rate, bool channels_first, tl::optional compression, @@ -40,5 +39,3 @@ void save_audio_file(const std::string& path, } // namespace sox_io } // namespace paddleaudio - -#endif \ No newline at end of file diff --git a/paddlespeech/audio/src/sox/types.cpp b/paddlespeech/audio/src/sox/types.cpp new file mode 100644 index 000000000..ab1808be1 --- /dev/null +++ b/paddlespeech/audio/src/sox/types.cpp @@ -0,0 +1,143 @@ +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp + +#include "paddlespeech/audio/src/sox/types.h" +#include +#include + +namespace paddleaudio { +namespace sox_utils { + +Format get_format_from_string(const std::string& format) { + if (format == "wav") + return Format::WAV; + if (format == "mp3") + return Format::MP3; + if (format == "flac") + return Format::FLAC; + if (format == "ogg" || format == "vorbis") + return Format::VORBIS; + if (format == "amr-nb") + return Format::AMR_NB; + if (format == "amr-wb") + return Format::AMR_WB; + if (format == "amb") + return Format::AMB; + if (format == "sph") + return Format::SPHERE; + if (format == "htk") + return Format::HTK; + if (format == "gsm") + return Format::GSM; + std::ostringstream stream; + stream << "Internal Error: unexpected format value: " << format; + throw std::runtime_error(stream.str()); +} + +std::string to_string(Encoding v) { + switch (v) { + case Encoding::UNKNOWN: + return "UNKNOWN"; + case Encoding::PCM_SIGNED: + return "PCM_S"; + case Encoding::PCM_UNSIGNED: + return "PCM_U"; + case Encoding::PCM_FLOAT: + return "PCM_F"; + case Encoding::FLAC: + return "FLAC"; + case Encoding::ULAW: + return "ULAW"; + case Encoding::ALAW: + return "ALAW"; + case Encoding::MP3: + return "MP3"; + case Encoding::VORBIS: + return "VORBIS"; + case Encoding::AMR_WB: + return "AMR_WB"; + case Encoding::AMR_NB: + return "AMR_NB"; + case Encoding::OPUS: + return "OPUS"; + default: + throw std::runtime_error("Internal Error: unexpected encoding."); + } +} + +Encoding get_encoding_from_option(const tl::optional encoding) { + if (!encoding.has_value()) + return Encoding::NOT_PROVIDED; + std::string v = encoding.value(); + if (v == "PCM_S") + return Encoding::PCM_SIGNED; + if (v == "PCM_U") + return Encoding::PCM_UNSIGNED; + if (v == "PCM_F") + return Encoding::PCM_FLOAT; + if (v == "ULAW") + return Encoding::ULAW; + if (v == "ALAW") + return Encoding::ALAW; + std::ostringstream stream; + stream << "Internal Error: unexpected encoding value: " << v; + throw std::runtime_error(stream.str()); +} + +BitDepth get_bit_depth_from_option(const tl::optional bit_depth) { + if (!bit_depth.has_value()) + return BitDepth::NOT_PROVIDED; + int64_t v = bit_depth.value(); + switch (v) { + case 8: + return BitDepth::B8; + case 16: + return BitDepth::B16; + case 24: + return BitDepth::B24; + 
case 32: + return BitDepth::B32; + case 64: + return BitDepth::B64; + default: { + std::ostringstream s; + s << "Internal Error: unexpected bit depth value: " << v; + throw std::runtime_error(s.str()); + } + } +} + +std::string get_encoding(sox_encoding_t encoding) { + switch (encoding) { + case SOX_ENCODING_UNKNOWN: + return "UNKNOWN"; + case SOX_ENCODING_SIGN2: + return "PCM_S"; + case SOX_ENCODING_UNSIGNED: + return "PCM_U"; + case SOX_ENCODING_FLOAT: + return "PCM_F"; + case SOX_ENCODING_FLAC: + return "FLAC"; + case SOX_ENCODING_ULAW: + return "ULAW"; + case SOX_ENCODING_ALAW: + return "ALAW"; + case SOX_ENCODING_MP3: + return "MP3"; + case SOX_ENCODING_VORBIS: + return "VORBIS"; + case SOX_ENCODING_AMR_WB: + return "AMR_WB"; + case SOX_ENCODING_AMR_NB: + return "AMR_NB"; + case SOX_ENCODING_OPUS: + return "OPUS"; + case SOX_ENCODING_GSM: + return "GSM"; + default: + return "UNKNOWN"; + } +} + +} // namespace sox_utils +} // namespace paddleaudio diff --git a/paddlespeech/audio/src/sox/types.h b/paddlespeech/audio/src/sox/types.h new file mode 100644 index 000000000..824c0f632 --- /dev/null +++ b/paddlespeech/audio/src/sox/types.h @@ -0,0 +1,58 @@ +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h +#pragma once + +#include +#include "paddlespeech/audio/src/optional/optional.hpp" + +namespace paddleaudio { +namespace sox_utils { + +enum class Format { + WAV, + MP3, + FLAC, + VORBIS, + AMR_NB, + AMR_WB, + AMB, + SPHERE, + GSM, + HTK, +}; + +Format get_format_from_string(const std::string& format); + +enum class Encoding { + NOT_PROVIDED, + UNKNOWN, + PCM_SIGNED, + PCM_UNSIGNED, + PCM_FLOAT, + FLAC, + ULAW, + ALAW, + MP3, + VORBIS, + AMR_WB, + AMR_NB, + OPUS, +}; + +std::string to_string(Encoding v); +Encoding get_encoding_from_option(const tl::optional encoding); + +enum class BitDepth : unsigned { + NOT_PROVIDED = 0, + B8 = 8, + B16 = 16, + B24 = 24, + B32 = 32, + B64 = 64, +}; + +BitDepth get_bit_depth_from_option(const tl::optional bit_depth); + +std::string get_encoding(sox_encoding_t encoding); + +} // namespace sox_utils +} // namespace torchaudio \ No newline at end of file diff --git a/paddlespeech/audio/src/sox/utils.cpp b/paddlespeech/audio/src/sox/utils.cpp new file mode 100644 index 000000000..a44031bb4 --- /dev/null +++ b/paddlespeech/audio/src/sox/utils.cpp @@ -0,0 +1,488 @@ +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.cpp + +#include +#include "paddlespeech/audio/src/sox/types.h" +#include "paddlespeech/audio/src/sox/utils.h" + +namespace paddleaudio { +namespace sox_utils { + +void set_seed(const int64_t seed) { + sox_get_globals()->ranqd1 = static_cast(seed); +} + +void set_verbosity(const int64_t verbosity) { + sox_get_globals()->verbosity = static_cast(verbosity); +} + +void set_use_threads(const bool use_threads) { + sox_get_globals()->use_threads = static_cast(use_threads); +} + +void set_buffer_size(const int64_t buffer_size) { + sox_get_globals()->bufsiz = static_cast(buffer_size); +} + +int64_t get_buffer_size() { + return sox_get_globals()->bufsiz; +} + +std::vector> list_effects() { + std::vector> effects; + for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) { + const sox_effect_handler_t* handler = (*fns)(); + if (handler && handler->name) { + if (UNSUPPORTED_EFFECTS.find(handler->name) == + UNSUPPORTED_EFFECTS.end()) { + effects.emplace_back(std::vector{ + handler->name, + handler->usage ? 
std::string(handler->usage) : std::string("")}); + } + } + } + return effects; +} + +std::vector list_write_formats() { + std::vector formats; + for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { + const sox_format_handler_t* handler = fns->fn(); + for (const char* const* names = handler->names; *names; ++names) { + if (!strchr(*names, '/') && handler->write) + formats.emplace_back(*names); + } + } + return formats; +} + +std::vector list_read_formats() { + std::vector formats; + for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { + const sox_format_handler_t* handler = fns->fn(); + for (const char* const* names = handler->names; *names; ++names) { + if (!strchr(*names, '/') && handler->read) + formats.emplace_back(*names); + } + } + return formats; +} + +SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {} +SoxFormat::~SoxFormat() { + close(); +} + +sox_format_t* SoxFormat::operator->() const noexcept { + return fd_; +} +SoxFormat::operator sox_format_t*() const noexcept { + return fd_; +} + +void SoxFormat::close() { + if (fd_ != nullptr) { + sox_close(fd_); + fd_ = nullptr; + } +} + +void validate_input_file(const SoxFormat& sf, const std::string& path) { + if (static_cast(sf) == nullptr) { + throw std::runtime_error( + "Error loading audio file: failed to open file " + path); + } + if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) { + throw std::runtime_error("Error loading audio file: unknown encoding."); + } +} + +void validate_input_memfile(const SoxFormat &sf) { + return validate_input_file(sf, ""); +} + +void validate_input_tensor(const py::array tensor) { + if (tensor.ndim() != 2) { + throw std::runtime_error("Input tensor has to be 2D."); + } + + char dtype = tensor.dtype().char_(); + bool flag = (dtype == 'f') || (dtype == 'd') || (dtype == 'l') || (dtype == 'i'); + if (flag == false) { + throw std::runtime_error( + "Input tensor has to be one of float32, int32, int16 or uint8 type."); + } +} + +py::dtype get_dtype( + const sox_encoding_t encoding, + const unsigned precision) { + switch (encoding) { + case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV + return py::dtype('u1'); + case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV + switch (precision) { + case 16: + return py::dtype("i2"); + case 24: // Cast 24-bit to 32-bit. + case 32: + return py::dtype('i'); + default: + throw std::runtime_error( + "Only 16, 24, and 32 bits are supported for signed PCM."); + } + default: + // default to float32 for the other formats, including + // 32-bit flaoting-point WAV, + // MP3, + // FLAC, + // VORBIS etc... 
+ return py::dtype("f"); + } +} + +py::array convert_to_tensor( + sox_sample_t* buffer, + const int32_t num_samples, + const int32_t num_channels, + const py::dtype dtype, + const bool normalize, + const bool channels_first) { + py::array t; + uint64_t dummy = 0; + SOX_SAMPLE_LOCALS; + if (normalize || dtype.char_() == 'f') { + t = py::array(dtype, {num_samples / num_channels, num_channels}); + auto ptr = (float*)t.mutable_data(0, 0); + for (int32_t i = 0; i < num_samples; ++i) { + ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy); + } + } else if (dtype.char_() == 'i') { + //t = torch::from_blob( + // buffer, {num_samples / num_channels, num_channels}, torch::kInt32) + // .clone(); + t = py::array(dtype, {num_samples / num_channels, num_channels}); + auto ptr = (int*)t.mutable_data(0, 0); + for (int32_t i = 0; i < num_samples; ++i) { + ptr[i] = buffer[i]; + } + } else if (dtype.char_() == 'h') { // int16 + t = py::array(dtype, {num_samples / num_channels, num_channels}); + auto ptr = (int16_t*)t.mutable_data(0, 0); + for (int32_t i = 0; i < num_samples; ++i) { + ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy); + } + } else if (dtype.char_() == 'b') { + //t = torch::empty({num_samples / num_channels, num_channels}, torch::kUInt8); + auto ptr = (uint8_t*)t.mutable_data(0,0); + for (int32_t i = 0; i < num_samples; ++i) { + ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy); + } + } else { + throw std::runtime_error("Unsupported dtype."); + } + return t; +} + +const std::string get_filetype(const std::string path) { + std::string ext = path.substr(path.find_last_of(".") + 1); + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + return ext; +} + +namespace { + +std::tuple get_save_encoding_for_wav( + const std::string format, + py::dtype dtype, + const Encoding& encoding, + const BitDepth& bits_per_sample) { + switch (encoding) { + case Encoding::NOT_PROVIDED: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + switch (dtype.num()) { + case 11: // float32 numpy dtype num + return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); + case 5: // int numpy dtype num + return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); + case 3: // int16 numpy + return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); + case 1: // byte numpy + return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); + default: + throw std::runtime_error("Internal Error: Unexpected dtype."); + } + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); + default: + return std::make_tuple<>( + SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); + } + case Encoding::PCM_SIGNED: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); + case BitDepth::B8: + throw std::runtime_error( + format + " does not support 8-bit signed PCM encoding."); + default: + return std::make_tuple<>( + SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); + } + case Encoding::PCM_UNSIGNED: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); + default: + throw std::runtime_error( + format + " only supports 8-bit for unsigned PCM encoding."); + } + case Encoding::PCM_FLOAT: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B32: + return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); + case BitDepth::B64: + return std::make_tuple<>(SOX_ENCODING_FLOAT, 64); + default: + throw std::runtime_error( + format + + " only supports 32-bit or 64-bit for floating-point PCM 
encoding."); + } + case Encoding::ULAW: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_ULAW, 8); + default: + throw std::runtime_error( + format + " only supports 8-bit for mu-law encoding."); + } + case Encoding::ALAW: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_ALAW, 8); + default: + throw std::runtime_error( + format + " only supports 8-bit for a-law encoding."); + } + default: + throw std::runtime_error( + format + " does not support encoding: " + to_string(encoding)); + } +} + +std::tuple get_save_encoding( + const std::string& format, + const py::dtype dtype, + const tl::optional encoding, + const tl::optional bits_per_sample) { + const Format fmt = get_format_from_string(format); + const Encoding enc = get_encoding_from_option(encoding); + const BitDepth bps = get_bit_depth_from_option(bits_per_sample); + + switch (fmt) { + case Format::WAV: + case Format::AMB: + return get_save_encoding_for_wav(format, dtype, enc, bps); + case Format::MP3: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("mp3 does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "mp3 does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_MP3, 16); + case Format::HTK: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("htk does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "htk does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); + case Format::VORBIS: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("vorbis does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "vorbis does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_VORBIS, 16); + case Format::AMR_NB: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("amr-nb does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "amr-nb does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16); + case Format::FLAC: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("flac does not support `encoding` option."); + switch (bps) { + case BitDepth::B32: + case BitDepth::B64: + throw std::runtime_error( + "flac does not support `bits_per_sample` larger than 24."); + default: + return std::make_tuple<>( + SOX_ENCODING_FLAC, static_cast(bps)); + } + case Format::SPHERE: + switch (enc) { + case Encoding::NOT_PROVIDED: + case Encoding::PCM_SIGNED: + switch (bps) { + case BitDepth::NOT_PROVIDED: + return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); + default: + return std::make_tuple<>( + SOX_ENCODING_SIGN2, static_cast(bps)); + } + case Encoding::PCM_UNSIGNED: + throw std::runtime_error( + "sph does not support unsigned integer PCM."); + case Encoding::PCM_FLOAT: + throw std::runtime_error("sph does not support floating point PCM."); + case Encoding::ULAW: + switch (bps) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_ULAW, 8); + default: + throw std::runtime_error( + "sph only supports 8-bit for mu-law encoding."); + } + case Encoding::ALAW: + switch (bps) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_ALAW, 8); + default: + 
return std::make_tuple<>( + SOX_ENCODING_ALAW, static_cast(bps)); + } + default: + throw std::runtime_error( + "sph does not support encoding: " + encoding.value()); + } + case Format::GSM: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("gsm does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "gsm does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_GSM, 16); + + default: + throw std::runtime_error("Unsupported format: " + format); + } +} + +unsigned get_precision(const std::string filetype, py::dtype dtype) { + if (filetype == "mp3") + return SOX_UNSPEC; + if (filetype == "flac") + return 24; + if (filetype == "ogg" || filetype == "vorbis") + return SOX_UNSPEC; + if (filetype == "wav" || filetype == "amb") { + switch (dtype.num()) { + case 1: // byte in numpy dype num + return 8; + case 3: // short, in numpy dtype num + return 16; + case 5: // int, numpy dtype + return 32; + case 11: // float, numpy dtype + return 32; + default: + throw std::runtime_error("Unsupported dtype."); + } + } + if (filetype == "sph") + return 32; + if (filetype == "amr-nb") { + return 16; + } + if (filetype == "gsm") { + return 16; + } + if (filetype == "htk") { + return 16; + } + throw std::runtime_error("Unsupported file type: " + filetype); +} + +} // namespace + +sox_signalinfo_t get_signalinfo( + const py::array* waveform, + const int64_t sample_rate, + const std::string filetype, + const bool channels_first) { + return sox_signalinfo_t{ + /*rate=*/static_cast(sample_rate), + /*channels=*/ + static_cast(waveform->shape(channels_first ? 0 : 1)), + /*precision=*/get_precision(filetype, waveform->dtype()), + /*length=*/static_cast(waveform->size())}; +} + +sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) { + sox_encoding_t encoding = [&]() { + switch (dtype.num()) { + case 1: // byte + return SOX_ENCODING_UNSIGNED; + case 3: // short + return SOX_ENCODING_SIGN2; + case 5: // int32 + return SOX_ENCODING_SIGN2; + case 11: // float + return SOX_ENCODING_FLOAT; + default: + throw std::runtime_error("Unsupported dtype."); + } + }(); + unsigned bits_per_sample = [&]() { + switch (dtype.num()) { + case 1: // byte + return 8; + case 3: //short + return 16; + case 5: // int32 + return 32; + case 11: // float + return 32; + default: + throw std::runtime_error("Unsupported dtype."); + } + }(); + return sox_encodinginfo_t{ + /*encoding=*/encoding, + /*bits_per_sample=*/bits_per_sample, + /*compression=*/HUGE_VAL, + /*reverse_bytes=*/sox_option_default, + /*reverse_nibbles=*/sox_option_default, + /*reverse_bits=*/sox_option_default, + /*opposite_endian=*/sox_false}; +} + +sox_encodinginfo_t get_encodinginfo_for_save( + const std::string& format, + const py::dtype dtype, + const tl::optional compression, + const tl::optional encoding, + const tl::optional bits_per_sample) { + auto enc = get_save_encoding(format, dtype, encoding, bits_per_sample); + return sox_encodinginfo_t{ + /*encoding=*/std::get<0>(enc), + /*bits_per_sample=*/std::get<1>(enc), + /*compression=*/compression.value_or(HUGE_VAL), + /*reverse_bytes=*/sox_option_default, + /*reverse_nibbles=*/sox_option_default, + /*reverse_bits=*/sox_option_default, + /*opposite_endian=*/sox_false}; +} + +} // namespace sox_utils +} // namespace torchaudio diff --git a/paddlespeech/audio/src/sox/utils.h b/paddlespeech/audio/src/sox/utils.h new file mode 100644 index 000000000..5b015ece0 --- /dev/null +++ b/paddlespeech/audio/src/sox/utils.h @@ -0,0 +1,120 @@ 
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.h + +#pragma once + +#include +#include +#include + +#include "paddlespeech/audio/src/optional/optional.hpp" + +namespace py = pybind11; + +namespace paddleaudio { +namespace sox_utils { + +//////////////////////////////////////////////////////////////////////////////// +// APIs for Python interaction +//////////////////////////////////////////////////////////////////////////////// + +/// Set sox global options +void set_seed(const int64_t seed); + +void set_verbosity(const int64_t verbosity); + +void set_use_threads(const bool use_threads); + +void set_buffer_size(const int64_t buffer_size); + +int64_t get_buffer_size(); + +std::vector> list_effects(); + +std::vector list_read_formats(); + +std::vector list_write_formats(); + +//////////////////////////////////////////////////////////////////////////////// +// Utilities for sox_io / sox_effects implementations +//////////////////////////////////////////////////////////////////////////////// + +const std::unordered_set UNSUPPORTED_EFFECTS = + {"input", "output", "spectrogram", "noiseprof", "noisered", "splice"}; + +/// helper class to automatically close sox_format_t* +struct SoxFormat { + explicit SoxFormat(sox_format_t* fd) noexcept; + SoxFormat(const SoxFormat& other) = delete; + SoxFormat(SoxFormat&& other) = delete; + SoxFormat& operator=(const SoxFormat& other) = delete; + SoxFormat& operator=(SoxFormat&& other) = delete; + ~SoxFormat(); + sox_format_t* operator->() const noexcept; + operator sox_format_t*() const noexcept; + + void close(); + + private: + sox_format_t* fd_; +}; + +/// +/// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32 +void validate_input_tensor(const py::array); + +void validate_input_file(const SoxFormat& sf, const std::string& path); + +void validate_input_memfile(const SoxFormat &sf); +/// +/// Get target dtype for the given encoding and precision. +py::dtype get_dtype( + const sox_encoding_t encoding, + const unsigned precision); + +/// +/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor +/// NOTE: This function might modify the values in the input buffer to +/// reduce the number of memory copy. +/// @param buffer Pointer to buffer that contains audio data. +/// @param num_samples The number of samples to read. +/// @param num_channels The number of channels. Used to reshape the resulting +/// Tensor. +/// @param dtype Target dtype. Determines the output dtype and value range in +/// conjunction with normalization. +/// @param noramlize Perform normalization. Only effective when dtype is not +/// kFloat32. When effective, the output tensor is kFloat32 type and value range +/// is [-1.0, 1.0] +/// @param channels_first When True, output Tensor has shape of [num_channels, +/// num_frames]. +py::array convert_to_tensor( + sox_sample_t* buffer, + const int32_t num_samples, + const int32_t num_channels, + const py::dtype dtype, + const bool normalize, + const bool channels_first); + +/// Extract extension from file path +const std::string get_filetype(const std::string path); + +/// Get sox_signalinfo_t for passing a py::array object. 
+sox_signalinfo_t get_signalinfo( + const py::array* waveform, + const int64_t sample_rate, + const std::string filetype, + const bool channels_first); + +/// Get sox_encodinginfo_t for Tensor I/O +sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype); + +/// Get sox_encodinginfo_t for saving to file/file object +sox_encodinginfo_t get_encodinginfo_for_save( + const std::string& format, + const py::dtype dtype, + const tl::optional compression, + const tl::optional encoding, + const tl::optional bits_per_sample); + + +} // namespace sox_utils +} // namespace paddleaudio diff --git a/setup.py b/setup.py index 19893c62f..903bba64e 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ base = [ "pypinyin", "pypinyin-dict", "python-dateutil", "pyworld", "resampy==0.2.2", "sacrebleu", "scipy", "sentencepiece~=0.1.96", "soundfile~=0.10", "textgrid", "timer", "tqdm", "typeguard", "visualdl", "webrtcvad", - "yacs~=0.1.8", "prettytable", "zhon", "colorlog", "pathos == 0.2.8" + "yacs~=0.1.8", "prettytable", "zhon", "colorlog", "pathos == 0.2.8", "Ninja" ] server = [ From c938a4688a6c661ea4a41fc214449c8c67b4717c Mon Sep 17 00:00:00 2001 From: YangZhou Date: Fri, 29 Jul 2022 16:03:26 +0800 Subject: [PATCH 02/11] slove link error --- paddlespeech/audio/src/CMakeLists.txt | 11 +- paddlespeech/audio/src/pybind/sox/effects.cpp | 136 +++++ paddlespeech/audio/src/pybind/sox/effects.h | 18 + .../audio/src/pybind/sox/effects_chain.cpp | 330 ++++++++++++ .../audio/src/pybind/sox/effects_chain.h | 53 +- paddlespeech/audio/src/pybind/sox/io.cpp | 72 ++- paddlespeech/audio/src/pybind/sox/io.h | 32 +- paddlespeech/audio/src/pybind/sox/types.cpp | 143 ++++++ paddlespeech/audio/src/pybind/sox/types.h | 58 +++ paddlespeech/audio/src/pybind/sox/utils.cpp | 481 ++++++++++++++++++ paddlespeech/audio/src/pybind/sox/utils.h | 99 +++- 11 files changed, 1416 insertions(+), 17 deletions(-) create mode 100644 paddlespeech/audio/src/pybind/sox/types.cpp create mode 100644 paddlespeech/audio/src/pybind/sox/types.h diff --git a/paddlespeech/audio/src/CMakeLists.txt b/paddlespeech/audio/src/CMakeLists.txt index 7448225ef..4c46fbe24 100644 --- a/paddlespeech/audio/src/CMakeLists.txt +++ b/paddlespeech/audio/src/CMakeLists.txt @@ -35,11 +35,11 @@ if(BUILD_SOX) list( APPEND LIBPADDLEAUDIO_SOURCES - sox/io.cpp - sox/utils.cpp - sox/effects.cpp - sox/effects_chain.cpp - sox/types.cpp + #sox/io.cpp + #sox/utils.cpp + #sox/effects.cpp + #sox/effects_chain.cpp + #sox/types.cpp ) list( APPEND @@ -147,6 +147,7 @@ if(BUILD_SOX) pybind/sox/effects.cpp pybind/sox/effects_chain.cpp pybind/sox/io.cpp + pybind/sox/types.cpp pybind/sox/utils.cpp ) endif() diff --git a/paddlespeech/audio/src/pybind/sox/effects.cpp b/paddlespeech/audio/src/pybind/sox/effects.cpp index 96907a670..b69c5358a 100644 --- a/paddlespeech/audio/src/pybind/sox/effects.cpp +++ b/paddlespeech/audio/src/pybind/sox/effects.cpp @@ -1,3 +1,6 @@ +#include +#include + #include "paddlespeech/audio/src/pybind/sox/effects.h" #include "paddlespeech/audio/src/pybind/sox/effects_chain.h" #include "paddlespeech/audio/src/pybind/sox/utils.h" @@ -118,4 +121,137 @@ auto apply_effects_fileobj( tensor, static_cast(chain.getOutputSampleRate())); } +namespace { + +enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown }; +SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized; +std::mutex SOX_RESOUCE_STATE_MUTEX; + +} // namespace + +void initialize_sox_effects() { + const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); + + switch (SOX_RESOURCE_STATE) { + case 
NotInitialized: + if (sox_init() != SOX_SUCCESS) { + throw std::runtime_error("Failed to initialize sox effects."); + }; + SOX_RESOURCE_STATE = Initialized; + break; + case Initialized: + break; + case ShutDown: + throw std::runtime_error( + "SoX Effects has been shut down. Cannot initialize again."); + } +}; + +void shutdown_sox_effects() { + const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); + + switch (SOX_RESOURCE_STATE) { + case NotInitialized: + throw std::runtime_error( + "SoX Effects is not initialized. Cannot shutdown."); + case Initialized: + if (sox_quit() != SOX_SUCCESS) { + throw std::runtime_error("Failed to initialize sox effects."); + }; + SOX_RESOURCE_STATE = ShutDown; + break; + case ShutDown: + break; + } +} + +auto apply_effects_tensor( + py::array waveform, + int64_t sample_rate, + const std::vector>& effects, + bool channels_first) -> std::tuple { + validate_input_tensor(waveform); + + // Create SoxEffectsChain + const auto dtype = waveform.dtype(); + paddleaudio::sox_effects_chain::SoxEffectsChain chain( + /*input_encoding=*/get_tensor_encodinginfo(dtype), + /*output_encoding=*/get_tensor_encodinginfo(dtype)); + + // Prepare output buffer + std::vector out_buffer; + out_buffer.reserve(waveform.size()); + + // Build and run effects chain + chain.addInputTensor(&waveform, sample_rate, channels_first); + for (const auto& effect : effects) { + chain.addEffect(effect); + } + chain.addOutputBuffer(&out_buffer); + chain.run(); + + // Create tensor from buffer + auto out_tensor = convert_to_tensor( + /*buffer=*/out_buffer.data(), + /*num_samples=*/out_buffer.size(), + /*num_channels=*/chain.getOutputNumChannels(), + dtype, + /*normalize=*/false, + channels_first); + + return std::tuple( + out_tensor, chain.getOutputSampleRate()); +} + +auto apply_effects_file( + const std::string& path, + const std::vector>& effects, + tl::optional normalize, + tl::optional channels_first, + const tl::optional& format) + -> tl::optional> { + // Open input file + SoxFormat sf(sox_open_read( + path.c_str(), + /*signal=*/nullptr, + /*encoding=*/nullptr, + /*filetype=*/format.has_value() ? 
format.value().c_str() : nullptr)); + + if (static_cast(sf) == nullptr || + sf->encoding.encoding == SOX_ENCODING_UNKNOWN) { + return {}; + } + + const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision); + + // Prepare output + std::vector out_buffer; + out_buffer.reserve(sf->signal.length); + + // Create and run SoxEffectsChain + paddleaudio::sox_effects_chain::SoxEffectsChain chain( + /*input_encoding=*/sf->encoding, + /*output_encoding=*/get_tensor_encodinginfo(dtype)); + + chain.addInputFile(sf); + for (const auto& effect : effects) { + chain.addEffect(effect); + } + chain.addOutputBuffer(&out_buffer); + chain.run(); + + // Create tensor from buffer + bool channels_first_ = channels_first.value_or(true); + auto tensor = convert_to_tensor( + /*buffer=*/out_buffer.data(), + /*num_samples=*/out_buffer.size(), + /*num_channels=*/chain.getOutputNumChannels(), + dtype, + normalize.value_or(true), + channels_first_); + + return std::tuple( + tensor, chain.getOutputSampleRate()); +} + } // namespace paddleaudio::sox_effects diff --git a/paddlespeech/audio/src/pybind/sox/effects.h b/paddlespeech/audio/src/pybind/sox/effects.h index 5e67cb011..6ba53d008 100644 --- a/paddlespeech/audio/src/pybind/sox/effects.h +++ b/paddlespeech/audio/src/pybind/sox/effects.h @@ -15,4 +15,22 @@ auto apply_effects_fileobj( tl::optional format) -> tl::optional>; +void initialize_sox_effects(); + +void shutdown_sox_effects(); + +auto apply_effects_tensor( + py::array waveform, + int64_t sample_rate, + const std::vector>& effects, + bool channels_first) -> std::tuple; + +auto apply_effects_file( + const std::string& path, + const std::vector>& effects, + tl::optional normalize, + tl::optional channels_first, + const tl::optional& format) + -> tl::optional>; + } // namespace paddleaudio::sox_effects diff --git a/paddlespeech/audio/src/pybind/sox/effects_chain.cpp b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp index a106209d6..4ad90da36 100644 --- a/paddlespeech/audio/src/pybind/sox/effects_chain.cpp +++ b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp @@ -9,6 +9,336 @@ namespace paddleaudio::sox_effects_chain { namespace { +/// helper classes for passing the location of input tensor and output buffer +/// +/// drain/flow callback functions require plaing C style function signature and +/// the way to pass extra data is to attach data to sox_effect_t::priv pointer. +/// The following structs will be assigned to sox_effect_t::priv pointer which +/// gives sox_effect_t an access to input Tensor and output buffer object. +struct TensorInputPriv { + size_t index; + py::array* waveform; + int64_t sample_rate; + bool channels_first; +}; + +struct TensorOutputPriv { + std::vector* buffer; +}; +struct FileOutputPriv { + sox_format_t* sf; +}; + +/// Callback function to feed Tensor data to SoxEffectChain. 
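// The drain callback below fills `obuf` with up to `*osamp` samples encoded as
// 32-bit sox_sample_t and returns SOX_EOF once the whole tensor has been read.
// A minimal sketch of the per-sample float32 -> sox_sample_t mapping it applies,
// assuming input normalized to [-1.0, 1.0] (the helper name is illustrative only):
static inline int32_t float_to_sox_sample_sketch(float v) {
  double scaled = static_cast<double>(v) * 2147483648.0;  // scale by 2^31
  if (scaled > INT32_MAX) return INT32_MAX;               // clamp positive overflow
  if (scaled < INT32_MIN) return INT32_MIN;               // clamp negative overflow
  return static_cast<int32_t>(scaled);
}
// int16 input is widened with x * 65536, uint8 with (x - 128) * 16777216, and
// int32 input is already in the sox_sample_t range.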
+int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { + // Retrieve the input Tensor and current index + auto priv = static_cast(effp->priv); + auto index = priv->index; + auto tensor = *(priv->waveform); + auto num_channels = effp->out_signal.channels; + + // Adjust the number of samples to read + const size_t num_samples = tensor.size(); + if (index + *osamp > num_samples) { + *osamp = num_samples - index; + } + // Ensure that it's a multiple of the number of channels + *osamp -= *osamp % num_channels; + + // Slice the input Tensor + // refacor this module, chunk + auto i_frame = index / num_channels; + auto num_frames = *osamp / num_channels; + py::array chunk(tensor.dtype(), {num_frames*num_channels}); + py::buffer_info ori_info = tensor.request(); + py::buffer_info info = chunk.request(); + char* ori_start_ptr = (char*)ori_info.ptr + index * chunk.itemsize() / sizeof(char); + std::memcpy(info.ptr, ori_start_ptr, chunk.nbytes()); + + py::dtype chunk_type = py::dtype("i"); // dtype int32 + py::array new_chunk = py::array(chunk_type, chunk.shape()); + py::buffer_info new_info = new_chunk.request(); + void* ptr = (void*) info.ptr; + int* new_ptr = (int*) new_info.ptr; + // Convert to sox_sample_t (int32_t) + switch (chunk.dtype().num()) { + //case c10::ScalarType::Float: { + case 11: { + // Need to convert to 64-bit precision so that + // values around INT32_MIN/MAX are handled correctly. + float* ptr_f = (float*)ptr; + for (int idx = 0; idx < chunk.size(); ++idx) { + double elem = *ptr_f * 2147483648.; + // *new_ptr = std::clamp(elem, INT32_MIN, INT32_MAX); + if (elem > INT32_MAX) { + *new_ptr = INT32_MAX; + } else if (elem < INT32_MIN) { + *new_ptr = INT32_MIN; + } else { *new_ptr = elem; } + } + break; + } + //case c10::ScalarType::Int: { + case 5: { + break; + } + // case short + case 3: { + int16_t* ptr_s = (int16_t*) ptr; + for (int idx = 0; idx < chunk.size(); ++idx) { + *new_ptr = *ptr_s * 65536; + } + break; + } + // case byte + case 1: { + int8_t* ptr_b = (int8_t*) ptr; + for (int idx = 0; idx < chunk.size(); ++idx) { + *new_ptr = (*ptr_b - 128) * 16777216; + } + break; + } + default: + throw std::runtime_error("Unexpected dtype."); + } + // Write to buffer + memcpy(obuf, (int*)new_info.ptr, *osamp * 4); + priv->index += *osamp; + return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS; +} + +/// Callback function to fetch data from SoxEffectChain. 
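// SoX flow callbacks receive `*isamp` samples in `ibuf` and report how many
// samples they pass downstream via `*osamp`. The two output handlers below sit
// at the end of the chain, so they set `*osamp = 0`: tensor_output_flow appends
// the raw sox_sample_t values to a std::vector (later turned into a py::array
// by convert_to_tensor), while file_output_flow writes them directly to the
// opened sox_format_t and surfaces any sox_errno as a runtime_error.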
+int tensor_output_flow( + sox_effect_t* effp, + sox_sample_t const* ibuf, + sox_sample_t* obuf LSX_UNUSED, + size_t* isamp, + size_t* osamp) { + *osamp = 0; + // Get output buffer + auto out_buffer = static_cast(effp->priv)->buffer; + // Append at the end + out_buffer->insert(out_buffer->end(), ibuf, ibuf + *isamp); + return SOX_SUCCESS; +} + +int file_output_flow( + sox_effect_t* effp, + sox_sample_t const* ibuf, + sox_sample_t* obuf LSX_UNUSED, + size_t* isamp, + size_t* osamp) { + *osamp = 0; + if (*isamp) { + auto sf = static_cast(effp->priv)->sf; + if (sox_write(sf, ibuf, *isamp) != *isamp) { + if (sf->sox_errno) { + std::ostringstream stream; + stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " " + << sf->filename; + throw std::runtime_error(stream.str()); + } + return SOX_EOF; + } + } + return SOX_SUCCESS; +} + +sox_effect_handler_t* get_tensor_input_handler() { + static sox_effect_handler_t handler{ + /*name=*/"input_tensor", + /*usage=*/NULL, + /*flags=*/SOX_EFF_MCHAN, + /*getopts=*/NULL, + /*start=*/NULL, + /*flow=*/NULL, + /*drain=*/tensor_input_drain, + /*stop=*/NULL, + /*kill=*/NULL, + /*priv_size=*/sizeof(TensorInputPriv)}; + return &handler; +} + +sox_effect_handler_t* get_tensor_output_handler() { + static sox_effect_handler_t handler{ + /*name=*/"output_tensor", + /*usage=*/NULL, + /*flags=*/SOX_EFF_MCHAN, + /*getopts=*/NULL, + /*start=*/NULL, + /*flow=*/tensor_output_flow, + /*drain=*/NULL, + /*stop=*/NULL, + /*kill=*/NULL, + /*priv_size=*/sizeof(TensorOutputPriv)}; + return &handler; +} + +sox_effect_handler_t* get_file_output_handler() { + static sox_effect_handler_t handler{ + /*name=*/"output_file", + /*usage=*/NULL, + /*flags=*/SOX_EFF_MCHAN, + /*getopts=*/NULL, + /*start=*/NULL, + /*flow=*/file_output_flow, + /*drain=*/NULL, + /*stop=*/NULL, + /*kill=*/NULL, + /*priv_size=*/sizeof(FileOutputPriv)}; + return &handler; +} + +} // namespace + +SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {} + +SoxEffect::~SoxEffect() { + if (se_ != nullptr) { + free(se_); + } +} + +SoxEffect::operator sox_effect_t*() const { + return se_; +} + +auto SoxEffect::operator->() noexcept -> sox_effect_t* { + return se_; +} + +SoxEffectsChain::SoxEffectsChain( + sox_encodinginfo_t input_encoding, + sox_encodinginfo_t output_encoding) + : in_enc_(input_encoding), + out_enc_(output_encoding), + in_sig_(), + interm_sig_(), + out_sig_(), + sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) { + if (!sec_) { + throw std::runtime_error("Failed to create effect chain."); + } +} + +SoxEffectsChain::~SoxEffectsChain() { + if (sec_ != nullptr) { + sox_delete_effects_chain(sec_); + } +} + +void SoxEffectsChain::run() { + sox_flow_effects(sec_, NULL, NULL); +} + +void SoxEffectsChain::addInputTensor( + py::array* waveform, + int64_t sample_rate, + bool channels_first) { + in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first); + interm_sig_ = in_sig_; + SoxEffect e(sox_create_effect(get_tensor_input_handler())); + auto priv = static_cast(e->priv); + priv->index = 0; + priv->waveform = waveform; + priv->sample_rate = sample_rate; + priv->channels_first = channels_first; + if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { + throw std::runtime_error( + "Internal Error: Failed to add effect: input_tensor"); + } +} + +void SoxEffectsChain::addOutputBuffer( + std::vector* output_buffer) { + SoxEffect e(sox_create_effect(get_tensor_output_handler())); + static_cast(e->priv)->buffer = output_buffer; + if (sox_add_effect(sec_, e, &interm_sig_, 
&in_sig_) != SOX_SUCCESS) { + throw std::runtime_error( + "Internal Error: Failed to add effect: output_tensor"); + } +} + +void SoxEffectsChain::addInputFile(sox_format_t* sf) { + in_sig_ = sf->signal; + interm_sig_ = in_sig_; + SoxEffect e(sox_create_effect(sox_find_effect("input"))); + char* opts[] = {(char*)sf}; + sox_effect_options(e, 1, opts); + if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { + std::ostringstream stream; + stream << "Internal Error: Failed to add effect: input " << sf->filename; + throw std::runtime_error(stream.str()); + } +} + +void SoxEffectsChain::addOutputFile(sox_format_t* sf) { + out_sig_ = sf->signal; + SoxEffect e(sox_create_effect(get_file_output_handler())); + static_cast(e->priv)->sf = sf; + if (sox_add_effect(sec_, e, &interm_sig_, &out_sig_) != SOX_SUCCESS) { + std::ostringstream stream; + stream << "Internal Error: Failed to add effect: output " << sf->filename; + throw std::runtime_error(stream.str()); + } +} + +void SoxEffectsChain::addEffect(const std::vector effect) { + const auto num_args = effect.size(); + if (num_args == 0) { + throw std::runtime_error("Invalid argument: empty effect."); + } + const auto name = effect[0]; + if (UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end()) { + std::ostringstream stream; + stream << "Unsupported effect: " << name; + throw std::runtime_error(stream.str()); + } + + auto returned_effect = sox_find_effect(name.c_str()); + if (!returned_effect) { + std::ostringstream stream; + stream << "Unsupported effect: " << name; + throw std::runtime_error(stream.str()); + } + SoxEffect e(sox_create_effect(returned_effect)); + const auto num_options = num_args - 1; + + std::vector opts; + for (size_t i = 1; i < num_args; ++i) { + opts.push_back((char*)effect[i].c_str()); + } + if (sox_effect_options(e, num_options, num_options ? 
opts.data() : nullptr) != + SOX_SUCCESS) { + std::ostringstream stream; + stream << "Invalid effect option:"; + for (const auto& v : effect) { + stream << " " << v; + } + throw std::runtime_error(stream.str()); + } + + if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { + std::ostringstream stream; + stream << "Internal Error: Failed to add effect: \"" << name; + for (size_t i = 1; i < num_args; ++i) { + stream << " " << effect[i]; + } + stream << "\""; + throw std::runtime_error(stream.str()); + } +} + +int64_t SoxEffectsChain::getOutputNumChannels() { + return interm_sig_.channels; +} + +int64_t SoxEffectsChain::getOutputSampleRate() { + return interm_sig_.rate; +} + +namespace { + /// helper classes for passing file-like object to SoxEffectChain struct FileObjInputPriv { sox_format_t* sf; diff --git a/paddlespeech/audio/src/pybind/sox/effects_chain.h b/paddlespeech/audio/src/pybind/sox/effects_chain.h index 3de0161e3..6fb994b5a 100644 --- a/paddlespeech/audio/src/pybind/sox/effects_chain.h +++ b/paddlespeech/audio/src/pybind/sox/effects_chain.h @@ -1,9 +1,60 @@ #pragma once -#include "paddlespeech/audio/src/sox/effects_chain.h" +#include +#include "paddlespeech/audio/src/pybind/sox/utils.h" namespace paddleaudio::sox_effects_chain { +// Helper struct to safely close sox_effect_t* pointer returned by +// sox_create_effect + +struct SoxEffect { + explicit SoxEffect(sox_effect_t* se) noexcept; + SoxEffect(const SoxEffect& other) = delete; + SoxEffect(const SoxEffect&& other) = delete; + auto operator=(const SoxEffect& other) -> SoxEffect& = delete; + auto operator=(SoxEffect&& other) -> SoxEffect& = delete; + ~SoxEffect(); + operator sox_effect_t*() const; + auto operator->() noexcept -> sox_effect_t*; + + private: + sox_effect_t* se_; +}; + +// Helper struct to safely close sox_effects_chain_t with handy methods +class SoxEffectsChain { + const sox_encodinginfo_t in_enc_; + const sox_encodinginfo_t out_enc_; + + protected: + sox_signalinfo_t in_sig_; + sox_signalinfo_t interm_sig_; + sox_signalinfo_t out_sig_; + sox_effects_chain_t* sec_; + + public: + explicit SoxEffectsChain( + sox_encodinginfo_t input_encoding, + sox_encodinginfo_t output_encoding); + SoxEffectsChain(const SoxEffectsChain& other) = delete; + SoxEffectsChain(const SoxEffectsChain&& other) = delete; + SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete; + SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete; + ~SoxEffectsChain(); + void run(); + void addInputTensor( + py::array* waveform, + int64_t sample_rate, + bool channels_first); + void addInputFile(sox_format_t* sf); + void addOutputBuffer(std::vector* output_buffer); + void addOutputFile(sox_format_t* sf); + void addEffect(const std::vector effect); + int64_t getOutputNumChannels(); + int64_t getOutputSampleRate(); +}; + class SoxEffectsChainPyBind : public SoxEffectsChain { using SoxEffectsChain::SoxEffectsChain; diff --git a/paddlespeech/audio/src/pybind/sox/io.cpp b/paddlespeech/audio/src/pybind/sox/io.cpp index 6e3230f27..4c27e6aab 100644 --- a/paddlespeech/audio/src/pybind/sox/io.cpp +++ b/paddlespeech/audio/src/pybind/sox/io.cpp @@ -3,14 +3,11 @@ #include "paddlespeech/audio/src/pybind/sox/io.h" #include "paddlespeech/audio/src/pybind/sox/effects.h" +#include "paddlespeech/audio/src/pybind/sox/types.h" #include "paddlespeech/audio/src/pybind/sox/effects_chain.h" #include "paddlespeech/audio/src/pybind/sox/utils.h" #include "paddlespeech/audio/src/optional/optional.hpp" -#include "paddlespeech/audio/src/sox/io.h" 
-#include "paddlespeech/audio/src/sox/types.h" -#include "paddlespeech/audio/src/sox/utils.h" - using namespace paddleaudio::sox_utils; namespace paddleaudio { @@ -108,6 +105,73 @@ tl::optional> load_audio_fileobj( std::move(fileobj), effects, normalize, channels_first, std::move(format)); } +tl::optional> load_audio_file( + const std::string& path, + const tl::optional& frame_offset, + const tl::optional& num_frames, + tl::optional normalize, + tl::optional channels_first, + const tl::optional& format) { + auto effects = get_effects(frame_offset, num_frames); + return paddleaudio::sox_effects::apply_effects_file( + path, effects, normalize, channels_first, format); +} + +void save_audio_file(const std::string& path, + py::array tensor, + int64_t sample_rate, + bool channels_first, + tl::optional compression, + tl::optional format, + tl::optional encoding, + tl::optional bits_per_sample) { + validate_input_tensor(tensor); + + const auto filetype = [&]() { + if (format.has_value()) return format.value(); + return get_filetype(path); + }(); + + if (filetype == "amr-nb") { + const auto num_channels = tensor.shape(channels_first ? 0 : 1); + //TORCH_CHECK(num_channels == 1, + // "amr-nb format only supports single channel audio."); + } else if (filetype == "htk") { + const auto num_channels = tensor.shape(channels_first ? 0 : 1); + // TORCH_CHECK(num_channels == 1, + // "htk format only supports single channel audio."); + } else if (filetype == "gsm") { + const auto num_channels = tensor.shape(channels_first ? 0 : 1); + //TORCH_CHECK(num_channels == 1, + // "gsm format only supports single channel audio."); + //TORCH_CHECK(sample_rate == 8000, + // "gsm format only supports a sampling rate of 8kHz."); + } + const auto signal_info = + get_signalinfo(&tensor, sample_rate, filetype, channels_first); + const auto encoding_info = get_encodinginfo_for_save( + filetype, tensor.dtype(), compression, encoding, bits_per_sample); + + SoxFormat sf(sox_open_write(path.c_str(), + &signal_info, + &encoding_info, + /*filetype=*/filetype.c_str(), + /*oob=*/nullptr, + /*overwrite_permitted=*/nullptr)); + + if (static_cast(sf) == nullptr) { + throw std::runtime_error( + "Error saving audio file: failed to open file " + path); + } + + paddleaudio::sox_effects_chain::SoxEffectsChain chain( + /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()), + /*output_encoding=*/sf->encoding); + chain.addInputTensor(&tensor, sample_rate, channels_first); + chain.addOutputFile(sf); + chain.run(); +} + namespace { // helper class to automatically release buffer, to be used by // save_audio_fileobj diff --git a/paddlespeech/audio/src/pybind/sox/io.h b/paddlespeech/audio/src/pybind/sox/io.h index ca03b5db3..94ce18f22 100644 --- a/paddlespeech/audio/src/pybind/sox/io.h +++ b/paddlespeech/audio/src/pybind/sox/io.h @@ -16,14 +16,13 @@ auto get_info_file(const std::string &path, const std::string &format) auto get_info_fileobj(py::object fileobj, const std::string &format) -> std::tuple; -auto load_audio_fileobj( +tl::optional> load_audio_fileobj( py::object fileobj, - tl::optional frame_offset, - tl::optional num_frames, + const tl::optional& frame_offset, + const tl::optional& num_frames, tl::optional normalize, tl::optional channels_first, - tl::optional format) - -> tl::optional>; + const tl::optional& format); void save_audio_fileobj( py::object fileobj, @@ -35,5 +34,28 @@ void save_audio_fileobj( tl::optional encoding, tl::optional bits_per_sample); +auto get_effects(const tl::optional& frame_offset, + const tl::optional& 
num_frames) + -> std::vector>; + + +tl::optional> load_audio_file( + const std::string& path, + const tl::optional& frame_offset, + const tl::optional& num_frames, + tl::optional normalize, + tl::optional channels_first, + const tl::optional& format); + +void save_audio_file(const std::string& path, + py::array tensor, + int64_t sample_rate, + bool channels_first, + tl::optional compression, + tl::optional format, + tl::optional encoding, + tl::optional bits_per_sample); + + } // namespace paddleaudio } // namespace sox_io diff --git a/paddlespeech/audio/src/pybind/sox/types.cpp b/paddlespeech/audio/src/pybind/sox/types.cpp new file mode 100644 index 000000000..8e3e61373 --- /dev/null +++ b/paddlespeech/audio/src/pybind/sox/types.cpp @@ -0,0 +1,143 @@ +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp + +#include "paddlespeech/audio/src/pybind/sox/types.h" +#include +#include + +namespace paddleaudio { +namespace sox_utils { + +Format get_format_from_string(const std::string& format) { + if (format == "wav") + return Format::WAV; + if (format == "mp3") + return Format::MP3; + if (format == "flac") + return Format::FLAC; + if (format == "ogg" || format == "vorbis") + return Format::VORBIS; + if (format == "amr-nb") + return Format::AMR_NB; + if (format == "amr-wb") + return Format::AMR_WB; + if (format == "amb") + return Format::AMB; + if (format == "sph") + return Format::SPHERE; + if (format == "htk") + return Format::HTK; + if (format == "gsm") + return Format::GSM; + std::ostringstream stream; + stream << "Internal Error: unexpected format value: " << format; + throw std::runtime_error(stream.str()); +} + +std::string to_string(Encoding v) { + switch (v) { + case Encoding::UNKNOWN: + return "UNKNOWN"; + case Encoding::PCM_SIGNED: + return "PCM_S"; + case Encoding::PCM_UNSIGNED: + return "PCM_U"; + case Encoding::PCM_FLOAT: + return "PCM_F"; + case Encoding::FLAC: + return "FLAC"; + case Encoding::ULAW: + return "ULAW"; + case Encoding::ALAW: + return "ALAW"; + case Encoding::MP3: + return "MP3"; + case Encoding::VORBIS: + return "VORBIS"; + case Encoding::AMR_WB: + return "AMR_WB"; + case Encoding::AMR_NB: + return "AMR_NB"; + case Encoding::OPUS: + return "OPUS"; + default: + throw std::runtime_error("Internal Error: unexpected encoding."); + } +} + +Encoding get_encoding_from_option(const tl::optional encoding) { + if (!encoding.has_value()) + return Encoding::NOT_PROVIDED; + std::string v = encoding.value(); + if (v == "PCM_S") + return Encoding::PCM_SIGNED; + if (v == "PCM_U") + return Encoding::PCM_UNSIGNED; + if (v == "PCM_F") + return Encoding::PCM_FLOAT; + if (v == "ULAW") + return Encoding::ULAW; + if (v == "ALAW") + return Encoding::ALAW; + std::ostringstream stream; + stream << "Internal Error: unexpected encoding value: " << v; + throw std::runtime_error(stream.str()); +} + +BitDepth get_bit_depth_from_option(const tl::optional bit_depth) { + if (!bit_depth.has_value()) + return BitDepth::NOT_PROVIDED; + int64_t v = bit_depth.value(); + switch (v) { + case 8: + return BitDepth::B8; + case 16: + return BitDepth::B16; + case 24: + return BitDepth::B24; + case 32: + return BitDepth::B32; + case 64: + return BitDepth::B64; + default: { + std::ostringstream s; + s << "Internal Error: unexpected bit depth value: " << v; + throw std::runtime_error(s.str()); + } + } +} + +std::string get_encoding(sox_encoding_t encoding) { + switch (encoding) { + case SOX_ENCODING_UNKNOWN: + return "UNKNOWN"; + case SOX_ENCODING_SIGN2: + return "PCM_S"; + case 
SOX_ENCODING_UNSIGNED: + return "PCM_U"; + case SOX_ENCODING_FLOAT: + return "PCM_F"; + case SOX_ENCODING_FLAC: + return "FLAC"; + case SOX_ENCODING_ULAW: + return "ULAW"; + case SOX_ENCODING_ALAW: + return "ALAW"; + case SOX_ENCODING_MP3: + return "MP3"; + case SOX_ENCODING_VORBIS: + return "VORBIS"; + case SOX_ENCODING_AMR_WB: + return "AMR_WB"; + case SOX_ENCODING_AMR_NB: + return "AMR_NB"; + case SOX_ENCODING_OPUS: + return "OPUS"; + case SOX_ENCODING_GSM: + return "GSM"; + default: + return "UNKNOWN"; + } +} + +} // namespace sox_utils +} // namespace paddleaudio diff --git a/paddlespeech/audio/src/pybind/sox/types.h b/paddlespeech/audio/src/pybind/sox/types.h new file mode 100644 index 000000000..824c0f632 --- /dev/null +++ b/paddlespeech/audio/src/pybind/sox/types.h @@ -0,0 +1,58 @@ +//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h +#pragma once + +#include +#include "paddlespeech/audio/src/optional/optional.hpp" + +namespace paddleaudio { +namespace sox_utils { + +enum class Format { + WAV, + MP3, + FLAC, + VORBIS, + AMR_NB, + AMR_WB, + AMB, + SPHERE, + GSM, + HTK, +}; + +Format get_format_from_string(const std::string& format); + +enum class Encoding { + NOT_PROVIDED, + UNKNOWN, + PCM_SIGNED, + PCM_UNSIGNED, + PCM_FLOAT, + FLAC, + ULAW, + ALAW, + MP3, + VORBIS, + AMR_WB, + AMR_NB, + OPUS, +}; + +std::string to_string(Encoding v); +Encoding get_encoding_from_option(const tl::optional encoding); + +enum class BitDepth : unsigned { + NOT_PROVIDED = 0, + B8 = 8, + B16 = 16, + B24 = 24, + B32 = 32, + B64 = 64, +}; + +BitDepth get_bit_depth_from_option(const tl::optional bit_depth); + +std::string get_encoding(sox_encoding_t encoding); + +} // namespace sox_utils +} // namespace torchaudio \ No newline at end of file diff --git a/paddlespeech/audio/src/pybind/sox/utils.cpp b/paddlespeech/audio/src/pybind/sox/utils.cpp index 24a2817d2..a930f8cdd 100644 --- a/paddlespeech/audio/src/pybind/sox/utils.cpp +++ b/paddlespeech/audio/src/pybind/sox/utils.cpp @@ -1,7 +1,9 @@ // Copyright (c) 2017 Facebook Inc. (Soumith Chintala), // All rights reserved. +#include #include "paddlespeech/audio/src/pybind/sox/utils.h" +#include "paddlespeech/audio/src/pybind/sox/types.h" #include @@ -35,6 +37,485 @@ auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer) return num_read; } + +void set_seed(const int64_t seed) { + sox_get_globals()->ranqd1 = static_cast(seed); +} + +void set_verbosity(const int64_t verbosity) { + sox_get_globals()->verbosity = static_cast(verbosity); +} + +void set_use_threads(const bool use_threads) { + sox_get_globals()->use_threads = static_cast(use_threads); +} + +void set_buffer_size(const int64_t buffer_size) { + sox_get_globals()->bufsiz = static_cast(buffer_size); +} + +int64_t get_buffer_size() { + return sox_get_globals()->bufsiz; +} + +std::vector> list_effects() { + std::vector> effects; + for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) { + const sox_effect_handler_t* handler = (*fns)(); + if (handler && handler->name) { + if (UNSUPPORTED_EFFECTS.find(handler->name) == + UNSUPPORTED_EFFECTS.end()) { + effects.emplace_back(std::vector{ + handler->name, + handler->usage ? 
std::string(handler->usage) : std::string("")}); + } + } + } + return effects; +} + +std::vector list_write_formats() { + std::vector formats; + for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { + const sox_format_handler_t* handler = fns->fn(); + for (const char* const* names = handler->names; *names; ++names) { + if (!strchr(*names, '/') && handler->write) + formats.emplace_back(*names); + } + } + return formats; +} + +std::vector list_read_formats() { + std::vector formats; + for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { + const sox_format_handler_t* handler = fns->fn(); + for (const char* const* names = handler->names; *names; ++names) { + if (!strchr(*names, '/') && handler->read) + formats.emplace_back(*names); + } + } + return formats; +} + +SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {} +SoxFormat::~SoxFormat() { + close(); +} + +sox_format_t* SoxFormat::operator->() const noexcept { + return fd_; +} +SoxFormat::operator sox_format_t*() const noexcept { + return fd_; +} + +void SoxFormat::close() { + if (fd_ != nullptr) { + sox_close(fd_); + fd_ = nullptr; + } +} + +void validate_input_file(const SoxFormat& sf, const std::string& path) { + if (static_cast(sf) == nullptr) { + throw std::runtime_error( + "Error loading audio file: failed to open file " + path); + } + if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) { + throw std::runtime_error("Error loading audio file: unknown encoding."); + } +} + +void validate_input_memfile(const SoxFormat &sf) { + return validate_input_file(sf, ""); +} + +void validate_input_tensor(const py::array tensor) { + if (tensor.ndim() != 2) { + throw std::runtime_error("Input tensor has to be 2D."); + } + + char dtype = tensor.dtype().char_(); + bool flag = (dtype == 'f') || (dtype == 'd') || (dtype == 'l') || (dtype == 'i'); + if (flag == false) { + throw std::runtime_error( + "Input tensor has to be one of float32, int32, int16 or uint8 type."); + } +} + +py::dtype get_dtype( + const sox_encoding_t encoding, + const unsigned precision) { + switch (encoding) { + case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV + return py::dtype('u1'); + case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV + switch (precision) { + case 16: + return py::dtype("i2"); + case 24: // Cast 24-bit to 32-bit. + case 32: + return py::dtype('i'); + default: + throw std::runtime_error( + "Only 16, 24, and 32 bits are supported for signed PCM."); + } + default: + // default to float32 for the other formats, including + // 32-bit flaoting-point WAV, + // MP3, + // FLAC, + // VORBIS etc... 
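// Note: the dtype codes here follow NumPy's convention ("u1" = uint8,
// "i2" = int16, "i" = int32, "f" = float32), and the dtype.num() values
// switched on elsewhere in this file are NumPy type numbers
// (1 = byte, 3 = int16, 5 = int32, 11 = float32).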
+ return py::dtype("f"); + } +} + +py::array convert_to_tensor( + sox_sample_t* buffer, + const int32_t num_samples, + const int32_t num_channels, + const py::dtype dtype, + const bool normalize, + const bool channels_first) { + py::array t; + uint64_t dummy = 0; + SOX_SAMPLE_LOCALS; + if (normalize || dtype.char_() == 'f') { + t = py::array(dtype, {num_samples / num_channels, num_channels}); + auto ptr = (float*)t.mutable_data(0, 0); + for (int32_t i = 0; i < num_samples; ++i) { + ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy); + } + } else if (dtype.char_() == 'i') { + //t = torch::from_blob( + // buffer, {num_samples / num_channels, num_channels}, torch::kInt32) + // .clone(); + t = py::array(dtype, {num_samples / num_channels, num_channels}); + auto ptr = (int*)t.mutable_data(0, 0); + for (int32_t i = 0; i < num_samples; ++i) { + ptr[i] = buffer[i]; + } + } else if (dtype.char_() == 'h') { // int16 + t = py::array(dtype, {num_samples / num_channels, num_channels}); + auto ptr = (int16_t*)t.mutable_data(0, 0); + for (int32_t i = 0; i < num_samples; ++i) { + ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy); + } + } else if (dtype.char_() == 'b') { + //t = torch::empty({num_samples / num_channels, num_channels}, torch::kUInt8); + auto ptr = (uint8_t*)t.mutable_data(0,0); + for (int32_t i = 0; i < num_samples; ++i) { + ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy); + } + } else { + throw std::runtime_error("Unsupported dtype."); + } + return t; +} + +const std::string get_filetype(const std::string path) { + std::string ext = path.substr(path.find_last_of(".") + 1); + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + return ext; +} + +namespace { + +std::tuple get_save_encoding_for_wav( + const std::string format, + py::dtype dtype, + const Encoding& encoding, + const BitDepth& bits_per_sample) { + switch (encoding) { + case Encoding::NOT_PROVIDED: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + switch (dtype.num()) { + case 11: // float32 numpy dtype num + return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); + case 5: // int numpy dtype num + return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); + case 3: // int16 numpy + return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); + case 1: // byte numpy + return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); + default: + throw std::runtime_error("Internal Error: Unexpected dtype."); + } + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); + default: + return std::make_tuple<>( + SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); + } + case Encoding::PCM_SIGNED: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); + case BitDepth::B8: + throw std::runtime_error( + format + " does not support 8-bit signed PCM encoding."); + default: + return std::make_tuple<>( + SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); + } + case Encoding::PCM_UNSIGNED: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); + default: + throw std::runtime_error( + format + " only supports 8-bit for unsigned PCM encoding."); + } + case Encoding::PCM_FLOAT: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B32: + return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); + case BitDepth::B64: + return std::make_tuple<>(SOX_ENCODING_FLOAT, 64); + default: + throw std::runtime_error( + format + + " only supports 32-bit or 64-bit for floating-point PCM 
encoding."); + } + case Encoding::ULAW: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_ULAW, 8); + default: + throw std::runtime_error( + format + " only supports 8-bit for mu-law encoding."); + } + case Encoding::ALAW: + switch (bits_per_sample) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_ALAW, 8); + default: + throw std::runtime_error( + format + " only supports 8-bit for a-law encoding."); + } + default: + throw std::runtime_error( + format + " does not support encoding: " + to_string(encoding)); + } +} + +std::tuple get_save_encoding( + const std::string& format, + const py::dtype dtype, + const tl::optional encoding, + const tl::optional bits_per_sample) { + const Format fmt = get_format_from_string(format); + const Encoding enc = get_encoding_from_option(encoding); + const BitDepth bps = get_bit_depth_from_option(bits_per_sample); + + switch (fmt) { + case Format::WAV: + case Format::AMB: + return get_save_encoding_for_wav(format, dtype, enc, bps); + case Format::MP3: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("mp3 does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "mp3 does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_MP3, 16); + case Format::HTK: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("htk does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "htk does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); + case Format::VORBIS: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("vorbis does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "vorbis does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_VORBIS, 16); + case Format::AMR_NB: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("amr-nb does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "amr-nb does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16); + case Format::FLAC: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("flac does not support `encoding` option."); + switch (bps) { + case BitDepth::B32: + case BitDepth::B64: + throw std::runtime_error( + "flac does not support `bits_per_sample` larger than 24."); + default: + return std::make_tuple<>( + SOX_ENCODING_FLAC, static_cast(bps)); + } + case Format::SPHERE: + switch (enc) { + case Encoding::NOT_PROVIDED: + case Encoding::PCM_SIGNED: + switch (bps) { + case BitDepth::NOT_PROVIDED: + return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); + default: + return std::make_tuple<>( + SOX_ENCODING_SIGN2, static_cast(bps)); + } + case Encoding::PCM_UNSIGNED: + throw std::runtime_error( + "sph does not support unsigned integer PCM."); + case Encoding::PCM_FLOAT: + throw std::runtime_error("sph does not support floating point PCM."); + case Encoding::ULAW: + switch (bps) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_ULAW, 8); + default: + throw std::runtime_error( + "sph only supports 8-bit for mu-law encoding."); + } + case Encoding::ALAW: + switch (bps) { + case BitDepth::NOT_PROVIDED: + case BitDepth::B8: + return std::make_tuple<>(SOX_ENCODING_ALAW, 8); + default: + 
return std::make_tuple<>( + SOX_ENCODING_ALAW, static_cast(bps)); + } + default: + throw std::runtime_error( + "sph does not support encoding: " + encoding.value()); + } + case Format::GSM: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("gsm does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "gsm does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_GSM, 16); + + default: + throw std::runtime_error("Unsupported format: " + format); + } +} + +unsigned get_precision(const std::string filetype, py::dtype dtype) { + if (filetype == "mp3") + return SOX_UNSPEC; + if (filetype == "flac") + return 24; + if (filetype == "ogg" || filetype == "vorbis") + return SOX_UNSPEC; + if (filetype == "wav" || filetype == "amb") { + switch (dtype.num()) { + case 1: // byte in numpy dype num + return 8; + case 3: // short, in numpy dtype num + return 16; + case 5: // int, numpy dtype + return 32; + case 11: // float, numpy dtype + return 32; + default: + throw std::runtime_error("Unsupported dtype."); + } + } + if (filetype == "sph") + return 32; + if (filetype == "amr-nb") { + return 16; + } + if (filetype == "gsm") { + return 16; + } + if (filetype == "htk") { + return 16; + } + throw std::runtime_error("Unsupported file type: " + filetype); +} + +} // namespace + +sox_signalinfo_t get_signalinfo( + const py::array* waveform, + const int64_t sample_rate, + const std::string filetype, + const bool channels_first) { + return sox_signalinfo_t{ + /*rate=*/static_cast(sample_rate), + /*channels=*/ + static_cast(waveform->shape(channels_first ? 0 : 1)), + /*precision=*/get_precision(filetype, waveform->dtype()), + /*length=*/static_cast(waveform->size())}; +} + +sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) { + sox_encoding_t encoding = [&]() { + switch (dtype.num()) { + case 1: // byte + return SOX_ENCODING_UNSIGNED; + case 3: // short + return SOX_ENCODING_SIGN2; + case 5: // int32 + return SOX_ENCODING_SIGN2; + case 11: // float + return SOX_ENCODING_FLOAT; + default: + throw std::runtime_error("Unsupported dtype."); + } + }(); + unsigned bits_per_sample = [&]() { + switch (dtype.num()) { + case 1: // byte + return 8; + case 3: //short + return 16; + case 5: // int32 + return 32; + case 11: // float + return 32; + default: + throw std::runtime_error("Unsupported dtype."); + } + }(); + return sox_encodinginfo_t{ + /*encoding=*/encoding, + /*bits_per_sample=*/bits_per_sample, + /*compression=*/HUGE_VAL, + /*reverse_bytes=*/sox_option_default, + /*reverse_nibbles=*/sox_option_default, + /*reverse_bits=*/sox_option_default, + /*opposite_endian=*/sox_false}; +} + +sox_encodinginfo_t get_encodinginfo_for_save( + const std::string& format, + const py::dtype dtype, + const tl::optional compression, + const tl::optional encoding, + const tl::optional bits_per_sample) { + auto enc = get_save_encoding(format, dtype, encoding, bits_per_sample); + return sox_encodinginfo_t{ + /*encoding=*/std::get<0>(enc), + /*bits_per_sample=*/std::get<1>(enc), + /*compression=*/compression.value_or(HUGE_VAL), + /*reverse_bytes=*/sox_option_default, + /*reverse_nibbles=*/sox_option_default, + /*reverse_bits=*/sox_option_default, + /*opposite_endian=*/sox_false}; +} + + /* SoxFormat::SoxFormat(sox_format_t *fd) noexcept : fd_(fd) {} SoxFormat::~SoxFormat() { close(); } diff --git a/paddlespeech/audio/src/pybind/sox/utils.h b/paddlespeech/audio/src/pybind/sox/utils.h index fa931b1a9..65223bc0c 100644 --- 
a/paddlespeech/audio/src/pybind/sox/utils.h +++ b/paddlespeech/audio/src/pybind/sox/utils.h @@ -7,8 +7,6 @@ #include #include #include "paddlespeech/audio/src/optional/optional.hpp" -#include "paddlespeech/audio/src/sox/utils.h" -#include "paddlespeech/audio/src/sox/types.h" namespace py = pybind11; @@ -17,5 +15,102 @@ namespace sox_utils { auto read_fileobj(py::object *fileobj, uint64_t size, char *buffer) -> uint64_t; +void set_seed(const int64_t seed); + +void set_verbosity(const int64_t verbosity); + +void set_use_threads(const bool use_threads); + +void set_buffer_size(const int64_t buffer_size); + +int64_t get_buffer_size(); + +std::vector> list_effects(); + +std::vector list_read_formats(); + +std::vector list_write_formats(); + +//////////////////////////////////////////////////////////////////////////////// +// Utilities for sox_io / sox_effects implementations +//////////////////////////////////////////////////////////////////////////////// + +const std::unordered_set UNSUPPORTED_EFFECTS = +    {"input", "output", "spectrogram", "noiseprof", "noisered", "splice"}; + +/// helper class to automatically close sox_format_t* +struct SoxFormat { + explicit SoxFormat(sox_format_t* fd) noexcept; + SoxFormat(const SoxFormat& other) = delete; + SoxFormat(SoxFormat&& other) = delete; + SoxFormat& operator=(const SoxFormat& other) = delete; + SoxFormat& operator=(SoxFormat&& other) = delete; + ~SoxFormat(); + sox_format_t* operator->() const noexcept; + operator sox_format_t*() const noexcept; + + void close(); + + private: + sox_format_t* fd_; +}; + +/// +/// Verify that input Tensor is 2D, CPU and either uint8, int16, int32 or float32 +void validate_input_tensor(const py::array); + +void validate_input_file(const SoxFormat& sf, const std::string& path); + +void validate_input_memfile(const SoxFormat &sf); +/// +/// Get target dtype for the given encoding and precision. +py::dtype get_dtype( + const sox_encoding_t encoding, + const unsigned precision); + +/// +/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor +/// NOTE: This function might modify the values in the input buffer to +/// reduce the number of memory copies. +/// @param buffer Pointer to buffer that contains audio data. +/// @param num_samples The number of samples to read. +/// @param num_channels The number of channels. Used to reshape the resulting +/// Tensor. +/// @param dtype Target dtype. Determines the output dtype and value range in +/// conjunction with normalization. +/// @param normalize Perform normalization. Only effective when dtype is not +/// kFloat32. When effective, the output tensor is kFloat32 type and value range +/// is [-1.0, 1.0] +/// @param channels_first When True, output Tensor has shape of [num_channels, +/// num_frames]. +py::array convert_to_tensor( + sox_sample_t* buffer, + const int32_t num_samples, + const int32_t num_channels, + const py::dtype dtype, + const bool normalize, + const bool channels_first); + +/// Extract extension from file path +const std::string get_filetype(const std::string path); + +/// Get sox_signalinfo_t for passing a py::array object.
+sox_signalinfo_t get_signalinfo( + const py::array* waveform, + const int64_t sample_rate, + const std::string filetype, + const bool channels_first); + +/// Get sox_encodinginfo_t for Tensor I/O +sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype); + +/// Get sox_encodinginfo_t for saving to file/file object +sox_encodinginfo_t get_encodinginfo_for_save( + const std::string& format, + const py::dtype dtype, + const tl::optional compression, + const tl::optional encoding, + const tl::optional bits_per_sample); + } // namespace paddleaudio } // namespace sox_utils From 5e30f925175da9abb651834f223cda6104d511ad Mon Sep 17 00:00:00 2001 From: YangZhou Date: Fri, 29 Jul 2022 16:11:07 +0800 Subject: [PATCH 03/11] clean code --- paddlespeech/audio/src/sox/effects.cpp | 147 ------ paddlespeech/audio/src/sox/effects.h | 29 -- paddlespeech/audio/src/sox/effects_chain.cpp | 342 ------------- paddlespeech/audio/src/sox/effects_chain.h | 62 --- paddlespeech/audio/src/sox/io.cpp | 131 ----- paddlespeech/audio/src/sox/io.h | 41 -- paddlespeech/audio/src/sox/types.cpp | 143 ------ paddlespeech/audio/src/sox/types.h | 58 --- paddlespeech/audio/src/sox/utils.cpp | 488 ------------------- paddlespeech/audio/src/sox/utils.h | 120 ----- 10 files changed, 1561 deletions(-) delete mode 100644 paddlespeech/audio/src/sox/effects.cpp delete mode 100644 paddlespeech/audio/src/sox/effects.h delete mode 100644 paddlespeech/audio/src/sox/effects_chain.cpp delete mode 100644 paddlespeech/audio/src/sox/effects_chain.h delete mode 100644 paddlespeech/audio/src/sox/io.cpp delete mode 100644 paddlespeech/audio/src/sox/io.h delete mode 100644 paddlespeech/audio/src/sox/types.cpp delete mode 100644 paddlespeech/audio/src/sox/types.h delete mode 100644 paddlespeech/audio/src/sox/utils.cpp delete mode 100644 paddlespeech/audio/src/sox/utils.h diff --git a/paddlespeech/audio/src/sox/effects.cpp b/paddlespeech/audio/src/sox/effects.cpp deleted file mode 100644 index f2687f93f..000000000 --- a/paddlespeech/audio/src/sox/effects.cpp +++ /dev/null @@ -1,147 +0,0 @@ -//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp - -#include -#include - -#include "paddlespeech/audio/src/sox/effects.h" -#include "paddlespeech/audio/src/sox/effects_chain.h" -#include "paddlespeech/audio/src/sox/utils.h" - -using namespace paddleaudio::sox_utils; - -namespace paddleaudio::sox_effects { - -namespace { - -enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown }; -SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized; -std::mutex SOX_RESOUCE_STATE_MUTEX; - -} // namespace - -void initialize_sox_effects() { - const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); - - switch (SOX_RESOURCE_STATE) { - case NotInitialized: - if (sox_init() != SOX_SUCCESS) { - throw std::runtime_error("Failed to initialize sox effects."); - }; - SOX_RESOURCE_STATE = Initialized; - break; - case Initialized: - break; - case ShutDown: - throw std::runtime_error( - "SoX Effects has been shut down. Cannot initialize again."); - } -}; - -void shutdown_sox_effects() { - const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); - - switch (SOX_RESOURCE_STATE) { - case NotInitialized: - throw std::runtime_error( - "SoX Effects is not initialized. 
Cannot shutdown."); - case Initialized: - if (sox_quit() != SOX_SUCCESS) { - throw std::runtime_error("Failed to initialize sox effects."); - }; - SOX_RESOURCE_STATE = ShutDown; - break; - case ShutDown: - break; - } -} - -auto apply_effects_tensor( - py::array waveform, - int64_t sample_rate, - const std::vector>& effects, - bool channels_first) -> std::tuple { - validate_input_tensor(waveform); - - // Create SoxEffectsChain - const auto dtype = waveform.dtype(); - paddleaudio::sox_effects_chain::SoxEffectsChain chain( - /*input_encoding=*/get_tensor_encodinginfo(dtype), - /*output_encoding=*/get_tensor_encodinginfo(dtype)); - - // Prepare output buffer - std::vector out_buffer; - out_buffer.reserve(waveform.size()); - - // Build and run effects chain - chain.addInputTensor(&waveform, sample_rate, channels_first); - for (const auto& effect : effects) { - chain.addEffect(effect); - } - chain.addOutputBuffer(&out_buffer); - chain.run(); - - // Create tensor from buffer - auto out_tensor = convert_to_tensor( - /*buffer=*/out_buffer.data(), - /*num_samples=*/out_buffer.size(), - /*num_channels=*/chain.getOutputNumChannels(), - dtype, - /*normalize=*/false, - channels_first); - - return std::tuple( - out_tensor, chain.getOutputSampleRate()); -} - -auto apply_effects_file( - const std::string& path, - const std::vector>& effects, - tl::optional normalize, - tl::optional channels_first, - const tl::optional& format) - -> tl::optional> { - // Open input file - SoxFormat sf(sox_open_read( - path.c_str(), - /*signal=*/nullptr, - /*encoding=*/nullptr, - /*filetype=*/format.has_value() ? format.value().c_str() : nullptr)); - - if (static_cast(sf) == nullptr || - sf->encoding.encoding == SOX_ENCODING_UNKNOWN) { - return {}; - } - - const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision); - - // Prepare output - std::vector out_buffer; - out_buffer.reserve(sf->signal.length); - - // Create and run SoxEffectsChain - paddleaudio::sox_effects_chain::SoxEffectsChain chain( - /*input_encoding=*/sf->encoding, - /*output_encoding=*/get_tensor_encodinginfo(dtype)); - - chain.addInputFile(sf); - for (const auto& effect : effects) { - chain.addEffect(effect); - } - chain.addOutputBuffer(&out_buffer); - chain.run(); - - // Create tensor from buffer - bool channels_first_ = channels_first.value_or(true); - auto tensor = convert_to_tensor( - /*buffer=*/out_buffer.data(), - /*num_samples=*/out_buffer.size(), - /*num_channels=*/chain.getOutputNumChannels(), - dtype, - normalize.value_or(true), - channels_first_); - - return std::tuple( - tensor, chain.getOutputSampleRate()); -} - -} // namespace paddleaudio::sox_effects diff --git a/paddlespeech/audio/src/sox/effects.h b/paddlespeech/audio/src/sox/effects.h deleted file mode 100644 index 81db23b44..000000000 --- a/paddlespeech/audio/src/sox/effects.h +++ /dev/null @@ -1,29 +0,0 @@ -//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h -#pragma once - -#include -#include "paddlespeech/audio/src/sox/utils.h" - -namespace py = pybind11; - -namespace paddleaudio::sox_effects { - -void initialize_sox_effects(); - -void shutdown_sox_effects(); - -auto apply_effects_tensor( - py::array waveform, - int64_t sample_rate, - const std::vector>& effects, - bool channels_first) -> std::tuple; - -auto apply_effects_file( - const std::string& path, - const std::vector>& effects, - tl::optional normalize, - tl::optional channels_first, - const tl::optional& format) - -> tl::optional>; - -} // namespace torchaudio::sox_effects diff 
--git a/paddlespeech/audio/src/sox/effects_chain.cpp b/paddlespeech/audio/src/sox/effects_chain.cpp deleted file mode 100644 index 1b13fd186..000000000 --- a/paddlespeech/audio/src/sox/effects_chain.cpp +++ /dev/null @@ -1,342 +0,0 @@ -//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp - -#include "paddlespeech/audio/src/sox/effects_chain.h" -#include "paddlespeech/audio/src/sox/utils.h" - -using namespace paddleaudio::sox_utils; - -namespace paddleaudio { -namespace sox_effects_chain { - -namespace { - -/// helper classes for passing the location of input tensor and output buffer -/// -/// drain/flow callback functions require plaing C style function signature and -/// the way to pass extra data is to attach data to sox_effect_t::priv pointer. -/// The following structs will be assigned to sox_effect_t::priv pointer which -/// gives sox_effect_t an access to input Tensor and output buffer object. -struct TensorInputPriv { - size_t index; - py::array* waveform; - int64_t sample_rate; - bool channels_first; -}; - -struct TensorOutputPriv { - std::vector* buffer; -}; -struct FileOutputPriv { - sox_format_t* sf; -}; - -/// Callback function to feed Tensor data to SoxEffectChain. -int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { - // Retrieve the input Tensor and current index - auto priv = static_cast(effp->priv); - auto index = priv->index; - auto tensor = *(priv->waveform); - auto num_channels = effp->out_signal.channels; - - // Adjust the number of samples to read - const size_t num_samples = tensor.size(); - if (index + *osamp > num_samples) { - *osamp = num_samples - index; - } - // Ensure that it's a multiple of the number of channels - *osamp -= *osamp % num_channels; - - // Slice the input Tensor - // refacor this module, chunk - auto i_frame = index / num_channels; - auto num_frames = *osamp / num_channels; - py::array chunk(tensor.dtype(), {num_frames*num_channels}); - py::buffer_info ori_info = tensor.request(); - py::buffer_info info = chunk.request(); - char* ori_start_ptr = (char*)ori_info.ptr + index * chunk.itemsize() / sizeof(char); - std::memcpy(info.ptr, ori_start_ptr, chunk.nbytes()); - - py::dtype chunk_type = py::dtype("i"); // dtype int32 - py::array new_chunk = py::array(chunk_type, chunk.shape()); - py::buffer_info new_info = new_chunk.request(); - void* ptr = (void*) info.ptr; - int* new_ptr = (int*) new_info.ptr; - // Convert to sox_sample_t (int32_t) - switch (chunk.dtype().num()) { - //case c10::ScalarType::Float: { - case 11: { - // Need to convert to 64-bit precision so that - // values around INT32_MIN/MAX are handled correctly. 
- float* ptr_f = (float*)ptr; - for (int idx = 0; idx < chunk.size(); ++idx) { - double elem = *ptr_f * 2147483648.; - // *new_ptr = std::clamp(elem, INT32_MIN, INT32_MAX); - if (elem > INT32_MAX) { - *new_ptr = INT32_MAX; - } else if (elem < INT32_MIN) { - *new_ptr = INT32_MIN; - } else { *new_ptr = elem; } - } - break; - } - //case c10::ScalarType::Int: { - case 5: { - break; - } - // case short - case 3: { - int16_t* ptr_s = (int16_t*) ptr; - for (int idx = 0; idx < chunk.size(); ++idx) { - *new_ptr = *ptr_s * 65536; - } - break; - } - // case byte - case 1: { - int8_t* ptr_b = (int8_t*) ptr; - for (int idx = 0; idx < chunk.size(); ++idx) { - *new_ptr = (*ptr_b - 128) * 16777216; - } - break; - } - default: - throw std::runtime_error("Unexpected dtype."); - } - // Write to buffer - memcpy(obuf, (int*)new_info.ptr, *osamp * 4); - priv->index += *osamp; - return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS; -} - -/// Callback function to fetch data from SoxEffectChain. -int tensor_output_flow( - sox_effect_t* effp, - sox_sample_t const* ibuf, - sox_sample_t* obuf LSX_UNUSED, - size_t* isamp, - size_t* osamp) { - *osamp = 0; - // Get output buffer - auto out_buffer = static_cast(effp->priv)->buffer; - // Append at the end - out_buffer->insert(out_buffer->end(), ibuf, ibuf + *isamp); - return SOX_SUCCESS; -} - -int file_output_flow( - sox_effect_t* effp, - sox_sample_t const* ibuf, - sox_sample_t* obuf LSX_UNUSED, - size_t* isamp, - size_t* osamp) { - *osamp = 0; - if (*isamp) { - auto sf = static_cast(effp->priv)->sf; - if (sox_write(sf, ibuf, *isamp) != *isamp) { - if (sf->sox_errno) { - std::ostringstream stream; - stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " " - << sf->filename; - throw std::runtime_error(stream.str()); - } - return SOX_EOF; - } - } - return SOX_SUCCESS; -} - -sox_effect_handler_t* get_tensor_input_handler() { - static sox_effect_handler_t handler{ - /*name=*/"input_tensor", - /*usage=*/NULL, - /*flags=*/SOX_EFF_MCHAN, - /*getopts=*/NULL, - /*start=*/NULL, - /*flow=*/NULL, - /*drain=*/tensor_input_drain, - /*stop=*/NULL, - /*kill=*/NULL, - /*priv_size=*/sizeof(TensorInputPriv)}; - return &handler; -} - -sox_effect_handler_t* get_tensor_output_handler() { - static sox_effect_handler_t handler{ - /*name=*/"output_tensor", - /*usage=*/NULL, - /*flags=*/SOX_EFF_MCHAN, - /*getopts=*/NULL, - /*start=*/NULL, - /*flow=*/tensor_output_flow, - /*drain=*/NULL, - /*stop=*/NULL, - /*kill=*/NULL, - /*priv_size=*/sizeof(TensorOutputPriv)}; - return &handler; -} - -sox_effect_handler_t* get_file_output_handler() { - static sox_effect_handler_t handler{ - /*name=*/"output_file", - /*usage=*/NULL, - /*flags=*/SOX_EFF_MCHAN, - /*getopts=*/NULL, - /*start=*/NULL, - /*flow=*/file_output_flow, - /*drain=*/NULL, - /*stop=*/NULL, - /*kill=*/NULL, - /*priv_size=*/sizeof(FileOutputPriv)}; - return &handler; -} - -} // namespace - -SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {} - -SoxEffect::~SoxEffect() { - if (se_ != nullptr) { - free(se_); - } -} - -SoxEffect::operator sox_effect_t*() const { - return se_; -} - -auto SoxEffect::operator->() noexcept -> sox_effect_t* { - return se_; -} - -SoxEffectsChain::SoxEffectsChain( - sox_encodinginfo_t input_encoding, - sox_encodinginfo_t output_encoding) - : in_enc_(input_encoding), - out_enc_(output_encoding), - in_sig_(), - interm_sig_(), - out_sig_(), - sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) { - if (!sec_) { - throw std::runtime_error("Failed to create effect chain."); - } -} - 
-SoxEffectsChain::~SoxEffectsChain() { - if (sec_ != nullptr) { - sox_delete_effects_chain(sec_); - } -} - -void SoxEffectsChain::run() { - sox_flow_effects(sec_, NULL, NULL); -} - -void SoxEffectsChain::addInputTensor( - py::array* waveform, - int64_t sample_rate, - bool channels_first) { - in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first); - interm_sig_ = in_sig_; - SoxEffect e(sox_create_effect(get_tensor_input_handler())); - auto priv = static_cast(e->priv); - priv->index = 0; - priv->waveform = waveform; - priv->sample_rate = sample_rate; - priv->channels_first = channels_first; - if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { - throw std::runtime_error( - "Internal Error: Failed to add effect: input_tensor"); - } -} - -void SoxEffectsChain::addOutputBuffer( - std::vector* output_buffer) { - SoxEffect e(sox_create_effect(get_tensor_output_handler())); - static_cast(e->priv)->buffer = output_buffer; - if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { - throw std::runtime_error( - "Internal Error: Failed to add effect: output_tensor"); - } -} - -void SoxEffectsChain::addInputFile(sox_format_t* sf) { - in_sig_ = sf->signal; - interm_sig_ = in_sig_; - SoxEffect e(sox_create_effect(sox_find_effect("input"))); - char* opts[] = {(char*)sf}; - sox_effect_options(e, 1, opts); - if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { - std::ostringstream stream; - stream << "Internal Error: Failed to add effect: input " << sf->filename; - throw std::runtime_error(stream.str()); - } -} - -void SoxEffectsChain::addOutputFile(sox_format_t* sf) { - out_sig_ = sf->signal; - SoxEffect e(sox_create_effect(get_file_output_handler())); - static_cast(e->priv)->sf = sf; - if (sox_add_effect(sec_, e, &interm_sig_, &out_sig_) != SOX_SUCCESS) { - std::ostringstream stream; - stream << "Internal Error: Failed to add effect: output " << sf->filename; - throw std::runtime_error(stream.str()); - } -} - -void SoxEffectsChain::addEffect(const std::vector effect) { - const auto num_args = effect.size(); - if (num_args == 0) { - throw std::runtime_error("Invalid argument: empty effect."); - } - const auto name = effect[0]; - if (UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end()) { - std::ostringstream stream; - stream << "Unsupported effect: " << name; - throw std::runtime_error(stream.str()); - } - - auto returned_effect = sox_find_effect(name.c_str()); - if (!returned_effect) { - std::ostringstream stream; - stream << "Unsupported effect: " << name; - throw std::runtime_error(stream.str()); - } - SoxEffect e(sox_create_effect(returned_effect)); - const auto num_options = num_args - 1; - - std::vector opts; - for (size_t i = 1; i < num_args; ++i) { - opts.push_back((char*)effect[i].c_str()); - } - if (sox_effect_options(e, num_options, num_options ? 
opts.data() : nullptr) != - SOX_SUCCESS) { - std::ostringstream stream; - stream << "Invalid effect option:"; - for (const auto& v : effect) { - stream << " " << v; - } - throw std::runtime_error(stream.str()); - } - - if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) { - std::ostringstream stream; - stream << "Internal Error: Failed to add effect: \"" << name; - for (size_t i = 1; i < num_args; ++i) { - stream << " " << effect[i]; - } - stream << "\""; - throw std::runtime_error(stream.str()); - } -} - -int64_t SoxEffectsChain::getOutputNumChannels() { - return interm_sig_.channels; -} - -int64_t SoxEffectsChain::getOutputSampleRate() { - return interm_sig_.rate; -} - -} // namespace sox_effects_chain -} // namespace paddleaudio diff --git a/paddlespeech/audio/src/sox/effects_chain.h b/paddlespeech/audio/src/sox/effects_chain.h deleted file mode 100644 index 87a046975..000000000 --- a/paddlespeech/audio/src/sox/effects_chain.h +++ /dev/null @@ -1,62 +0,0 @@ -//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h -#pragma once - -#include -#include "paddlespeech/audio/src/sox/utils.h" - -namespace paddleaudio { -namespace sox_effects_chain { - -// Helper struct to safely close sox_effect_t* pointer returned by -// sox_create_effect - -struct SoxEffect { - explicit SoxEffect(sox_effect_t* se) noexcept; - SoxEffect(const SoxEffect& other) = delete; - SoxEffect(const SoxEffect&& other) = delete; - auto operator=(const SoxEffect& other) -> SoxEffect& = delete; - auto operator=(SoxEffect&& other) -> SoxEffect& = delete; - ~SoxEffect(); - operator sox_effect_t*() const; - auto operator->() noexcept -> sox_effect_t*; - - private: - sox_effect_t* se_; -}; - -// Helper struct to safely close sox_effects_chain_t with handy methods -class SoxEffectsChain { - const sox_encodinginfo_t in_enc_; - const sox_encodinginfo_t out_enc_; - - protected: - sox_signalinfo_t in_sig_; - sox_signalinfo_t interm_sig_; - sox_signalinfo_t out_sig_; - sox_effects_chain_t* sec_; - - public: - explicit SoxEffectsChain( - sox_encodinginfo_t input_encoding, - sox_encodinginfo_t output_encoding); - SoxEffectsChain(const SoxEffectsChain& other) = delete; - SoxEffectsChain(const SoxEffectsChain&& other) = delete; - SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete; - SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete; - ~SoxEffectsChain(); - void run(); - void addInputTensor( - py::array* waveform, - int64_t sample_rate, - bool channels_first); - void addInputFile(sox_format_t* sf); - void addOutputBuffer(std::vector* output_buffer); - void addOutputFile(sox_format_t* sf); - void addEffect(const std::vector effect); - int64_t getOutputNumChannels(); - int64_t getOutputSampleRate(); -}; - -} // namespace sox_effects_chain -} // namespace torchaudio - diff --git a/paddlespeech/audio/src/sox/io.cpp b/paddlespeech/audio/src/sox/io.cpp deleted file mode 100644 index 5a75fc987..000000000 --- a/paddlespeech/audio/src/sox/io.cpp +++ /dev/null @@ -1,131 +0,0 @@ -//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp -#include "paddlespeech/audio/src/sox/effects.h" -#include "paddlespeech/audio/src/sox/effects_chain.h" -#include "paddlespeech/audio/src/sox/io.h" -#include "paddlespeech/audio/src/sox/types.h" -#include "paddlespeech/audio/src/sox/utils.h" - -using namespace paddleaudio::sox_utils; - -namespace paddleaudio { -namespace sox_io { - -tl::optional get_info_file( - const std::string& path, const tl::optional& 
format) { - SoxFormat sf(sox_open_read( - path.c_str(), - /*signal=*/nullptr, - /*encoding=*/nullptr, - /*filetype=*/format.has_value() ? format.value().c_str() : nullptr)); - - if (static_cast(sf) == nullptr || - sf->encoding.encoding == SOX_ENCODING_UNKNOWN) { - return {}; - } - - return std::forward_as_tuple( - static_cast(sf->signal.rate), - static_cast(sf->signal.length / sf->signal.channels), - static_cast(sf->signal.channels), - static_cast(sf->encoding.bits_per_sample), - get_encoding(sf->encoding.encoding)); -} - -std::vector> get_effects( - const tl::optional& frame_offset, - const tl::optional& num_frames) { - const auto offset = frame_offset.value_or(0); - if (offset < 0) { - throw std::runtime_error( - "Invalid argument: frame_offset must be non-negative."); - } - const auto frames = num_frames.value_or(-1); - if (frames == 0 || frames < -1) { - throw std::runtime_error( - "Invalid argument: num_frames must be -1 or greater than 0."); - } - - std::vector> effects; - if (frames != -1) { - std::ostringstream os_offset, os_frames; - os_offset << offset << "s"; - os_frames << "+" << frames << "s"; - effects.emplace_back( - std::vector{"trim", os_offset.str(), os_frames.str()}); - } else if (offset != 0) { - std::ostringstream os_offset; - os_offset << offset << "s"; - effects.emplace_back(std::vector{"trim", os_offset.str()}); - } - return effects; -} - -tl::optional> load_audio_file( - const std::string& path, - const tl::optional& frame_offset, - const tl::optional& num_frames, - tl::optional normalize, - tl::optional channels_first, - const tl::optional& format) { - auto effects = get_effects(frame_offset, num_frames); - return paddleaudio::sox_effects::apply_effects_file( - path, effects, normalize, channels_first, format); -} - -void save_audio_file(const std::string& path, - py::array tensor, - int64_t sample_rate, - bool channels_first, - tl::optional compression, - tl::optional format, - tl::optional encoding, - tl::optional bits_per_sample) { - validate_input_tensor(tensor); - - const auto filetype = [&]() { - if (format.has_value()) return format.value(); - return get_filetype(path); - }(); - - if (filetype == "amr-nb") { - const auto num_channels = tensor.shape(channels_first ? 0 : 1); - //TORCH_CHECK(num_channels == 1, - // "amr-nb format only supports single channel audio."); - } else if (filetype == "htk") { - const auto num_channels = tensor.shape(channels_first ? 0 : 1); - // TORCH_CHECK(num_channels == 1, - // "htk format only supports single channel audio."); - } else if (filetype == "gsm") { - const auto num_channels = tensor.shape(channels_first ? 
0 : 1); - //TORCH_CHECK(num_channels == 1, - // "gsm format only supports single channel audio."); - //TORCH_CHECK(sample_rate == 8000, - // "gsm format only supports a sampling rate of 8kHz."); - } - const auto signal_info = - get_signalinfo(&tensor, sample_rate, filetype, channels_first); - const auto encoding_info = get_encodinginfo_for_save( - filetype, tensor.dtype(), compression, encoding, bits_per_sample); - - SoxFormat sf(sox_open_write(path.c_str(), - &signal_info, - &encoding_info, - /*filetype=*/filetype.c_str(), - /*oob=*/nullptr, - /*overwrite_permitted=*/nullptr)); - - if (static_cast(sf) == nullptr) { - throw std::runtime_error( - "Error saving audio file: failed to open file " + path); - } - - paddleaudio::sox_effects_chain::SoxEffectsChain chain( - /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()), - /*output_encoding=*/sf->encoding); - chain.addInputTensor(&tensor, sample_rate, channels_first); - chain.addOutputFile(sf); - chain.run(); -} - -} // namespace sox_io -} // namespace paddleaudio \ No newline at end of file diff --git a/paddlespeech/audio/src/sox/io.h b/paddlespeech/audio/src/sox/io.h deleted file mode 100644 index f8001d872..000000000 --- a/paddlespeech/audio/src/sox/io.h +++ /dev/null @@ -1,41 +0,0 @@ - -// Copyright (c) 2017 Facebook Inc. (Soumith Chintala), -// All rights reserved. - -#pragma once - -#include "paddlespeech/audio/src/optional/optional.hpp" -#include "paddlespeech/audio/src/sox/utils.h" - -namespace paddleaudio { -namespace sox_io { - -auto get_effects(const tl::optional& frame_offset, - const tl::optional& num_frames) - -> std::vector>; - -using MetaDataTuple = - std::tuple; - -tl::optional get_info_file( - const std::string& path, const tl::optional& format); - -tl::optional> load_audio_file( - const std::string& path, - const tl::optional& frame_offset, - const tl::optional& num_frames, - tl::optional normalize, - tl::optional channels_first, - const tl::optional& format); - -void save_audio_file(const std::string& path, - py::array tensor, - int64_t sample_rate, - bool channels_first, - tl::optional compression, - tl::optional format, - tl::optional encoding, - tl::optional bits_per_sample); - -} // namespace sox_io -} // namespace paddleaudio diff --git a/paddlespeech/audio/src/sox/types.cpp b/paddlespeech/audio/src/sox/types.cpp deleted file mode 100644 index ab1808be1..000000000 --- a/paddlespeech/audio/src/sox/types.cpp +++ /dev/null @@ -1,143 +0,0 @@ -//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp - -#include "paddlespeech/audio/src/sox/types.h" -#include -#include - -namespace paddleaudio { -namespace sox_utils { - -Format get_format_from_string(const std::string& format) { - if (format == "wav") - return Format::WAV; - if (format == "mp3") - return Format::MP3; - if (format == "flac") - return Format::FLAC; - if (format == "ogg" || format == "vorbis") - return Format::VORBIS; - if (format == "amr-nb") - return Format::AMR_NB; - if (format == "amr-wb") - return Format::AMR_WB; - if (format == "amb") - return Format::AMB; - if (format == "sph") - return Format::SPHERE; - if (format == "htk") - return Format::HTK; - if (format == "gsm") - return Format::GSM; - std::ostringstream stream; - stream << "Internal Error: unexpected format value: " << format; - throw std::runtime_error(stream.str()); -} - -std::string to_string(Encoding v) { - switch (v) { - case Encoding::UNKNOWN: - return "UNKNOWN"; - case Encoding::PCM_SIGNED: - return "PCM_S"; - case Encoding::PCM_UNSIGNED: - return 
"PCM_U"; - case Encoding::PCM_FLOAT: - return "PCM_F"; - case Encoding::FLAC: - return "FLAC"; - case Encoding::ULAW: - return "ULAW"; - case Encoding::ALAW: - return "ALAW"; - case Encoding::MP3: - return "MP3"; - case Encoding::VORBIS: - return "VORBIS"; - case Encoding::AMR_WB: - return "AMR_WB"; - case Encoding::AMR_NB: - return "AMR_NB"; - case Encoding::OPUS: - return "OPUS"; - default: - throw std::runtime_error("Internal Error: unexpected encoding."); - } -} - -Encoding get_encoding_from_option(const tl::optional encoding) { - if (!encoding.has_value()) - return Encoding::NOT_PROVIDED; - std::string v = encoding.value(); - if (v == "PCM_S") - return Encoding::PCM_SIGNED; - if (v == "PCM_U") - return Encoding::PCM_UNSIGNED; - if (v == "PCM_F") - return Encoding::PCM_FLOAT; - if (v == "ULAW") - return Encoding::ULAW; - if (v == "ALAW") - return Encoding::ALAW; - std::ostringstream stream; - stream << "Internal Error: unexpected encoding value: " << v; - throw std::runtime_error(stream.str()); -} - -BitDepth get_bit_depth_from_option(const tl::optional bit_depth) { - if (!bit_depth.has_value()) - return BitDepth::NOT_PROVIDED; - int64_t v = bit_depth.value(); - switch (v) { - case 8: - return BitDepth::B8; - case 16: - return BitDepth::B16; - case 24: - return BitDepth::B24; - case 32: - return BitDepth::B32; - case 64: - return BitDepth::B64; - default: { - std::ostringstream s; - s << "Internal Error: unexpected bit depth value: " << v; - throw std::runtime_error(s.str()); - } - } -} - -std::string get_encoding(sox_encoding_t encoding) { - switch (encoding) { - case SOX_ENCODING_UNKNOWN: - return "UNKNOWN"; - case SOX_ENCODING_SIGN2: - return "PCM_S"; - case SOX_ENCODING_UNSIGNED: - return "PCM_U"; - case SOX_ENCODING_FLOAT: - return "PCM_F"; - case SOX_ENCODING_FLAC: - return "FLAC"; - case SOX_ENCODING_ULAW: - return "ULAW"; - case SOX_ENCODING_ALAW: - return "ALAW"; - case SOX_ENCODING_MP3: - return "MP3"; - case SOX_ENCODING_VORBIS: - return "VORBIS"; - case SOX_ENCODING_AMR_WB: - return "AMR_WB"; - case SOX_ENCODING_AMR_NB: - return "AMR_NB"; - case SOX_ENCODING_OPUS: - return "OPUS"; - case SOX_ENCODING_GSM: - return "GSM"; - default: - return "UNKNOWN"; - } -} - -} // namespace sox_utils -} // namespace paddleaudio diff --git a/paddlespeech/audio/src/sox/types.h b/paddlespeech/audio/src/sox/types.h deleted file mode 100644 index 824c0f632..000000000 --- a/paddlespeech/audio/src/sox/types.h +++ /dev/null @@ -1,58 +0,0 @@ -//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h -#pragma once - -#include -#include "paddlespeech/audio/src/optional/optional.hpp" - -namespace paddleaudio { -namespace sox_utils { - -enum class Format { - WAV, - MP3, - FLAC, - VORBIS, - AMR_NB, - AMR_WB, - AMB, - SPHERE, - GSM, - HTK, -}; - -Format get_format_from_string(const std::string& format); - -enum class Encoding { - NOT_PROVIDED, - UNKNOWN, - PCM_SIGNED, - PCM_UNSIGNED, - PCM_FLOAT, - FLAC, - ULAW, - ALAW, - MP3, - VORBIS, - AMR_WB, - AMR_NB, - OPUS, -}; - -std::string to_string(Encoding v); -Encoding get_encoding_from_option(const tl::optional encoding); - -enum class BitDepth : unsigned { - NOT_PROVIDED = 0, - B8 = 8, - B16 = 16, - B24 = 24, - B32 = 32, - B64 = 64, -}; - -BitDepth get_bit_depth_from_option(const tl::optional bit_depth); - -std::string get_encoding(sox_encoding_t encoding); - -} // namespace sox_utils -} // namespace torchaudio \ No newline at end of file diff --git a/paddlespeech/audio/src/sox/utils.cpp b/paddlespeech/audio/src/sox/utils.cpp 
deleted file mode 100644 index a44031bb4..000000000 --- a/paddlespeech/audio/src/sox/utils.cpp +++ /dev/null @@ -1,488 +0,0 @@ -//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.cpp - -#include -#include "paddlespeech/audio/src/sox/types.h" -#include "paddlespeech/audio/src/sox/utils.h" - -namespace paddleaudio { -namespace sox_utils { - -void set_seed(const int64_t seed) { - sox_get_globals()->ranqd1 = static_cast(seed); -} - -void set_verbosity(const int64_t verbosity) { - sox_get_globals()->verbosity = static_cast(verbosity); -} - -void set_use_threads(const bool use_threads) { - sox_get_globals()->use_threads = static_cast(use_threads); -} - -void set_buffer_size(const int64_t buffer_size) { - sox_get_globals()->bufsiz = static_cast(buffer_size); -} - -int64_t get_buffer_size() { - return sox_get_globals()->bufsiz; -} - -std::vector> list_effects() { - std::vector> effects; - for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) { - const sox_effect_handler_t* handler = (*fns)(); - if (handler && handler->name) { - if (UNSUPPORTED_EFFECTS.find(handler->name) == - UNSUPPORTED_EFFECTS.end()) { - effects.emplace_back(std::vector{ - handler->name, - handler->usage ? std::string(handler->usage) : std::string("")}); - } - } - } - return effects; -} - -std::vector list_write_formats() { - std::vector formats; - for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { - const sox_format_handler_t* handler = fns->fn(); - for (const char* const* names = handler->names; *names; ++names) { - if (!strchr(*names, '/') && handler->write) - formats.emplace_back(*names); - } - } - return formats; -} - -std::vector list_read_formats() { - std::vector formats; - for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { - const sox_format_handler_t* handler = fns->fn(); - for (const char* const* names = handler->names; *names; ++names) { - if (!strchr(*names, '/') && handler->read) - formats.emplace_back(*names); - } - } - return formats; -} - -SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {} -SoxFormat::~SoxFormat() { - close(); -} - -sox_format_t* SoxFormat::operator->() const noexcept { - return fd_; -} -SoxFormat::operator sox_format_t*() const noexcept { - return fd_; -} - -void SoxFormat::close() { - if (fd_ != nullptr) { - sox_close(fd_); - fd_ = nullptr; - } -} - -void validate_input_file(const SoxFormat& sf, const std::string& path) { - if (static_cast(sf) == nullptr) { - throw std::runtime_error( - "Error loading audio file: failed to open file " + path); - } - if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) { - throw std::runtime_error("Error loading audio file: unknown encoding."); - } -} - -void validate_input_memfile(const SoxFormat &sf) { - return validate_input_file(sf, ""); -} - -void validate_input_tensor(const py::array tensor) { - if (tensor.ndim() != 2) { - throw std::runtime_error("Input tensor has to be 2D."); - } - - char dtype = tensor.dtype().char_(); - bool flag = (dtype == 'f') || (dtype == 'd') || (dtype == 'l') || (dtype == 'i'); - if (flag == false) { - throw std::runtime_error( - "Input tensor has to be one of float32, int32, int16 or uint8 type."); - } -} - -py::dtype get_dtype( - const sox_encoding_t encoding, - const unsigned precision) { - switch (encoding) { - case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV - return py::dtype('u1'); - case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV - switch (precision) { - case 16: - return py::dtype("i2"); - case 24: // Cast 
24-bit to 32-bit. - case 32: - return py::dtype('i'); - default: - throw std::runtime_error( - "Only 16, 24, and 32 bits are supported for signed PCM."); - } - default: - // default to float32 for the other formats, including - // 32-bit flaoting-point WAV, - // MP3, - // FLAC, - // VORBIS etc... - return py::dtype("f"); - } -} - -py::array convert_to_tensor( - sox_sample_t* buffer, - const int32_t num_samples, - const int32_t num_channels, - const py::dtype dtype, - const bool normalize, - const bool channels_first) { - py::array t; - uint64_t dummy = 0; - SOX_SAMPLE_LOCALS; - if (normalize || dtype.char_() == 'f') { - t = py::array(dtype, {num_samples / num_channels, num_channels}); - auto ptr = (float*)t.mutable_data(0, 0); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy); - } - } else if (dtype.char_() == 'i') { - //t = torch::from_blob( - // buffer, {num_samples / num_channels, num_channels}, torch::kInt32) - // .clone(); - t = py::array(dtype, {num_samples / num_channels, num_channels}); - auto ptr = (int*)t.mutable_data(0, 0); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = buffer[i]; - } - } else if (dtype.char_() == 'h') { // int16 - t = py::array(dtype, {num_samples / num_channels, num_channels}); - auto ptr = (int16_t*)t.mutable_data(0, 0); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy); - } - } else if (dtype.char_() == 'b') { - //t = torch::empty({num_samples / num_channels, num_channels}, torch::kUInt8); - auto ptr = (uint8_t*)t.mutable_data(0,0); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy); - } - } else { - throw std::runtime_error("Unsupported dtype."); - } - return t; -} - -const std::string get_filetype(const std::string path) { - std::string ext = path.substr(path.find_last_of(".") + 1); - std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); - return ext; -} - -namespace { - -std::tuple get_save_encoding_for_wav( - const std::string format, - py::dtype dtype, - const Encoding& encoding, - const BitDepth& bits_per_sample) { - switch (encoding) { - case Encoding::NOT_PROVIDED: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - switch (dtype.num()) { - case 11: // float32 numpy dtype num - return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); - case 5: // int numpy dtype num - return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); - case 3: // int16 numpy - return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); - case 1: // byte numpy - return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); - default: - throw std::runtime_error("Internal Error: Unexpected dtype."); - } - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); - default: - return std::make_tuple<>( - SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); - } - case Encoding::PCM_SIGNED: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); - case BitDepth::B8: - throw std::runtime_error( - format + " does not support 8-bit signed PCM encoding."); - default: - return std::make_tuple<>( - SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); - } - case Encoding::PCM_UNSIGNED: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); - default: - throw std::runtime_error( - format + " only supports 8-bit for unsigned PCM encoding."); - } - case Encoding::PCM_FLOAT: - switch (bits_per_sample) 
{ - case BitDepth::NOT_PROVIDED: - case BitDepth::B32: - return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); - case BitDepth::B64: - return std::make_tuple<>(SOX_ENCODING_FLOAT, 64); - default: - throw std::runtime_error( - format + - " only supports 32-bit or 64-bit for floating-point PCM encoding."); - } - case Encoding::ULAW: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ULAW, 8); - default: - throw std::runtime_error( - format + " only supports 8-bit for mu-law encoding."); - } - case Encoding::ALAW: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ALAW, 8); - default: - throw std::runtime_error( - format + " only supports 8-bit for a-law encoding."); - } - default: - throw std::runtime_error( - format + " does not support encoding: " + to_string(encoding)); - } -} - -std::tuple get_save_encoding( - const std::string& format, - const py::dtype dtype, - const tl::optional encoding, - const tl::optional bits_per_sample) { - const Format fmt = get_format_from_string(format); - const Encoding enc = get_encoding_from_option(encoding); - const BitDepth bps = get_bit_depth_from_option(bits_per_sample); - - switch (fmt) { - case Format::WAV: - case Format::AMB: - return get_save_encoding_for_wav(format, dtype, enc, bps); - case Format::MP3: - if (enc != Encoding::NOT_PROVIDED) - throw std::runtime_error("mp3 does not support `encoding` option."); - if (bps != BitDepth::NOT_PROVIDED) - throw std::runtime_error( - "mp3 does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_MP3, 16); - case Format::HTK: - if (enc != Encoding::NOT_PROVIDED) - throw std::runtime_error("htk does not support `encoding` option."); - if (bps != BitDepth::NOT_PROVIDED) - throw std::runtime_error( - "htk does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); - case Format::VORBIS: - if (enc != Encoding::NOT_PROVIDED) - throw std::runtime_error("vorbis does not support `encoding` option."); - if (bps != BitDepth::NOT_PROVIDED) - throw std::runtime_error( - "vorbis does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_VORBIS, 16); - case Format::AMR_NB: - if (enc != Encoding::NOT_PROVIDED) - throw std::runtime_error("amr-nb does not support `encoding` option."); - if (bps != BitDepth::NOT_PROVIDED) - throw std::runtime_error( - "amr-nb does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16); - case Format::FLAC: - if (enc != Encoding::NOT_PROVIDED) - throw std::runtime_error("flac does not support `encoding` option."); - switch (bps) { - case BitDepth::B32: - case BitDepth::B64: - throw std::runtime_error( - "flac does not support `bits_per_sample` larger than 24."); - default: - return std::make_tuple<>( - SOX_ENCODING_FLAC, static_cast(bps)); - } - case Format::SPHERE: - switch (enc) { - case Encoding::NOT_PROVIDED: - case Encoding::PCM_SIGNED: - switch (bps) { - case BitDepth::NOT_PROVIDED: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); - default: - return std::make_tuple<>( - SOX_ENCODING_SIGN2, static_cast(bps)); - } - case Encoding::PCM_UNSIGNED: - throw std::runtime_error( - "sph does not support unsigned integer PCM."); - case Encoding::PCM_FLOAT: - throw std::runtime_error("sph does not support floating point PCM."); - case Encoding::ULAW: - switch (bps) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return 
std::make_tuple<>(SOX_ENCODING_ULAW, 8); - default: - throw std::runtime_error( - "sph only supports 8-bit for mu-law encoding."); - } - case Encoding::ALAW: - switch (bps) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ALAW, 8); - default: - return std::make_tuple<>( - SOX_ENCODING_ALAW, static_cast(bps)); - } - default: - throw std::runtime_error( - "sph does not support encoding: " + encoding.value()); - } - case Format::GSM: - if (enc != Encoding::NOT_PROVIDED) - throw std::runtime_error("gsm does not support `encoding` option."); - if (bps != BitDepth::NOT_PROVIDED) - throw std::runtime_error( - "gsm does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_GSM, 16); - - default: - throw std::runtime_error("Unsupported format: " + format); - } -} - -unsigned get_precision(const std::string filetype, py::dtype dtype) { - if (filetype == "mp3") - return SOX_UNSPEC; - if (filetype == "flac") - return 24; - if (filetype == "ogg" || filetype == "vorbis") - return SOX_UNSPEC; - if (filetype == "wav" || filetype == "amb") { - switch (dtype.num()) { - case 1: // byte in numpy dype num - return 8; - case 3: // short, in numpy dtype num - return 16; - case 5: // int, numpy dtype - return 32; - case 11: // float, numpy dtype - return 32; - default: - throw std::runtime_error("Unsupported dtype."); - } - } - if (filetype == "sph") - return 32; - if (filetype == "amr-nb") { - return 16; - } - if (filetype == "gsm") { - return 16; - } - if (filetype == "htk") { - return 16; - } - throw std::runtime_error("Unsupported file type: " + filetype); -} - -} // namespace - -sox_signalinfo_t get_signalinfo( - const py::array* waveform, - const int64_t sample_rate, - const std::string filetype, - const bool channels_first) { - return sox_signalinfo_t{ - /*rate=*/static_cast(sample_rate), - /*channels=*/ - static_cast(waveform->shape(channels_first ? 
0 : 1)), - /*precision=*/get_precision(filetype, waveform->dtype()), - /*length=*/static_cast(waveform->size())}; -} - -sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) { - sox_encoding_t encoding = [&]() { - switch (dtype.num()) { - case 1: // byte - return SOX_ENCODING_UNSIGNED; - case 3: // short - return SOX_ENCODING_SIGN2; - case 5: // int32 - return SOX_ENCODING_SIGN2; - case 11: // float - return SOX_ENCODING_FLOAT; - default: - throw std::runtime_error("Unsupported dtype."); - } - }(); - unsigned bits_per_sample = [&]() { - switch (dtype.num()) { - case 1: // byte - return 8; - case 3: //short - return 16; - case 5: // int32 - return 32; - case 11: // float - return 32; - default: - throw std::runtime_error("Unsupported dtype."); - } - }(); - return sox_encodinginfo_t{ - /*encoding=*/encoding, - /*bits_per_sample=*/bits_per_sample, - /*compression=*/HUGE_VAL, - /*reverse_bytes=*/sox_option_default, - /*reverse_nibbles=*/sox_option_default, - /*reverse_bits=*/sox_option_default, - /*opposite_endian=*/sox_false}; -} - -sox_encodinginfo_t get_encodinginfo_for_save( - const std::string& format, - const py::dtype dtype, - const tl::optional compression, - const tl::optional encoding, - const tl::optional bits_per_sample) { - auto enc = get_save_encoding(format, dtype, encoding, bits_per_sample); - return sox_encodinginfo_t{ - /*encoding=*/std::get<0>(enc), - /*bits_per_sample=*/std::get<1>(enc), - /*compression=*/compression.value_or(HUGE_VAL), - /*reverse_bytes=*/sox_option_default, - /*reverse_nibbles=*/sox_option_default, - /*reverse_bits=*/sox_option_default, - /*opposite_endian=*/sox_false}; -} - -} // namespace sox_utils -} // namespace torchaudio diff --git a/paddlespeech/audio/src/sox/utils.h b/paddlespeech/audio/src/sox/utils.h deleted file mode 100644 index 5b015ece0..000000000 --- a/paddlespeech/audio/src/sox/utils.h +++ /dev/null @@ -1,120 +0,0 @@ -//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.h - -#pragma once - -#include -#include -#include - -#include "paddlespeech/audio/src/optional/optional.hpp" - -namespace py = pybind11; - -namespace paddleaudio { -namespace sox_utils { - -//////////////////////////////////////////////////////////////////////////////// -// APIs for Python interaction -//////////////////////////////////////////////////////////////////////////////// - -/// Set sox global options -void set_seed(const int64_t seed); - -void set_verbosity(const int64_t verbosity); - -void set_use_threads(const bool use_threads); - -void set_buffer_size(const int64_t buffer_size); - -int64_t get_buffer_size(); - -std::vector> list_effects(); - -std::vector list_read_formats(); - -std::vector list_write_formats(); - -//////////////////////////////////////////////////////////////////////////////// -// Utilities for sox_io / sox_effects implementations -//////////////////////////////////////////////////////////////////////////////// - -const std::unordered_set UNSUPPORTED_EFFECTS = - {"input", "output", "spectrogram", "noiseprof", "noisered", "splice"}; - -/// helper class to automatically close sox_format_t* -struct SoxFormat { - explicit SoxFormat(sox_format_t* fd) noexcept; - SoxFormat(const SoxFormat& other) = delete; - SoxFormat(SoxFormat&& other) = delete; - SoxFormat& operator=(const SoxFormat& other) = delete; - SoxFormat& operator=(SoxFormat&& other) = delete; - ~SoxFormat(); - sox_format_t* operator->() const noexcept; - operator sox_format_t*() const noexcept; - - void close(); - - private: - sox_format_t* fd_; -}; - 
-/// -/// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32 -void validate_input_tensor(const py::array); - -void validate_input_file(const SoxFormat& sf, const std::string& path); - -void validate_input_memfile(const SoxFormat &sf); -/// -/// Get target dtype for the given encoding and precision. -py::dtype get_dtype( - const sox_encoding_t encoding, - const unsigned precision); - -/// -/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor -/// NOTE: This function might modify the values in the input buffer to -/// reduce the number of memory copy. -/// @param buffer Pointer to buffer that contains audio data. -/// @param num_samples The number of samples to read. -/// @param num_channels The number of channels. Used to reshape the resulting -/// Tensor. -/// @param dtype Target dtype. Determines the output dtype and value range in -/// conjunction with normalization. -/// @param noramlize Perform normalization. Only effective when dtype is not -/// kFloat32. When effective, the output tensor is kFloat32 type and value range -/// is [-1.0, 1.0] -/// @param channels_first When True, output Tensor has shape of [num_channels, -/// num_frames]. -py::array convert_to_tensor( - sox_sample_t* buffer, - const int32_t num_samples, - const int32_t num_channels, - const py::dtype dtype, - const bool normalize, - const bool channels_first); - -/// Extract extension from file path -const std::string get_filetype(const std::string path); - -/// Get sox_signalinfo_t for passing a py::array object. -sox_signalinfo_t get_signalinfo( - const py::array* waveform, - const int64_t sample_rate, - const std::string filetype, - const bool channels_first); - -/// Get sox_encodinginfo_t for Tensor I/O -sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype); - -/// Get sox_encodinginfo_t for saving to file/file object -sox_encodinginfo_t get_encodinginfo_for_save( - const std::string& format, - const py::dtype dtype, - const tl::optional compression, - const tl::optional encoding, - const tl::optional bits_per_sample); - - -} // namespace sox_utils -} // namespace paddleaudio From c37782c1152682d22db254c4fdb9f7c014dd72db Mon Sep 17 00:00:00 2001 From: YangZhou Date: Fri, 29 Jul 2022 18:15:06 +0800 Subject: [PATCH 04/11] add more pybind funciton --- paddlespeech/audio/src/pybind/pybind.cpp | 55 ++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/paddlespeech/audio/src/pybind/pybind.cpp b/paddlespeech/audio/src/pybind/pybind.cpp index 791ac7879..9cd12bc9e 100644 --- a/paddlespeech/audio/src/pybind/pybind.cpp +++ b/paddlespeech/audio/src/pybind/pybind.cpp @@ -5,6 +5,10 @@ #include "paddlespeech/audio/src/pybind/sox/io.h" #include "paddlespeech/audio/src/pybind/sox/effects.h" #include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h" +#include +#include +#incldue +#include PYBIND11_MODULE(_paddleaudio, m) { #ifdef INCLUDE_SOX @@ -20,9 +24,54 @@ PYBIND11_MODULE(_paddleaudio, m) { m.def("save_audio_fileobj", &paddleaudio::sox_io::save_audio_fileobj, "Save audio to file obj."); - m.def("apply_effects_fileobj", - &paddleaudio::sox_effects::apply_effects_fileobj, - "Decode audio data from file-like obj and apply effects."); + // sox io + m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file); + m.def( + "sox_io_load_audio_file", + &paddleaudio::sox_io::load_audio_file); + m.def( + "sox_io_save_audio_file", + &paddleaudio::sox_io::save_audio_file); + + // sox utils + m.def("sox_utils_set_seed", 
&paddleaudio::sox_utils::set_seed); + m.def( + "sox_utils_set_verbosity", + &paddleaudio::sox_utils::set_verbosity); + m.def( + "sox_utils_set_use_threads", + &paddleaudio::sox_utils::set_use_threads); + m.def( + "sox_utils_set_buffer_size", + &paddleaudio::sox_utils::set_buffer_size); + m.def( + "sox_utils_list_effects", + &paddleaudio::sox_utils::list_effects); + m.def( + "sox_utils_list_read_formats", + &paddleaudio::sox_utils::list_read_formats); + m.def( + "sox_utils_list_write_formats", + &paddleaudio::sox_utils::list_write_formats); + m.def( + "sox_utils_get_buffer_size", + &paddleaudio::sox_utils::get_buffer_size); + + // effect + m.def("apply_effects_fileobj", + &paddleaudio::sox_effects::apply_effects_fileobj, + "Decode audio data from file-like obj and apply effects."); + m.def("sox_effects_initialize_sox_effects", + &paddleaudio::sox_effects::initialize_sox_effects); + m.def( + "sox_effects_shutdown_sox_effects", + &paddleaudio::sox_effects::shutdown_sox_effects); + m.def( + "sox_effects_apply_effects_tensor", + &paddleaudio::sox_effects::apply_effects_tensor); + m.def( + "sox_effects_apply_effects_file", + &paddleaudio::sox_effects::apply_effects_file); #endif #ifdef INCLUDE_KALDI From 63b4494700fb76549a69cf7b380fa51d2cf940c2 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Mon, 1 Aug 2022 19:31:44 +0800 Subject: [PATCH 05/11] fix optional bind, add sox_effects --- paddlespeech/audio/backends/sox_io_backend.py | 43 ++- paddlespeech/audio/sox_effects/__init__.py | 25 ++ paddlespeech/audio/sox_effects/sox_effects.py | 283 ++++++++++++++++++ paddlespeech/audio/src/pybind/pybind.cpp | 21 +- paddlespeech/audio/utils/sox_utils.py | 101 +++++++ 5 files changed, 450 insertions(+), 23 deletions(-) create mode 100644 paddlespeech/audio/sox_effects/__init__.py create mode 100644 paddlespeech/audio/sox_effects/sox_effects.py create mode 100644 paddlespeech/audio/utils/sox_utils.py diff --git a/paddlespeech/audio/backends/sox_io_backend.py index b44ac30f8..750d4de1a 100644 --- a/paddlespeech/audio/backends/sox_io_backend.py +++ b/paddlespeech/audio/backends/sox_io_backend.py @@ -8,8 +8,7 @@ from paddle import Tensor from .common import AudioMetaData from paddlespeech.audio._internal import module_utils as _mod_utils -from paddlespeech.audio._paddleaudio import get_info_file -from paddlespeech.audio._paddleaudio import get_info_fileobj +from paddlespeech.audio import _paddleaudio as paddleaudio #https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py @@ -43,26 +42,38 @@ _fallback_load_filebj = _fail_load_fileobj @_mod_utils.requires_sox() def load( - filepath: Union[str, Path], - out: Optional[Tensor]=None, - normalization: Union[bool, float, Callable]=True, - channels_first: bool=True, - num_frames: int=0, - offset: int=0, - filetype: Optional[str]=None, ) -> Tuple[Tensor, int]: - raise RuntimeError("No audio I/O backend is available.") + filepath: str, + frame_offset: int = 0, + num_frames: int=-1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str]=None, ) -> Tuple[Tensor, int]: + ret = paddleaudio.sox_io_load_audio_file( + filepath, frame_offset, num_frames, normalize, channels_first, format + ) + if ret is not None: + return ret + return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format) + @_mod_utils.requires_sox() def save(filepath: str, - src: Tensor, - sample_rate: int, - precision: int = 16, - channels_first: bool = True) -> None: - raise
RuntimeError("No audio I/O backend is available.") + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None) -> Tuple[Tensor, int]: + ret = paddleaudio.sox_io_load_audio_file( + filepath, frame_offset, num_frames, normalize, channels_first, format + ) + if ret is not None: + return ret + return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format) + @_mod_utils.requires_sox() def info(filepath: str, format: Optional[str]) -> None: - sinfo = paddleaudio._paddleaudio.get_info_file(filepath, format) + sinfo = paddleaudio.get_info_file(filepath, format) if sinfo is not None: return AudioMetaData(*sinfo) return _fallback_info(filepath, format) diff --git a/paddlespeech/audio/sox_effects/__init__.py b/paddlespeech/audio/sox_effects/__init__.py new file mode 100644 index 000000000..d68158776 --- /dev/null +++ b/paddlespeech/audio/sox_effects/__init__.py @@ -0,0 +1,25 @@ +from paddlespeech.audio._internal import module_utils as _mod_utils + +from .sox_effects import ( + apply_effects_file, + apply_effects_tensor, + effect_names, + init_sox_effects, + shutdown_sox_effects, +) + + +if _mod_utils.is_sox_available(): + import atexit + + init_sox_effects() + atexit.register(shutdown_sox_effects) + +__all__ = [ + "init_sox_effects", + "shutdown_sox_effects", + "effect_names", + "apply_effects_tensor", + "apply_effects_file", +] + diff --git a/paddlespeech/audio/sox_effects/sox_effects.py b/paddlespeech/audio/sox_effects/sox_effects.py new file mode 100644 index 000000000..1a3f3af29 --- /dev/null +++ b/paddlespeech/audio/sox_effects/sox_effects.py @@ -0,0 +1,283 @@ +import os +from typing import List, Optional, Tuple + +from paddlespeech.audio._internal import module_utils as _mod_utils +from paddlespeech.audio.utils.sox_utils import list_effects +from paddlespeech.audio import _paddleaudio as paddleaudio + +#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py + +@_mod_utils.requires_sox() +def init_sox_effects(): + """Initialize resources required to use sox effects. + + Note: + You do not need to call this function manually. It is called automatically. + + Once initialized, you do not need to call this function again across the multiple uses of + sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet. + Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing + again will result in error. + """ + paddleaudio.sox_effects_initialize_sox_effects() + + +@_mod_utils.requires_sox() +def shutdown_sox_effects(): + """Clean up resources required to use sox effects. + + Note: + You do not need to call this function manually. It is called automatically. + + It is safe to call this function multiple times. + Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and + initializing again will result in error. + """ + paddleaudio.sox_effects_shutdown_sox_effects() + + +@_mod_utils.requires_sox() +def effect_names() -> List[str]: + """Gets list of valid sox effect names + + Returns: + List[str]: list of available effect names. + + Example + >>> paddleaudio.sox_effects.effect_names() + ['allpass', 'band', 'bandpass', ... 
] + """ + return list(list_effects().keys()) + + +@_mod_utils.requires_sox() +def apply_effects_tensor( + tensor: torch.Tensor, + sample_rate: int, + effects: List[List[str]], + channels_first: bool = True, +) -> Tuple[torch.Tensor, int]: + """Apply sox effects to given Tensor + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + This function only works on CPU Tensors. + This function works in the way very similar to ``sox`` command, however there are slight + differences. For example, ``sox`` command adds certain effects automatically (such as + ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function does + only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also + need to give ``rate`` effect with desired sampling rate.). + + Args: + tensor (torch.Tensor): Input 2D CPU Tensor. + sample_rate (int): Sample rate + effects (List[List[str]]): List of effects. + channels_first (bool, optional): Indicates if the input Tensor's dimension is + `[channels, time]` or `[time, channels]` + + Returns: + (Tensor, int): Resulting Tensor and sample rate. + The resulting Tensor has the same ``dtype`` as the input Tensor, and + the same channels order. The shape of the Tensor can be different based on the + effects applied. Sample rate can also be different based on the effects applied. + + Example - Basic usage + >>> + >>> # Defines the effects to apply + >>> effects = [ + ... ['gain', '-n'], # normalises to 0dB + ... ['pitch', '5'], # 5 cent pitch shift + ... ['rate', '8000'], # resample to 8000 Hz + ... ] + >>> + >>> # Generate pseudo wave: + >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second + >>> sample_rate = 16000 + >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1 + >>> waveform.shape + torch.Size([2, 16000]) + >>> waveform + tensor([[ 0.3138, 0.7620, -0.9019, ..., -0.7495, -0.4935, 0.5442], + [-0.0832, 0.0061, 0.8233, ..., -0.5176, -0.9140, -0.2434]]) + >>> + >>> # Apply effects + >>> waveform, sample_rate = apply_effects_tensor( + ... wave_form, sample_rate, effects, channels_first=True) + >>> + >>> # Check the result + >>> # The new waveform is sampling rate 8000, 1 second. + >>> # normalization and channel order are preserved + >>> waveform.shape + torch.Size([2, 8000]) + >>> waveform + tensor([[ 0.5054, -0.5518, -0.4800, ..., -0.0076, 0.0096, -0.0110], + [ 0.1331, 0.0436, -0.3783, ..., -0.0035, 0.0012, 0.0008]]) + >>> sample_rate + 8000 + + Example - Torchscript-able transform + >>> + >>> # Use `apply_effects_tensor` in `torch.nn.Module` and dump it to file, + >>> # then run sox effect via Torchscript runtime. + >>> + >>> class SoxEffectTransform(torch.nn.Module): + ... effects: List[List[str]] + ... + ... def __init__(self, effects: List[List[str]]): + ... super().__init__() + ... self.effects = effects + ... + ... def forward(self, tensor: torch.Tensor, sample_rate: int): + ... return sox_effects.apply_effects_tensor( + ... tensor, sample_rate, self.effects) + ... + ... + >>> # Create transform object + >>> effects = [ + ... ["lowpass", "-1", "300"], # apply single-pole lowpass filter + ... ["rate", "8000"], # change sample rate to 8000 + ... 
] + >>> transform = SoxEffectTensorTransform(effects, input_sample_rate) + >>> + >>> # Dump it to file and load + >>> path = 'sox_effect.zip' + >>> torch.jit.script(trans).save(path) + >>> transform = torch.jit.load(path) + >>> + >>>> # Run transform + >>> waveform, input_sample_rate = paddleaudio.load("input.wav") + >>> waveform, sample_rate = transform(waveform, input_sample_rate) + >>> assert sample_rate == 8000 + """ + return paddleaudio.sox_effects_apply_effects_tensor(tensor, sample_rate, effects, channels_first) + + +@_mod_utils.requires_sox() +def apply_effects_file( + path: str, + effects: List[List[str]], + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, +) -> Tuple[torch.Tensor, int]: + """Apply sox effects to the audio file and load the resulting data as Tensor + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + This function works in the way very similar to ``sox`` command, however there are slight + differences. For example, ``sox`` commnad adds certain effects automatically (such as + ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given + effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate`` + effect with desired sampling rate, because internally, ``speed`` effects only alter sampling + rate and leave samples untouched. + + Args: + path (path-like object or file-like object): + Source of audio data. When the function is not compiled by TorchScript, + (e.g. ``torch.jit.script``), the following types are accepted: + + * ``path-like``: file path + * ``file-like``: Object with ``read(size: int) -> bytes`` method, + which returns byte string of at most ``size`` length. + + When the function is compiled by TorchScript, only ``str`` type is allowed. + + Note: This argument is intentionally annotated as ``str`` only for + TorchScript compiler compatibility. + effects (List[List[str]]): List of effects. + normalize (bool, optional): + When ``True``, this function always return ``float32``, and sample values are + normalized to ``[-1.0, 1.0]``. + If input file is integer WAV, giving ``False`` will change the resulting Tensor type to + integer type. This argument has no effect for formats other + than integer WAV type. + channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Override the format detection with the given format. + Providing the argument might help when libsox can not infer the format + from header or extension, + + Returns: + (Tensor, int): Resulting Tensor and sample rate. + If ``normalize=True``, the resulting Tensor is always ``float32`` type. + If ``normalize=False`` and the input audio file is of integer WAV file, then the + resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported) + If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`, + otherwise `[time, channel]`. + + Example - Basic usage + >>> + >>> # Defines the effects to apply + >>> effects = [ + ... ['gain', '-n'], # normalises to 0dB + ... ['pitch', '5'], # 5 cent pitch shift + ... ['rate', '8000'], # resample to 8000 Hz + ... 
] + >>> + >>> # Apply effects and load data with channels_first=True + >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True) + >>> + >>> # Check the result + >>> waveform.shape + torch.Size([2, 8000]) + >>> waveform + tensor([[ 5.1151e-03, 1.8073e-02, 2.2188e-02, ..., 1.0431e-07, + -1.4761e-07, 1.8114e-07], + [-2.6924e-03, 2.1860e-03, 1.0650e-02, ..., 6.4122e-07, + -5.6159e-07, 4.8103e-07]]) + >>> sample_rate + 8000 + + Example - Apply random speed perturbation to dataset + >>> + >>> # Load data from file, apply random speed perturbation + >>> class RandomPerturbationFile(torch.utils.data.Dataset): + ... \"\"\"Given flist, apply random speed perturbation + ... + ... Suppose all the input files are at least one second long. + ... \"\"\" + ... def __init__(self, flist: List[str], sample_rate: int): + ... super().__init__() + ... self.flist = flist + ... self.sample_rate = sample_rate + ... + ... def __getitem__(self, index): + ... speed = 0.5 + 1.5 * random.randn() + ... effects = [ + ... ['gain', '-n', '-10'], # apply 10 db attenuation + ... ['remix', '-'], # merge all the channels + ... ['speed', f'{speed:.5f}'], # duration is now 0.5 ~ 2.0 seconds. + ... ['rate', f'{self.sample_rate}'], + ... ['pad', '0', '1.5'], # add 1.5 seconds silence at the end + ... ['trim', '0', '2'], # get the first 2 seconds + ... ] + ... waveform, _ = paddleaudio.sox_effects.apply_effects_file( + ... self.flist[index], effects) + ... return waveform + ... + ... def __len__(self): + ... return len(self.flist) + ... + >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000) + >>> loader = torch.utils.data.DataLoader(dataset, batch_size=32) + >>> for batch in loader: + >>> pass + """ + if not torch.jit.is_scripting(): + if hasattr(path, "read"): + ret = paddleaudio._paddleaudio.apply_effects_fileobj(path, effects, normalize, channels_first, format) + if ret is None: + raise RuntimeError("Failed to load audio from {}".format(path)) + return ret + path = os.fspath(path) + ret = paddleaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format) + if ret is not None: + return ret + raise RuntimeError("Failed to load audio from {}".format(path)) \ No newline at end of file diff --git a/paddlespeech/audio/src/pybind/pybind.cpp b/paddlespeech/audio/src/pybind/pybind.cpp index 9cd12bc9e..776e43a7e 100644 --- a/paddlespeech/audio/src/pybind/pybind.cpp +++ b/paddlespeech/audio/src/pybind/pybind.cpp @@ -5,17 +5,23 @@ #include "paddlespeech/audio/src/pybind/sox/io.h" #include "paddlespeech/audio/src/pybind/sox/effects.h" #include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h" + #include -#include -#incldue -#include +#include + +// `tl::optional` +namespace pybind11 { namespace detail { + template + struct type_caster> : optional_caster> {}; +}} PYBIND11_MODULE(_paddleaudio, m) { #ifdef INCLUDE_SOX m.def("get_info_file", &paddleaudio::sox_io::get_info_file, "Get metadata of audio file."); - m.def("get_info_fileobj", + // support obj later + /*m.def("get_info_fileobj", &paddleaudio::sox_io::get_info_fileobj, "Get metadata of audio in file object."); m.def("load_audio_fileobj", @@ -24,6 +30,7 @@ PYBIND11_MODULE(_paddleaudio, m) { m.def("save_audio_fileobj", &paddleaudio::sox_io::save_audio_fileobj, "Save audio to file obj."); + */ // sox io m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file); m.def( @@ -58,9 +65,9 @@ PYBIND11_MODULE(_paddleaudio, m) { &paddleaudio::sox_utils::get_buffer_size); // effect - m.def("apply_effects_fileobj", - 
&paddleaudio::sox_effects::apply_effects_fileobj, - "Decode audio data from file-like obj and apply effects."); + //m.def("apply_effects_fileobj", + // &paddleaudio::sox_effects::apply_effects_fileobj, + // "Decode audio data from file-like obj and apply effects."); m.def("sox_effects_initialize_sox_effects", &paddleaudio::sox_effects::initialize_sox_effects); m.def( diff --git a/paddlespeech/audio/utils/sox_utils.py b/paddlespeech/audio/utils/sox_utils.py new file mode 100644 index 000000000..fb19ff316 --- /dev/null +++ b/paddlespeech/audio/utils/sox_utils.py @@ -0,0 +1,101 @@ +from typing import Dict, List + +from paddlespeech.audio._internal import module_utils as _mod_utils +from paddlespeech.audio import _paddleaudio + +@_mod_utils.requires_sox() +def set_seed(seed: int): + """Set libsox's PRNG + + Args: + seed (int): seed value. valid range is int32. + + See Also: + http://sox.sourceforge.net/sox.html + """ + _paddleaudio.sox_utils_set_seed(seed) + + +@_mod_utils.requires_sox() +def set_verbosity(verbosity: int): + """Set libsox's verbosity + + Args: + verbosity (int): Set verbosity level of libsox. + + * ``1`` failure messages + * ``2`` warnings + * ``3`` details of processing + * ``4``-``6`` increasing levels of debug messages + + See Also: + http://sox.sourceforge.net/sox.html + """ + _paddleaudio.sox_utils_set_verbosity(verbosity) + + +@_mod_utils.requires_sox() +def set_buffer_size(buffer_size: int): + """Set buffer size for sox effect chain + + Args: + buffer_size (int): Set the size in bytes of the buffers used for processing audio. + + See Also: + http://sox.sourceforge.net/sox.html + """ + _paddleaudio.sox_utils_set_buffer_size(buffer_size) + + +@_mod_utils.requires_sox() +def set_use_threads(use_threads: bool): + """Set multithread option for sox effect chain + + Args: + use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing. + To use mutlithread, the underlying ``libsox`` has to be compiled with OpenMP support. + + See Also: + http://sox.sourceforge.net/sox.html + """ + _paddleaudio.sox_utils_set_use_threads(use_threads) + + +@_mod_utils.requires_sox() +def list_effects() -> Dict[str, str]: + """List the available sox effect names + + Returns: + Dict[str, str]: Mapping from ``effect name`` to ``usage`` + """ + return dict(_paddleaudio.sox_utils_list_effects()) + + +@_mod_utils.requires_sox() +def list_read_formats() -> List[str]: + """List the supported audio formats for read + + Returns: + List[str]: List of supported audio formats + """ + return _paddleaudio.sox_utils_list_read_formats() + + +@_mod_utils.requires_sox() +def list_write_formats() -> List[str]: + """List the supported audio formats for write + + Returns: + List[str]: List of supported audio formats + """ + return _paddleaudio.sox_utils_list_write_formats() + + +@_mod_utils.requires_sox() +def get_buffer_size() -> int: + """Get buffer size for sox effect chain + + Returns: + int: size in bytes of buffers used for processing audio. 
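+
+    Example
+        A minimal sketch of round-tripping the setting (the ``16384`` value is
+        only an illustration of a power-of-two buffer size, not a recommended
+        value):
+
+        >>> from paddlespeech.audio.utils import sox_utils
+        >>> sox_utils.set_buffer_size(16384)
+        >>> sox_utils.get_buffer_size()
+        16384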
+ """ + return _paddleaudio.sox_utils_get_buffer_size() From 59d82c0c65566477f6adc2c324a8ee0ce59cf853 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Tue, 2 Aug 2022 21:44:23 +0800 Subject: [PATCH 06/11] add test_load.py --- paddlespeech/audio/_extension.py | 1 - paddlespeech/audio/backends/sox_io_backend.py | 2 +- paddlespeech/audio/src/pybind/sox/utils.cpp | 44 +++++++-- tests/unit/audio/backends/sox_io/common.py | 32 +++++++ tests/unit/audio/backends/sox_io/info_test.py | 34 +++++++ tests/unit/audio/backends/sox_io/load_test.py | 47 ++++++++++ tests/unit/audio/backends/sox_io/save_test.py | 34 +++++++ tests/unit/audio/backends/sox_io/testdata | 1 + tests/unit/common_utils/__init__.py | 8 ++ tests/unit/common_utils/wav_utils.py | 92 +++++++++++++++++++ 10 files changed, 287 insertions(+), 8 deletions(-) create mode 100644 tests/unit/audio/backends/sox_io/common.py create mode 100644 tests/unit/audio/backends/sox_io/info_test.py create mode 100644 tests/unit/audio/backends/sox_io/load_test.py create mode 100644 tests/unit/audio/backends/sox_io/save_test.py create mode 120000 tests/unit/audio/backends/sox_io/testdata create mode 100644 tests/unit/common_utils/__init__.py create mode 100644 tests/unit/common_utils/wav_utils.py diff --git a/paddlespeech/audio/_extension.py b/paddlespeech/audio/_extension.py index 000fae131..ac82c06e5 100644 --- a/paddlespeech/audio/_extension.py +++ b/paddlespeech/audio/_extension.py @@ -103,7 +103,6 @@ def _load_lib(lib: str) -> bool: If a dependency is missing, then users have to install it. """ path = _get_lib_path(lib) - warnings.warn("lib path is :" + str(path)) if not path.exists(): warnings.warn("lib path is not exists:" + str(path)) return False diff --git a/paddlespeech/audio/backends/sox_io_backend.py b/paddlespeech/audio/backends/sox_io_backend.py index 750d4de1a..c75894181 100644 --- a/paddlespeech/audio/backends/sox_io_backend.py +++ b/paddlespeech/audio/backends/sox_io_backend.py @@ -8,7 +8,7 @@ from paddle import Tensor from .common import AudioMetaData from paddlespeech.audio._internal import module_utils as _mod_utils -from paddlespeech.aduio import _paddleaudio as paddleaudio +from paddlespeech.audio import _paddleaudio as paddleaudio #https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py diff --git a/paddlespeech/audio/src/pybind/sox/utils.cpp b/paddlespeech/audio/src/pybind/sox/utils.cpp index a930f8cdd..5c78bc116 100644 --- a/paddlespeech/audio/src/pybind/sox/utils.cpp +++ b/paddlespeech/audio/src/pybind/sox/utils.cpp @@ -178,36 +178,68 @@ py::array convert_to_tensor( const py::dtype dtype, const bool normalize, const bool channels_first) { + // todo refector later(SGoat) py::array t; uint64_t dummy = 0; SOX_SAMPLE_LOCALS; + int32_t num_rows = num_samples / num_channels; if (normalize || dtype.char_() == 'f') { - t = py::array(dtype, {num_samples / num_channels, num_channels}); + t = py::array(dtype, {num_rows, num_channels}); auto ptr = (float*)t.mutable_data(0, 0); for (int32_t i = 0; i < num_samples; ++i) { ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy); } + if (channels_first) { + py::array t2 = py::array(dtype, {num_channels, num_rows}); + for (int32_t row_idx = 0; row_idx < num_channels; ++row_idx) { + for (int32_t col_idx = 0; col_idx < num_rows; ++col_idx) + *(float*)t2.mutable_data(row_idx, col_idx) = *(float*)t.data(col_idx, row_idx); + } + return t2; + } } else if (dtype.char_() == 'i') { - //t = torch::from_blob( - // buffer, {num_samples / num_channels, num_channels}, torch::kInt32) - // 
.clone(); - t = py::array(dtype, {num_samples / num_channels, num_channels}); + t = py::array(dtype, {num_rows, num_channels}); auto ptr = (int*)t.mutable_data(0, 0); for (int32_t i = 0; i < num_samples; ++i) { ptr[i] = buffer[i]; } + if (channels_first) { + py::array t2 = py::array(dtype, {num_channels, num_rows}); + for (int32_t row_idx = 0; row_idx < num_channels; ++row_idx) { + for (int32_t col_idx = 0; col_idx < num_rows; ++col_idx) + *(int*)t2.mutable_data(row_idx, col_idx) = *(int*)t.data(col_idx, row_idx); + } + return t2; + } } else if (dtype.char_() == 'h') { // int16 - t = py::array(dtype, {num_samples / num_channels, num_channels}); + t = py::array(dtype, {num_rows, num_channels}); auto ptr = (int16_t*)t.mutable_data(0, 0); for (int32_t i = 0; i < num_samples; ++i) { ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy); } + if (channels_first) { + py::array t2 = py::array(dtype, {num_channels, num_rows}); + for (int32_t row_idx = 0; row_idx < num_channels; ++row_idx) { + for (int32_t col_idx = 0; col_idx < num_rows; ++col_idx) + *(int16_t*)t2.mutable_data(row_idx, col_idx) = *(int16_t*)t.data(col_idx, row_idx); + } + return t2; + } } else if (dtype.char_() == 'b') { //t = torch::empty({num_samples / num_channels, num_channels}, torch::kUInt8); + t = py::array(dtype, {num_rows, num_channels}); auto ptr = (uint8_t*)t.mutable_data(0,0); for (int32_t i = 0; i < num_samples; ++i) { ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy); } + if (channels_first) { + py::array t2 = py::array(dtype, {num_channels, num_rows}); + for (int32_t row_idx = 0; row_idx < num_channels; ++row_idx) { + for (int32_t col_idx = 0; col_idx < num_rows; ++col_idx) + *(uint8_t*)t2.mutable_data(row_idx, col_idx) = *(uint8_t*)t.data(col_idx, row_idx); + } + return t2; + } } else { throw std::runtime_error("Unsupported dtype."); } diff --git a/tests/unit/audio/backends/sox_io/common.py b/tests/unit/audio/backends/sox_io/common.py new file mode 100644 index 000000000..79b922a91 --- /dev/null +++ b/tests/unit/audio/backends/sox_io/common.py @@ -0,0 +1,32 @@ + +def get_encoding(ext, dtype): + exts = { + "mp3", + "flac", + "vorbis", + } + encodings = { + "float32": "PCM_F", + "int32": "PCM_S", + "int16": "PCM_S", + "uint8": "PCM_U", + } + return ext.upper() if ext in exts else encodings[dtype] + + +def get_bit_depth(dtype): + bit_depths = { + "float32": 32, + "int32": 32, + "int16": 16, + "uint8": 8, + } + return bit_depths[dtype] + +def get_bits_per_sample(ext, dtype): + bits_per_samples = { + "flac": 24, + "mp3": 0, + "vorbis": 0, + } + return bits_per_samples.get(ext, get_bit_depth(dtype)) diff --git a/tests/unit/audio/backends/sox_io/info_test.py b/tests/unit/audio/backends/sox_io/info_test.py new file mode 100644 index 000000000..ae18a29ef --- /dev/null +++ b/tests/unit/audio/backends/sox_io/info_test.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
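+# The assertions in TestInfo below use the expectation helpers defined in the
+# sibling common.py module; a sketch of the import they assume (the exact form
+# depends on how the test package is laid out):
+#
+#     from .common import get_bit_depth, get_encoding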
+import unittest + +import numpy as np +import paddle + +from paddlespeech.audio.backends import sox_io_backend + +class TestInfo(unittest.TestCase): + + def test_wav(self, dtype, sample_rate, num_channels, sample_size): + """check wav file correctly """ + path = 'testdata/test.wav' + info = sox_io_backend.get_info_file(path) + assert info.sample_rate == sample_rate + assert info.num_frames == sample_size # duration*sample_rate + assert info.num_channels == num_channels + assert info.bits_per_sample == get_bit_depth(dtype) + assert info.encoding == get_encoding('wav', dtype) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/sox_io/load_test.py b/tests/unit/audio/backends/sox_io/load_test.py new file mode 100644 index 000000000..8e141750b --- /dev/null +++ b/tests/unit/audio/backends/sox_io/load_test.py @@ -0,0 +1,47 @@ +import unittest +import itertools + +from parameterized import parameterized +import numpy as np +from paddlespeech.audio._internal import module_utils as _mod_utils +from paddlespeech.audio.backends import sox_io_backend + +from tests.unit.common_utils import ( + get_wav_data, + load_wav, + save_wav, +) + +#code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/load_test.py + +class TestLoad(unittest.TestCase): + + def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration): + """`sox_io_backend.load` can load wav format correctly. + + Wav data loaded with sox_io backend should match those with scipy + """ + path = 'testdata/reference.wav' + data = get_wav_data(dtype, num_channels, normalize=normalize, num_frames=duration * sample_rate) + save_wav(path, data, sample_rate) + expected = load_wav(path, normalize=normalize)[0] + data, sr = sox_io_backend.load(path, normalize=normalize) + assert sr == sample_rate + np.testing.assert_array_almost_equal(data, expected, decimal=4) + + @parameterized.expand( + list( + itertools.product( + ["float64", "float32", "int32",], + [8000, 16000], + [1, 2], + [False, True], + ) + ), + ) + def test_wav(self, dtype, sample_rate, num_channels, normalize): + """`sox_io_backend.load` can load wav format correctly.""" + self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=1) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/sox_io/save_test.py b/tests/unit/audio/backends/sox_io/save_test.py new file mode 100644 index 000000000..ae18a29ef --- /dev/null +++ b/tests/unit/audio/backends/sox_io/save_test.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest + +import numpy as np +import paddle + +from paddlespeech.audio.backends import sox_io_backend + +class TestInfo(unittest.TestCase): + + def test_wav(self, dtype, sample_rate, num_channels, sample_size): + """check wav file correctly """ + path = 'testdata/test.wav' + info = sox_io_backend.get_info_file(path) + assert info.sample_rate == sample_rate + assert info.num_frames == sample_size # duration*sample_rate + assert info.num_channels == num_channels + assert info.bits_per_sample == get_bit_depth(dtype) + assert info.encoding == get_encoding('wav', dtype) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/sox_io/testdata b/tests/unit/audio/backends/sox_io/testdata new file mode 120000 index 000000000..485a3dd63 --- /dev/null +++ b/tests/unit/audio/backends/sox_io/testdata @@ -0,0 +1 @@ +../../features/testdata \ No newline at end of file diff --git a/tests/unit/common_utils/__init__.py b/tests/unit/common_utils/__init__.py new file mode 100644 index 000000000..dae409f3c --- /dev/null +++ b/tests/unit/common_utils/__init__.py @@ -0,0 +1,8 @@ +from .wav_utils import get_wav_data, load_wav, save_wav, normalize_wav + +__all__ = [ + "get_wav_data", + "load_wav", + "save_wav", + "normalize_wav" +] diff --git a/tests/unit/common_utils/wav_utils.py b/tests/unit/common_utils/wav_utils.py new file mode 100644 index 000000000..dbdd453e0 --- /dev/null +++ b/tests/unit/common_utils/wav_utils.py @@ -0,0 +1,92 @@ +from typing import Optional + +import scipy.io.wavfile +import paddle + +def normalize_wav(tensor: paddle.Tensor) -> paddle.Tensor: + if tensor.dtype == paddle.float32: + pass + elif tensor.dtype == paddle.int32: + tensor = paddle.cast(tensor, paddle.float32) + tensor[tensor > 0] /= 2147483647.0 + tensor[tensor < 0] /= 2147483648.0 + elif tensor.dtype == paddle.int16: + tensor = paddle.cast(tensor, paddle.float32) + tensor[tensor > 0] /= 32767.0 + tensor[tensor < 0] /= 32768.0 + elif tensor.dtype == paddle.uint8: + tensor = paddle.cast(tensor, paddle.float32) - 128 + tensor[tensor > 0] /= 127.0 + tensor[tensor < 0] /= 128.0 + return tensor + + +def get_wav_data( + dtype: str, + num_channels: int, + *, + num_frames: Optional[int] = None, + normalize: bool = True, + channels_first: bool = True, +): + """Generate linear signal of the given dtype and num_channels + + Data range is + [-1.0, 1.0] for float32, + [-2147483648, 2147483647] for int32 + [-32768, 32767] for int16 + [0, 255] for uint8 + + num_frames allow to change the linear interpolation parameter. + Default values are 256 for uint8, else 1 << 16. + 1 << 16 as default is so that int16 value range is completely covered. 
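+
+    Example
+        A small illustrative call (values follow from ``paddle.linspace`` over
+        the stated range; the printed form is abbreviated):
+
+        >>> data = get_wav_data("float32", num_channels=2, num_frames=4, normalize=False)
+        >>> data.shape
+        [2, 4]
+        >>> data[0].numpy()
+        array([-1.        , -0.33333334,  0.33333334,  1.        ], dtype=float32)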
+ """ + dtype_ = getattr(paddle, dtype) + + if num_frames is None: + if dtype == "uint8": + num_frames = 256 + else: + num_frames = 1 << 16 + + # paddle linspace not support uint8, int8, int16 + #if dtype == "uint8": + # base = paddle.linspace(0, 255, num_frames, dtype=dtype_) + #elif dtype == "int8": + # base = paddle.linspace(-128, 127, num_frames, dtype=dtype_) + if dtype == "float32": + base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "float64": + base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "int32": + base = paddle.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_) + #elif dtype == "int16": + # base = paddle.linspace(-32768, 32767, num_frames, dtype=dtype_) + else: + raise NotImplementedError(f"Unsupported dtype {dtype}") + data = base.tile([num_channels, 1]) + if not channels_first: + data = data.transpose([1, 0]) + if normalize: + data = normalize_wav(data) + return data + + +def load_wav(path: str, normalize=True, channels_first=True) -> paddle.Tensor: + """Load wav file without paddleaudio""" + sample_rate, data = scipy.io.wavfile.read(path) + data = paddle.to_tensor(data.copy()) + if data.ndim == 1: + data = data.unsqueeze(1) + if normalize: + data = normalize_wav(data) + if channels_first: + data = data.transpose([1, 0]) + return data, sample_rate + + +def save_wav(path, data, sample_rate, channels_first=True): + """Save wav file without paddleaudio""" + if channels_first: + data = data.transpose([1, 0]) + scipy.io.wavfile.write(path, sample_rate, data.numpy()) From d264118416dffbdbd760248e0e2869a9c8722a72 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Tue, 9 Aug 2022 22:32:36 +0800 Subject: [PATCH 07/11] add save test&&fix effects_chain bug --- paddlespeech/audio/backends/sox_io_backend.py | 54 +++-- paddlespeech/audio/src/pybind/pybind.cpp | 4 +- .../audio/src/pybind/sox/effects_chain.cpp | 74 +++++-- tests/unit/audio/backends/sox_io/save_test.py | 195 +++++++++++++++--- tests/unit/common_utils/__init__.py | 8 +- tests/unit/common_utils/case_utils.py | 56 +++++ .../unit/common_utils/parameterized_utils.py | 50 +++++ tests/unit/common_utils/sox_utils.py | 116 +++++++++++ 8 files changed, 490 insertions(+), 67 deletions(-) create mode 100644 tests/unit/common_utils/case_utils.py create mode 100644 tests/unit/common_utils/parameterized_utils.py create mode 100644 tests/unit/common_utils/sox_utils.py diff --git a/paddlespeech/audio/backends/sox_io_backend.py b/paddlespeech/audio/backends/sox_io_backend.py index c75894181..beb6ddb9d 100644 --- a/paddlespeech/audio/backends/sox_io_backend.py +++ b/paddlespeech/audio/backends/sox_io_backend.py @@ -1,11 +1,11 @@ from pathlib import Path from typing import Callable -from typing import Optional -from typing import Tuple -from typing import Union +from typing import Optional, Tuple, Union +import paddle from paddle import Tensor from .common import AudioMetaData +import os from paddlespeech.audio._internal import module_utils as _mod_utils from paddlespeech.audio import _paddleaudio as paddleaudio @@ -48,31 +48,53 @@ def load( normalize: bool = True, channels_first: bool = True, format: Optional[str]=None, ) -> Tuple[Tensor, int]: + if hasattr(filepath, "read"): + ret = paddleaudio.load_audio_fileobj( + filepath, frame_offset, num_frames, normalize, channels_first, format + ) + if ret is not None: + audio_tensor = paddle.to_tensor(ret[0]) + return (audio_tensor, ret[1]) + return _fallback_load_fileobj(filepath, frame_offset, num_frames, normalize, channels_first, format) + 
filepath = os.fspath(filepath) ret = paddleaudio.sox_io_load_audio_file( filepath, frame_offset, num_frames, normalize, channels_first, format ) if ret is not None: - return ret + audio_tensor = paddle.to_tensor(ret[0]) + return (audio_tensor, ret[1]) return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format) @_mod_utils.requires_sox() -def save(filepath: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None) -> Tuple[Tensor, int]: - ret = paddleaudio.sox_io_load_audio_file( - filepath, frame_offset, num_frames, normalize, channels_first, format +def save(filepath: str, + src: Tensor, + sample_rate: int, + channels_first: bool = True, + compression: Optional[float] = None, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, +): + src_arr = src.numpy() + if hasattr(filepath, "write"): + paddleaudio.save_audio_fileobj( + filepath, src_arr, sample_rate, channels_first, compression, format, encoding, bits_per_sample + ) + return + filepath = os.fspath(filepath) + paddleaudio.sox_io_save_audio_file( + filepath, src_arr, sample_rate, channels_first, compression, format, encoding, bits_per_sample ) - if ret is not None: - return ret - return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format) - @_mod_utils.requires_sox() def info(filepath: str, format: Optional[str]) -> None: + if hasattr(filepath, "read"): + sinfo = paddleaudio.get_info_fileojb(filepath, format) + if sinfo is not None: + return AudioMetaData(*sinfo) + return _fallback_info_fileobj(filepath, format) + filepath = os.fspath(filepath) sinfo = paddleaudio.get_info_file(filepath, format) if sinfo is not None: return AudioMetaData(*sinfo) diff --git a/paddlespeech/audio/src/pybind/pybind.cpp b/paddlespeech/audio/src/pybind/pybind.cpp index 776e43a7e..24cf0eb18 100644 --- a/paddlespeech/audio/src/pybind/pybind.cpp +++ b/paddlespeech/audio/src/pybind/pybind.cpp @@ -21,7 +21,7 @@ PYBIND11_MODULE(_paddleaudio, m) { &paddleaudio::sox_io::get_info_file, "Get metadata of audio file."); // support obj later - /*m.def("get_info_fileobj", + m.def("get_info_fileobj", &paddleaudio::sox_io::get_info_fileobj, "Get metadata of audio in file object."); m.def("load_audio_fileobj", @@ -30,7 +30,7 @@ PYBIND11_MODULE(_paddleaudio, m) { m.def("save_audio_fileobj", &paddleaudio::sox_io::save_audio_fileobj, "Save audio to file obj."); - */ + // sox io m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file); m.def( diff --git a/paddlespeech/audio/src/pybind/sox/effects_chain.cpp b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp index 4ad90da36..15fc6d26e 100644 --- a/paddlespeech/audio/src/pybind/sox/effects_chain.cpp +++ b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp @@ -1,5 +1,6 @@ #include - +#include +#include #include "paddlespeech/audio/src/pybind/sox/effects_chain.h" #include "paddlespeech/audio/src/pybind/sox/utils.h" @@ -42,6 +43,7 @@ int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { if (index + *osamp > num_samples) { *osamp = num_samples - index; } + // Ensure that it's a multiple of the number of channels *osamp -= *osamp % num_channels; @@ -49,52 +51,80 @@ int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { // refacor this module, chunk auto i_frame = index / num_channels; auto num_frames = *osamp / num_channels; - py::array chunk(tensor.dtype(), 
{num_frames*num_channels}); + + std::vector chunk(num_frames*num_channels); py::buffer_info ori_info = tensor.request(); - py::buffer_info info = chunk.request(); - char* ori_start_ptr = (char*)ori_info.ptr + index * chunk.itemsize() / sizeof(char); - std::memcpy(info.ptr, ori_start_ptr, chunk.nbytes()); - - py::dtype chunk_type = py::dtype("i"); // dtype int32 - py::array new_chunk = py::array(chunk_type, chunk.shape()); - py::buffer_info new_info = new_chunk.request(); - void* ptr = (void*) info.ptr; - int* new_ptr = (int*) new_info.ptr; + void* ptr = ori_info.ptr; // Convert to sox_sample_t (int32_t) - switch (chunk.dtype().num()) { + switch (tensor.dtype().num()) { //case c10::ScalarType::Float: { case 11: { + break; // Need to convert to 64-bit precision so that // values around INT32_MIN/MAX are handled correctly. - float* ptr_f = (float*)ptr; for (int idx = 0; idx < chunk.size(); ++idx) { - double elem = *ptr_f * 2147483648.; + int frame_idx = (idx + index) / num_channels; + int channels_idx = (idx + index) % num_channels; + double elem = 0; + if (priv->channels_first) { + elem = *(float*)tensor.data(channels_idx, frame_idx); + } else { + elem = *(float*)tensor.data(frame_idx, channels_idx); + } + elem = elem * 2147483648.; // *new_ptr = std::clamp(elem, INT32_MIN, INT32_MAX); if (elem > INT32_MAX) { - *new_ptr = INT32_MAX; + chunk[idx] = INT32_MAX; } else if (elem < INT32_MIN) { - *new_ptr = INT32_MIN; - } else { *new_ptr = elem; } + chunk[idx] = INT32_MIN; + } else { + chunk[idx] = elem; + } } break; } //case c10::ScalarType::Int: { case 5: { + for (int idx = 0; idx < chunk.size(); ++idx) { + int frame_idx = (idx + index) / num_channels; + int channels_idx = (idx + index) % num_channels; + int elem = 0; + if (priv->channels_first) { + elem = *(int*)tensor.data(channels_idx, frame_idx); + } else { + elem = *(int*)tensor.data(frame_idx, channels_idx); + } + chunk[idx] = elem; + } break; } // case short case 3: { - int16_t* ptr_s = (int16_t*) ptr; for (int idx = 0; idx < chunk.size(); ++idx) { - *new_ptr = *ptr_s * 65536; + int frame_idx = (idx + index) / num_channels; + int channels_idx = (idx + index) % num_channels; + int16_t elem = 0; + if (priv->channels_first) { + elem = *(int16_t*)tensor.data(channels_idx, frame_idx); + } else { + elem = *(int16_t*)tensor.data(frame_idx, channels_idx); + } + chunk[idx] = elem * 65536; } break; } // case byte case 1: { - int8_t* ptr_b = (int8_t*) ptr; for (int idx = 0; idx < chunk.size(); ++idx) { - *new_ptr = (*ptr_b - 128) * 16777216; + int frame_idx = (idx + index) / num_channels; + int channels_idx = (idx + index) % num_channels; + int8_t elem = 0; + if (priv->channels_first) { + elem = *(int8_t*)tensor.data(channels_idx, frame_idx); + } else { + elem = *(int8_t*)tensor.data(frame_idx, channels_idx); + } + chunk[idx] = (elem - 128) * 16777216; } break; } @@ -102,7 +132,7 @@ int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { throw std::runtime_error("Unexpected dtype."); } // Write to buffer - memcpy(obuf, (int*)new_info.ptr, *osamp * 4); + memcpy(obuf, chunk.data(), *osamp * 4); priv->index += *osamp; return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS; } diff --git a/tests/unit/audio/backends/sox_io/save_test.py b/tests/unit/audio/backends/sox_io/save_test.py index ae18a29ef..269c502a3 100644 --- a/tests/unit/audio/backends/sox_io/save_test.py +++ b/tests/unit/audio/backends/sox_io/save_test.py @@ -1,34 +1,177 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +import io +import os import unittest import numpy as np import paddle - +from parameterized import parameterized from paddlespeech.audio.backends import sox_io_backend -class TestInfo(unittest.TestCase): - - def test_wav(self, dtype, sample_rate, num_channels, sample_size): - """check wav file correctly """ - path = 'testdata/test.wav' - info = sox_io_backend.get_info_file(path) - assert info.sample_rate == sample_rate - assert info.num_frames == sample_size # duration*sample_rate - assert info.num_channels == num_channels - assert info.bits_per_sample == get_bit_depth(dtype) - assert info.encoding == get_encoding('wav', dtype) - +from tests.unit.common_utils import ( + get_wav_data, + load_wav, + save_wav, + nested_params, + TempDirMixin, + sox_utils +) + +#code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/save_test.py + +def _get_sox_encoding(encoding): + encodings = { + "PCM_F": "floating-point", + "PCM_S": "signed-integer", + "PCM_U": "unsigned-integer", + "ULAW": "u-law", + "ALAW": "a-law", + } + return encodings.get(encoding) + +class TestSaveBase(TempDirMixin): + def assert_save_consistency( + self, + format: str, + *, + compression: float = None, + encoding: str = None, + bits_per_sample: int = None, + sample_rate: float = 8000, + num_channels: int = 2, + num_frames: float = 3 * 8000, + src_dtype: str = "int32", + test_mode: str = "path", + ): + """`save` function produces file that is comparable with `sox` command + + To compare that the file produced by `save` function agains the file produced by + the equivalent `sox` command, we need to load both files. + But there are many formats that cannot be opened with common Python modules (like + SciPy). + So we use `sox` command to prepare the original data and convert the saved files + into a format that SciPy can read (PCM wav). + The following diagram illustrates this process. The difference is 2.1. and 3.1. + + This assumes that + - loading data with SciPy preserves the data well. + - converting the resulting files into WAV format with `sox` preserve the data well. + + x + | 1. Generate source wav file with SciPy + | + v + -------------- wav ---------------- + | | + | 2.1. load with scipy | 3.1. Convert to the target + | then save it into the target | format depth with sox + | format with torchaudio | + v v + target format target format + | | + | 2.2. Convert to wav with sox | 3.2. Convert to wav with sox + | | + v v + wav wav + | | + | 2.3. load with scipy | 3.3. load with scipy + | | + v v + tensor -------> compare <--------- tensor + + """ + cmp_encoding = "floating-point" + cmp_bit_depth = 32 + + src_path = self.get_temp_path("1.source.wav") + tgt_path = self.get_temp_path(f"2.1.torchaudio.{format}") + tst_path = self.get_temp_path("2.2.result.wav") + sox_path = self.get_temp_path(f"3.1.sox.{format}") + ref_path = self.get_temp_path("3.2.ref.wav") + + # 1. 
Generate original wav + data = get_wav_data(src_dtype, num_channels, normalize=False, num_frames=num_frames) + save_wav(src_path, data, sample_rate) + + # 2.1. Convert the original wav to target format with torchaudio + data = load_wav(src_path, normalize=False)[0] + if test_mode == "path": + sox_io_backend.save( + tgt_path, data, sample_rate, compression=compression, encoding=encoding, bits_per_sample=bits_per_sample + ) + elif test_mode == "fileobj": + with open(tgt_path, "bw") as file_: + sox_io_backend.save( + file_, + data, + sample_rate, + format=format, + compression=compression, + encoding=encoding, + bits_per_sample=bits_per_sample, + ) + elif test_mode == "bytesio": + file_ = io.BytesIO() + sox_io_backend.save( + file_, + data, + sample_rate, + format=format, + compression=compression, + encoding=encoding, + bits_per_sample=bits_per_sample, + ) + file_.seek(0) + with open(tgt_path, "bw") as f: + f.write(file_.read()) + else: + raise ValueError(f"Unexpected test mode: {test_mode}") + # 2.2. Convert the target format to wav with sox + sox_utils.convert_audio_file(tgt_path, tst_path, encoding=cmp_encoding, bit_depth=cmp_bit_depth) + # 2.3. Load with SciPy + found = load_wav(tst_path, normalize=False)[0] + + # 3.1. Convert the original wav to target format with sox + sox_encoding = _get_sox_encoding(encoding) + sox_utils.convert_audio_file( + src_path, sox_path, compression=compression, encoding=sox_encoding, bit_depth=bits_per_sample + ) + # 3.2. Convert the target format to wav with sox + sox_utils.convert_audio_file(sox_path, ref_path, encoding=cmp_encoding, bit_depth=cmp_bit_depth) + # 3.3. Load with SciPy + expected = load_wav(ref_path, normalize=False)[0] + + np.testing.assert_array_almost_equal(found, expected) + +class TestSave(TestSaveBase, unittest.TestCase): + @nested_params( + ["path",], + [ + ("PCM_U", 8), + ("PCM_S", 16), + ("PCM_S", 32), + ("PCM_F", 32), + ("PCM_F", 64), + ("ULAW", 8), + ("ALAW", 8), + ], + ) + def test_save_wav(self, test_mode, enc_params): + encoding, bits_per_sample = enc_params + self.assert_save_consistency("wav", encoding=encoding, bits_per_sample=bits_per_sample, test_mode=test_mode) + + @nested_params( + ["path", ], + [ + ("float32",), + ("int32",), + ("int16",), + ("uint8",), + ], + ) + def test_save_wav_dtype(self, test_mode, params): + (dtype,) = params + self.assert_save_consistency("wav", src_dtype=dtype, test_mode=test_mode) + + if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/tests/unit/common_utils/__init__.py b/tests/unit/common_utils/__init__.py index dae409f3c..722a9789f 100644 --- a/tests/unit/common_utils/__init__.py +++ b/tests/unit/common_utils/__init__.py @@ -1,8 +1,14 @@ from .wav_utils import get_wav_data, load_wav, save_wav, normalize_wav +from .parameterized_utils import load_params, nested_params +from .case_utils import ( + TempDirMixin +) __all__ = [ "get_wav_data", "load_wav", "save_wav", - "normalize_wav" + "normalize_wav", + "load_params", + "nested_params", ] diff --git a/tests/unit/common_utils/case_utils.py b/tests/unit/common_utils/case_utils.py new file mode 100644 index 000000000..cee2f29c8 --- /dev/null +++ b/tests/unit/common_utils/case_utils.py @@ -0,0 +1,56 @@ +import functools +import os.path +import shutil +import subprocess +import sys +import tempfile +import time +import unittest + +import paddle +from paddlespeech.audio._internal.module_utils import ( + is_kaldi_available, + is_module_available, + is_sox_available, +) + +class TempDirMixin: + """Mixin to provide easy 
access to temp dir""" + + temp_dir_ = None + + @classmethod + def get_base_temp_dir(cls): + # If TORCHAUDIO_TEST_TEMP_DIR is set, use it instead of temporary directory. + # this is handy for debugging. + key = "TORCHAUDIO_TEST_TEMP_DIR" + if key in os.environ: + return os.environ[key] + if cls.temp_dir_ is None: + cls.temp_dir_ = tempfile.TemporaryDirectory() + return cls.temp_dir_.name + + @classmethod + def tearDownClass(cls): + if cls.temp_dir_ is not None: + try: + cls.temp_dir_.cleanup() + cls.temp_dir_ = None + except PermissionError: + # On Windows there is a know issue with `shutil.rmtree`, + # which fails intermittenly. + # + # https://github.com/python/cpython/issues/74168 + # + # We observed this on CircleCI, where Windows job raises + # PermissionError. + # + # Following the above thread, we ignore it. + pass + super().tearDownClass() + + def get_temp_path(self, *paths): + temp_dir = os.path.join(self.get_base_temp_dir(), self.id()) + path = os.path.join(temp_dir, *paths) + os.makedirs(os.path.dirname(path), exist_ok=True) + return path diff --git a/tests/unit/common_utils/parameterized_utils.py b/tests/unit/common_utils/parameterized_utils.py new file mode 100644 index 000000000..95af65c84 --- /dev/null +++ b/tests/unit/common_utils/parameterized_utils.py @@ -0,0 +1,50 @@ +import json +from itertools import product + +from parameterized import param, parameterized + +def get_asset_path(*paths): + """Return full path of a test asset""" + return os.path.join(_TEST_DIR_PATH, "assets", *paths) + +def load_params(*paths): + with open(get_asset_path(*paths), "r") as file: + return [param(json.loads(line)) for line in file] + +def _name_func(func, _, params): + strs = [] + for arg in params.args: + if isinstance(arg, tuple): + strs.append("_".join(str(a) for a in arg)) + else: + strs.append(str(arg)) + # sanitize the test name + name = "_".join(strs) + return parameterized.to_safe_name(f"{func.__name__}_{name}") + + +def nested_params(*params_set, name_func=_name_func): + """Generate the cartesian product of the given list of parameters. + + Args: + params_set (list of parameters): Parameters. When using ``parameterized.param`` class, + all the parameters have to be specified with the class, only using kwargs. + """ + flatten = [p for params in params_set for p in params] + + # Parameters to be nested are given as list of plain objects + if all(not isinstance(p, param) for p in flatten): + args = list(product(*params_set)) + return parameterized.expand(args, name_func=_name_func) + + # Parameters to be nested are given as list of `parameterized.param` + if not all(isinstance(p, param) for p in flatten): + raise TypeError("When using ``parameterized.param``, " "all the parameters have to be of the ``param`` type.") + if any(p.args for p in flatten): + raise ValueError( + "When using ``parameterized.param``, " "all the parameters have to be provided as keyword argument." 
+ ) + args = [param()] + for params in params_set: + args = [param(**x.kwargs, **y.kwargs) for x in args for y in params] + return parameterized.expand(args) diff --git a/tests/unit/common_utils/sox_utils.py b/tests/unit/common_utils/sox_utils.py new file mode 100644 index 000000000..6ceae081e --- /dev/null +++ b/tests/unit/common_utils/sox_utils.py @@ -0,0 +1,116 @@ +import subprocess +import sys +import warnings + + +def get_encoding(dtype): + encodings = { + "float32": "floating-point", + "int32": "signed-integer", + "int16": "signed-integer", + "uint8": "unsigned-integer", + } + return encodings[dtype] + + +def get_bit_depth(dtype): + bit_depths = { + "float32": 32, + "int32": 32, + "int16": 16, + "uint8": 8, + } + return bit_depths[dtype] + + +def gen_audio_file( + path, + sample_rate, + num_channels, + *, + encoding=None, + bit_depth=None, + compression=None, + attenuation=None, + duration=1, + comment_file=None, +): + """Generate synthetic audio file with `sox` command.""" + if path.endswith(".wav"): + warnings.warn("Use get_wav_data and save_wav to generate wav file for accurate result.") + command = [ + "sox", + "-V3", # verbose + "--no-dither", # disable automatic dithering + "-R", + # -R is supposed to be repeatable, though the implementation looks suspicious + # and not setting the seed to a fixed value. + # https://fossies.org/dox/sox-14.4.2/sox_8c_source.html + # search "sox_globals.repeatable" + ] + if bit_depth is not None: + command += ["--bits", str(bit_depth)] + command += [ + "--rate", + str(sample_rate), + "--null", # no input + "--channels", + str(num_channels), + ] + if compression is not None: + command += ["--compression", str(compression)] + if bit_depth is not None: + command += ["--bits", str(bit_depth)] + if encoding is not None: + command += ["--encoding", str(encoding)] + if comment_file is not None: + command += ["--comment-file", str(comment_file)] + command += [ + str(path), + "synth", + str(duration), # synthesizes for the given duration [sec] + "sawtooth", + "1", + # saw tooth covers the both ends of value range, which is a good property for test. + # similar to linspace(-1., 1.) 
+ # this introduces bigger boundary effect than sine when converted to mp3 + ] + if attenuation is not None: + command += ["vol", f"-{attenuation}dB"] + print(" ".join(command), file=sys.stderr) + subprocess.run(command, check=True) + + +def convert_audio_file(src_path, dst_path, *, encoding=None, bit_depth=None, compression=None): + """Convert audio file with `sox` command.""" + command = ["sox", "-V3", "--no-dither", "-R", str(src_path)] + if encoding is not None: + command += ["--encoding", str(encoding)] + if bit_depth is not None: + command += ["--bits", str(bit_depth)] + if compression is not None: + command += ["--compression", str(compression)] + command += [dst_path] + print(" ".join(command), file=sys.stderr) + subprocess.run(command, check=True) + + +def _flattern(effects): + if not effects: + return effects + if isinstance(effects[0], str): + return effects + return [item for sublist in effects for item in sublist] + + +def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None): + """Run sox effects""" + effect = _flattern(effect) + command = ["sox", "-V", "--no-dither", input_file] + if output_bitdepth: + command += ["--bits", str(output_bitdepth)] + command += [output_file] + effect + if output_sample_rate: + command += ["rate", str(output_sample_rate)] + print(" ".join(command)) + subprocess.run(command, check=True) From 22a5344bcfbd87dd4b274b09caecd8cf71a4c6e3 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Fri, 12 Aug 2022 16:49:24 +0800 Subject: [PATCH 08/11] fix save && effect test --- paddlespeech/audio/backends/sox_io_backend.py | 4 +- paddlespeech/audio/sox_effects/sox_effects.py | 29 +- paddlespeech/audio/src/pybind/pybind.cpp | 6 +- .../audio/src/pybind/sox/effects_chain.cpp | 1 - paddlespeech/audio/utils/sox_utils.py | 2 +- tests/unit/assets/sox_effect_test_args.jsonl | 78 ++++ tests/unit/audio/backends/sox_io/save_test.py | 2 - .../unit/audio/backends/sox_io/smoke_test.py | 183 +++++++++ .../audio/backends/sox_io/sox_effect_test.py | 346 ++++++++++++++++++ tests/unit/common_utils/__init__.py | 9 +- tests/unit/common_utils/case_utils.py | 3 + .../unit/common_utils/parameterized_utils.py | 27 +- tests/unit/common_utils/wav_utils.py | 10 + 13 files changed, 670 insertions(+), 30 deletions(-) create mode 100644 tests/unit/assets/sox_effect_test_args.jsonl create mode 100644 tests/unit/audio/backends/sox_io/smoke_test.py create mode 100644 tests/unit/audio/backends/sox_io/sox_effect_test.py diff --git a/paddlespeech/audio/backends/sox_io_backend.py b/paddlespeech/audio/backends/sox_io_backend.py index beb6ddb9d..2037ad81d 100644 --- a/paddlespeech/audio/backends/sox_io_backend.py +++ b/paddlespeech/audio/backends/sox_io_backend.py @@ -88,9 +88,9 @@ def save(filepath: str, ) @_mod_utils.requires_sox() -def info(filepath: str, format: Optional[str]) -> None: +def info(filepath: str, format: Optional[str] = "") -> None: if hasattr(filepath, "read"): - sinfo = paddleaudio.get_info_fileojb(filepath, format) + sinfo = paddleaudio.get_info_fileobj(filepath, format) if sinfo is not None: return AudioMetaData(*sinfo) return _fallback_info_fileobj(filepath, format) diff --git a/paddlespeech/audio/sox_effects/sox_effects.py b/paddlespeech/audio/sox_effects/sox_effects.py index 1a3f3af29..17d2d95af 100644 --- a/paddlespeech/audio/sox_effects/sox_effects.py +++ b/paddlespeech/audio/sox_effects/sox_effects.py @@ -1,5 +1,7 @@ import os from typing import List, Optional, Tuple +import paddle +import numpy from paddlespeech.audio._internal 
import module_utils as _mod_utils from paddlespeech.audio.utils.sox_utils import list_effects @@ -52,11 +54,11 @@ def effect_names() -> List[str]: @_mod_utils.requires_sox() def apply_effects_tensor( - tensor: torch.Tensor, + tensor: paddle.Tensor, sample_rate: int, effects: List[List[str]], channels_first: bool = True, -) -> Tuple[torch.Tensor, int]: +) -> Tuple[paddle.Tensor, int]: """Apply sox effects to given Tensor .. devices:: CPU @@ -152,7 +154,11 @@ def apply_effects_tensor( >>> waveform, sample_rate = transform(waveform, input_sample_rate) >>> assert sample_rate == 8000 """ - return paddleaudio.sox_effects_apply_effects_tensor(tensor, sample_rate, effects, channels_first) + tensor_np = tensor.numpy() + ret = paddleaudio.sox_effects_apply_effects_tensor(tensor_np, sample_rate, effects, channels_first) + if ret is not None: + return (paddle.to_tensor(ret[0]), ret[1]) + raise RuntimeError("Failed to apply sox effect") @_mod_utils.requires_sox() @@ -162,7 +168,7 @@ def apply_effects_file( normalize: bool = True, channels_first: bool = True, format: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: +) -> Tuple[paddle.Tensor, int]: """Apply sox effects to the audio file and load the resulting data as Tensor .. devices:: CPU @@ -270,14 +276,13 @@ def apply_effects_file( >>> for batch in loader: >>> pass """ - if not torch.jit.is_scripting(): - if hasattr(path, "read"): - ret = paddleaudio._paddleaudio.apply_effects_fileobj(path, effects, normalize, channels_first, format) - if ret is None: - raise RuntimeError("Failed to load audio from {}".format(path)) - return ret - path = os.fspath(path) + if hasattr(path, "read"): + ret = paddleaudio.apply_effects_fileobj(path, effects, normalize, channels_first, format) + if ret is None: + raise RuntimeError("Failed to load audio from {}".format(path)) + return (paddle.to_tensor(ret[0]), ret[1]) + path = os.fspath(path) ret = paddleaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format) if ret is not None: - return ret + return (paddle.to_tensor(ret[0]), ret[1]) raise RuntimeError("Failed to load audio from {}".format(path)) \ No newline at end of file diff --git a/paddlespeech/audio/src/pybind/pybind.cpp b/paddlespeech/audio/src/pybind/pybind.cpp index 24cf0eb18..b265a2ab1 100644 --- a/paddlespeech/audio/src/pybind/pybind.cpp +++ b/paddlespeech/audio/src/pybind/pybind.cpp @@ -65,9 +65,9 @@ PYBIND11_MODULE(_paddleaudio, m) { &paddleaudio::sox_utils::get_buffer_size); // effect - //m.def("apply_effects_fileobj", - // &paddleaudio::sox_effects::apply_effects_fileobj, - // "Decode audio data from file-like obj and apply effects."); + m.def("apply_effects_fileobj", + &paddleaudio::sox_effects::apply_effects_fileobj, + "Decode audio data from file-like obj and apply effects."); m.def("sox_effects_initialize_sox_effects", &paddleaudio::sox_effects::initialize_sox_effects); m.def( diff --git a/paddlespeech/audio/src/pybind/sox/effects_chain.cpp b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp index 15fc6d26e..5e8f6ee71 100644 --- a/paddlespeech/audio/src/pybind/sox/effects_chain.cpp +++ b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp @@ -59,7 +59,6 @@ int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { switch (tensor.dtype().num()) { //case c10::ScalarType::Float: { case 11: { - break; // Need to convert to 64-bit precision so that // values around INT32_MIN/MAX are handled correctly. 
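      // (For example, a sample of exactly 1.0f scales to 1.0 * 2147483648.0,
      // which is one above INT32_MAX and is therefore clamped to INT32_MAX by
      // the range check below.)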
for (int idx = 0; idx < chunk.size(); ++idx) { diff --git a/paddlespeech/audio/utils/sox_utils.py b/paddlespeech/audio/utils/sox_utils.py index fb19ff316..37696a5d9 100644 --- a/paddlespeech/audio/utils/sox_utils.py +++ b/paddlespeech/audio/utils/sox_utils.py @@ -31,7 +31,7 @@ def set_verbosity(verbosity: int): See Also: http://sox.sourceforge.net/sox.html """ - _paddleaudio.sox_utils_set_verbosity(verbosity) + _paddleaudio.sox_utils_set_verbosity(verbosity) @_mod_utils.requires_sox() diff --git a/tests/unit/assets/sox_effect_test_args.jsonl b/tests/unit/assets/sox_effect_test_args.jsonl new file mode 100644 index 000000000..b005515bb --- /dev/null +++ b/tests/unit/assets/sox_effect_test_args.jsonl @@ -0,0 +1,78 @@ +{"effects": [["allpass", "300", "10"]]} +{"effects": [["band", "300", "10"]]} +{"effects": [["bandpass", "300", "10"]]} +{"effects": [["bandreject", "300", "10"]]} +{"effects": [["bass", "-10"]]} +{"effects": [["biquad", "0.4", "0.2", "0.9", "0.7", "0.2", "0.6"]]} +{"effects": [["chorus", "0.7", "0.9", "55", "0.4", "0.25", "2", "-t"]]} +{"effects": [["chorus", "0.6", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "1.3", "-s"]]} +{"effects": [["chorus", "0.5", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "2.3", "-t", "40", "0.3", "0.3", "1.3", "-s"]]} +{"effects": [["channels", "1"]]} +{"effects": [["channels", "2"]]} +{"effects": [["channels", "3"]]} +{"effects": [["compand", "0.3,1", "6:-70,-60,-20", "-5", "-90", "0.2"]]} +{"effects": [["compand", ".1,.2", "-inf,-50.1,-inf,-50,-50", "0", "-90", ".1"]]} +{"effects": [["compand", ".1,.1", "-45.1,-45,-inf,0,-inf", "45", "-90", ".1"]]} +{"effects": [["contrast", "0"]]} +{"effects": [["contrast", "25"]]} +{"effects": [["contrast", "50"]]} +{"effects": [["contrast", "75"]]} +{"effects": [["contrast", "100"]]} +{"effects": [["dcshift", "1.0"]]} +{"effects": [["dcshift", "-1.0"]]} +{"effects": [["deemph"]], "input_sample_rate": 44100} +{"effects": [["dither", "-s"]]} +{"effects": [["dither", "-S"]]} +{"effects": [["divide"]]} +{"effects": [["downsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 4000} +{"effects": [["earwax"]], "input_sample_rate": 44100} +{"effects": [["echo", "0.8", "0.88", "60", "0.4"]]} +{"effects": [["echo", "0.8", "0.88", "6", "0.4"]]} +{"effects": [["echo", "0.8", "0.9", "1000", "0.3"]]} +{"effects": [["echo", "0.8", "0.9", "1000", "0.3", "1800", "0.25"]]} +{"effects": [["echos", "0.8", "0.7", "700", "0.25", "700", "0.3"]]} +{"effects": [["echos", "0.8", "0.7", "700", "0.25", "900", "0.3"]]} +{"effects": [["echos", "0.8", "0.7", "40", "0.25", "63", "0.3"]]} +{"effects": [["equalizer", "300", "10", "5"]]} +{"effects": [["fade", "q", "3"]]} +{"effects": [["fade", "h", "3"]]} +{"effects": [["fade", "t", "3"]]} +{"effects": [["fade", "l", "3"]]} +{"effects": [["fade", "p", "3"]]} +{"effects": [["fir", "0.0195", "-0.082", "0.234", "0.891", "-0.145", "0.043"]]} +{"effects": [["fir", "/sox_effect_test_fir_coeffs.txt"]]} +{"effects": [["flanger"]]} +{"effects": [["gain", "-l", "-6"]]} +{"effects": [["highpass", "-1", "300"]]} +{"effects": [["highpass", "-2", "300"]]} +{"effects": [["hilbert"]]} +{"effects": [["loudness"]]} +{"effects": [["lowpass", "-1", "300"]]} +{"effects": [["lowpass", "-2", "300"]]} +{"effects": [["mcompand", "0.005,0.1 -47,-40,-34,-34,-17,-33", "100", "0.003,0.05 -47,-40,-34,-34,-17,-33", "400", "0.000625,0.0125 -47,-40,-34,-34,-15,-33", "1600", "0.0001,0.025 -47,-40,-34,-34,-31,-31,-0,-30", "6400", "0,0.025 -38,-31,-28,-28,-0,-25"]], "input_sample_rate": 
44100} +{"effects": [["oops"]]} +{"effects": [["overdrive"]]} +{"effects": [["pad"]]} +{"effects": [["phaser"]]} +{"effects": [["remix", "6", "7", "8", "0"]], "num_channels": 8} +{"effects": [["remix", "1-3,7", "3"]], "num_channels": 8} +{"effects": [["repeat"]]} +{"effects": [["reverb"]]} +{"effects": [["reverse"]]} +{"effects": [["riaa"]], "input_sample_rate": 44100} +{"effects": [["silence", "0"]]} +{"effects": [["speed", "1.3"]], "input_sample_rate": 4000, "output_sample_rate": 5200} +{"effects": [["speed", "0.7"]], "input_sample_rate": 4000, "output_sample_rate": 2800} +{"effects": [["stat"]]} +{"effects": [["stats"]]} +{"effects": [["stretch"]]} +{"effects": [["swap"]]} +{"effects": [["synth"]]} +{"effects": [["tempo", "0.9"]]} +{"effects": [["tempo", "1.1"]]} +{"effects": [["treble", "3"]]} +{"effects": [["tremolo", "300", "40"]]} +{"effects": [["tremolo", "300", "50"]]} +{"effects": [["trim", "0", "0.1"]]} +{"effects": [["upsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 16000} +{"effects": [["vol", "3"]]} diff --git a/tests/unit/audio/backends/sox_io/save_test.py b/tests/unit/audio/backends/sox_io/save_test.py index 269c502a3..b07af70f2 100644 --- a/tests/unit/audio/backends/sox_io/save_test.py +++ b/tests/unit/audio/backends/sox_io/save_test.py @@ -164,8 +164,6 @@ class TestSave(TestSaveBase, unittest.TestCase): [ ("float32",), ("int32",), - ("int16",), - ("uint8",), ], ) def test_save_wav_dtype(self, test_mode, params): diff --git a/tests/unit/audio/backends/sox_io/smoke_test.py b/tests/unit/audio/backends/sox_io/smoke_test.py new file mode 100644 index 000000000..1f191bc51 --- /dev/null +++ b/tests/unit/audio/backends/sox_io/smoke_test.py @@ -0,0 +1,183 @@ +import io +import itertools +import unittest + +from parameterized import parameterized +from paddlespeech.audio.backends import sox_io_backend +from tests.unit.common_utils import ( + get_wav_data, + TempDirMixin, + name_func +) + +class SmokeTest(TempDirMixin, unittest.TestCase): + """Run smoke test on various audio format + + The purpose of this test suite is to verify that sox_io_backend functionalities do not exhibit + abnormal behaviors. + + This test suite should be able to run without any additional tools (such as sox command), + however without such tools, the correctness of each function cannot be verified. + """ + + def run_smoke_test(self, ext, sample_rate, num_channels, *, compression=None, dtype="float32"): + duration = 1 + num_frames = sample_rate * duration + #path = self.get_temp_path(f"test.{ext}") + path = self.get_temp_path(f"test.{ext}") + original = get_wav_data(dtype, num_channels, normalize=False, num_frames=num_frames) + + # 1. run save + sox_io_backend.save(path, original, sample_rate, compression=compression) + # 2. run info + info = sox_io_backend.info(path) + assert info.sample_rate == sample_rate + assert info.num_channels == num_channels + # 3. 
run load + loaded, sr = sox_io_backend.load(path, normalize=False) + assert sr == sample_rate + assert loaded.shape[0] == num_channels + + @parameterized.expand( + list( + itertools.product( + ["float32", "int32" ], + #["float32", "int32", "int16", "uint8"], + [8000, 16000], + [1, 2], + ) + ), + name_func=name_func, + ) + def test_wav(self, dtype, sample_rate, num_channels): + """Run smoke test on wav format""" + self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype) + + #@parameterized.expand( + #list( + #itertools.product( + #[8000, 16000], + #[1, 2], + #[-4.2, -0.2, 0, 0.2, 96, 128, 160, 192, 224, 256, 320], + #) + #) + #) + #def test_mp3(self, sample_rate, num_channels, bit_rate): + #"""Run smoke test on mp3 format""" + #self.run_smoke_test("mp3", sample_rate, num_channels, compression=bit_rate) + + #@parameterized.expand( + #list( + #itertools.product( + #[8000, 16000], + #[1, 2], + #[-1, 0, 1, 2, 3, 3.6, 5, 10], + #) + #) + #) + #def test_vorbis(self, sample_rate, num_channels, quality_level): + #"""Run smoke test on vorbis format""" + #self.run_smoke_test("vorbis", sample_rate, num_channels, compression=quality_level) + + @parameterized.expand( + list( + itertools.product( + [8000, 16000], + [1, 2], + list(range(9)), + ) + ), + name_func=name_func, + ) + def test_flac(self, sample_rate, num_channels, compression_level): + """Run smoke test on flac format""" + self.run_smoke_test("flac", sample_rate, num_channels, compression=compression_level) + + +class SmokeTestFileObj(unittest.TestCase): + """Run smoke test on various audio format + + The purpose of this test suite is to verify that sox_io_backend functionalities do not exhibit + abnormal behaviors. + + This test suite should be able to run without any additional tools (such as sox command), + however without such tools, the correctness of each function cannot be verified. + """ + + def run_smoke_test(self, ext, sample_rate, num_channels, *, compression=None, dtype="float32"): + duration = 1 + num_frames = sample_rate * duration + original = get_wav_data(dtype, num_channels, normalize=False, num_frames=num_frames) + + fileobj = io.BytesIO() + # 1. run save + sox_io_backend.save(fileobj, original, sample_rate, compression=compression, format=ext) + # 2. run info + fileobj.seek(0) + info = sox_io_backend.info(fileobj, format=ext) + assert info.sample_rate == sample_rate + assert info.num_channels == num_channels + # 3. 
run load +        fileobj.seek(0) +        loaded, sr = sox_io_backend.load(fileobj, normalize=False, format=ext) +        assert sr == sample_rate +        assert loaded.shape[0] == num_channels + +    @parameterized.expand( +        list( +            itertools.product( +                ["float32", "int32"], +                [8000, 16000], +                [1, 2], +            ) +        ), +        name_func=name_func, +    ) +    def test_wav(self, dtype, sample_rate, num_channels): +        """Run smoke test on wav format""" +        self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype) + +    # not support yet +    #@parameterized.expand( +        #list( +            #itertools.product( +                #[8000, 16000], +                #[1, 2], +                #[-4.2, -0.2, 0, 0.2, 96, 128, 160, 192, 224, 256, 320], +            #) +        #) +    #) +    #def test_mp3(self, sample_rate, num_channels, bit_rate): +        #"""Run smoke test on mp3 format""" +        #self.run_smoke_test("mp3", sample_rate, num_channels, compression=bit_rate) + +    #@parameterized.expand( +        #list( +            #itertools.product( +                #[8000, 16000], +                #[1, 2], +                #[-1, 0, 1, 2, 3, 3.6, 5, 10], +            #) +        #) +    #) +    #def test_vorbis(self, sample_rate, num_channels, quality_level): +        #"""Run smoke test on vorbis format""" +        #self.run_smoke_test("vorbis", sample_rate, num_channels, compression=quality_level) + +    @parameterized.expand( +        list( +            itertools.product( +                [8000, 16000], +                [1, 2], +                list(range(9)), +            ) +        ), +        name_func=name_func, +    ) +    def test_flac(self, sample_rate, num_channels, compression_level): +        """Run smoke test on flac format""" +        self.run_smoke_test("flac", sample_rate, num_channels, compression=compression_level) + +if __name__ == '__main__': +    #test_func() +    unittest.main() diff --git a/tests/unit/audio/backends/sox_io/sox_effect_test.py b/tests/unit/audio/backends/sox_io/sox_effect_test.py new file mode 100644 index 000000000..63c632ad1 --- /dev/null +++ b/tests/unit/audio/backends/sox_io/sox_effect_test.py @@ -0,0 +1,346 @@ +import io +import itertools +import tarfile +import unittest +from pathlib import Path +import numpy as np + +from parameterized import parameterized +from paddlespeech.audio import sox_effects +from paddlespeech.audio._internal import module_utils as _mod_utils +from tests.unit.common_utils import ( +    get_sinusoid, +    get_wav_data, +    load_wav, +    save_wav, +    sox_utils, +    TempDirMixin, +    name_func, +    load_effects_params +) + +if _mod_utils.is_module_available("requests"): +    import requests + + +class TestSoxEffects(unittest.TestCase): +    def test_init(self): +        """Calling init_sox_effects multiple times does not crash""" +        for _ in range(3): +            sox_effects.init_sox_effects() + + +class TestSoxEffectsTensor(TempDirMixin, unittest.TestCase): +    """Test suite for `apply_effects_tensor` function""" + +    @parameterized.expand( +        list(itertools.product(["float32", "int32"], [8000, 16000], [1, 2, 4, 8], [True, False])), +    ) +    def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first): +        """`apply_effects_tensor` without effects should return identical data as input""" +        original = get_wav_data(dtype, num_channels, channels_first=channels_first) +        expected = original.clone() + +        found, output_sample_rate = sox_effects.apply_effects_tensor(expected, sample_rate, [], channels_first) + +        assert (output_sample_rate == sample_rate) +        # SoxEffect should not alter the input Tensor object +        #self.assertEqual(original, expected) +        np.testing.assert_array_almost_equal(original.numpy(), expected.numpy()) + +        # SoxEffect should not return the same Tensor object +        assert expected is not found +        # Returned Tensor should equal to the input Tensor +        #self.assertEqual(expected, found) + 
np.testing.assert_array_almost_equal(expected.numpy(), found.numpy()) + + @parameterized.expand( + load_effects_params("sox_effect_test_args.jsonl"), + name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}', + ) + def test_apply_effects(self, args): + """`apply_effects_tensor` should return identical data as sox command""" + effects = args["effects"] + num_channels = args.get("num_channels", 2) + input_sr = args.get("input_sample_rate", 8000) + output_sr = args.get("output_sample_rate") + + input_path = self.get_temp_path("input.wav") + reference_path = self.get_temp_path("reference.wav") + + original = get_sinusoid(frequency=800, sample_rate=input_sr, n_channels=num_channels, dtype="float32") + save_wav(input_path, original, input_sr) + sox_utils.run_sox_effect(input_path, reference_path, effects, output_sample_rate=output_sr) + + expected, expected_sr = load_wav(reference_path) + found, sr = sox_effects.apply_effects_tensor(original, input_sr, effects) + + assert sr == expected_sr + #self.assertEqual(expected, found) + np.testing.assert_array_almost_equal(expected.numpy(), found.numpy()) + + +class TestSoxEffectsFile(TempDirMixin, unittest.TestCase): + """Test suite for `apply_effects_file` function""" + + @parameterized.expand( + list( + itertools.product( + ["float32", "int32"], + [8000, 16000], + [1, 2, 4, 8], + [False, True], + ) + ), + #name_func=name_func, + ) + def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first): + """`apply_effects_file` without effects should return identical data as input""" + path = self.get_temp_path("input.wav") + expected = get_wav_data(dtype, num_channels, channels_first=channels_first) + save_wav(path, expected, sample_rate, channels_first=channels_first) + + found, output_sample_rate = sox_effects.apply_effects_file( + path, [], normalize=False, channels_first=channels_first + ) + + assert output_sample_rate == sample_rate + #self.assertEqual(expected, found) + np.testing.assert_array_almost_equal(expected.numpy(), found.numpy()) + + @parameterized.expand( + load_effects_params("sox_effect_test_args.jsonl"), + #name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}', + ) + def test_apply_effects_str(self, args): + """`apply_effects_file` should return identical data as sox command""" + dtype = "int32" + channels_first = True + effects = args["effects"] + num_channels = args.get("num_channels", 2) + input_sr = args.get("input_sample_rate", 8000) + output_sr = args.get("output_sample_rate") + + input_path = self.get_temp_path("input.wav") + reference_path = self.get_temp_path("reference.wav") + data = get_wav_data(dtype, num_channels, channels_first=channels_first) + save_wav(input_path, data, input_sr, channels_first=channels_first) + sox_utils.run_sox_effect(input_path, reference_path, effects, output_sample_rate=output_sr) + + expected, expected_sr = load_wav(reference_path) + found, sr = sox_effects.apply_effects_file(input_path, effects, normalize=False, channels_first=channels_first) + + assert sr == expected_sr + #self.assertEqual(found, expected) + np.testing.assert_array_almost_equal(expected.numpy(), found.numpy()) + + + def test_apply_effects_path(self): + """`apply_effects_file` should return identical data as sox command when file path is given as a Path Object""" + dtype = "int32" + channels_first = True + effects = [["hilbert"]] + num_channels = 2 + input_sr = 8000 + output_sr = 8000 + + input_path = self.get_temp_path("input.wav") + reference_path = 
self.get_temp_path("reference.wav") + data = get_wav_data(dtype, num_channels, channels_first=channels_first) + save_wav(input_path, data, input_sr, channels_first=channels_first) + sox_utils.run_sox_effect(input_path, reference_path, effects, output_sample_rate=output_sr) + + expected, expected_sr = load_wav(reference_path) + found, sr = sox_effects.apply_effects_file( + Path(input_path), effects, normalize=False, channels_first=channels_first + ) + + assert sr == expected_sr + #self.assertEqual(found, expected) + np.testing.assert_array_almost_equal(expected.numpy(), found.numpy()) + + +class TestFileFormats(TempDirMixin, unittest.TestCase): + """`apply_effects_file` gives the same result as sox on various file formats""" + + @parameterized.expand( + list( + itertools.product( + ["float32", "int32"], + [8000, 16000], + [1, 2], + ) + ), + #name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}', + ) + def test_wav(self, dtype, sample_rate, num_channels): + """`apply_effects_file` works on various wav format""" + channels_first = True + effects = [["band", "300", "10"]] + + input_path = self.get_temp_path("input.wav") + reference_path = self.get_temp_path("reference.wav") + data = get_wav_data(dtype, num_channels, channels_first=channels_first) + save_wav(input_path, data, sample_rate, channels_first=channels_first) + sox_utils.run_sox_effect(input_path, reference_path, effects) + + expected, expected_sr = load_wav(reference_path) + found, sr = sox_effects.apply_effects_file(input_path, effects, normalize=False, channels_first=channels_first) + + assert sr == expected_sr + #self.assertEqual(found, expected) + np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) + + #not support now + #@parameterized.expand( + #list( + #itertools.product( + #[8000, 16000], + #[1, 2], + #) + #), + ##name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}', + #) + #def test_flac(self, sample_rate, num_channels): + #"""`apply_effects_file` works on various flac format""" + #channels_first = True + #effects = [["band", "300", "10"]] + + #input_path = self.get_temp_path("input.flac") + #reference_path = self.get_temp_path("reference.wav") + #sox_utils.gen_audio_file(input_path, sample_rate, num_channels) + #sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32) + + #expected, expected_sr = load_wav(reference_path) + #found, sr = sox_effects.apply_effects_file(input_path, effects, channels_first=channels_first) + #save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first) + + #assert sr == expected_sr + ##self.assertEqual(found, expected) + #np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) + + #@parameterized.expand( + #list( + #itertools.product( + #[8000, 16000], + #[1, 2], + #) + #), + ##name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}', + #) + #def test_vorbis(self, sample_rate, num_channels): + #"""`apply_effects_file` works on various vorbis format""" + #channels_first = True + #effects = [["band", "300", "10"]] + + #input_path = self.get_temp_path("input.vorbis") + #reference_path = self.get_temp_path("reference.wav") + #sox_utils.gen_audio_file(input_path, sample_rate, num_channels) + #sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32) + + #expected, expected_sr = load_wav(reference_path) + #found, sr = sox_effects.apply_effects_file(input_path, effects, channels_first=channels_first) + 
#save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first) + + #assert sr == expected_sr + ##self.assertEqual(found, expected) + #np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) + + +#@skipIfNoExec("sox") +#@skipIfNoSox +class TestFileObject(TempDirMixin, unittest.TestCase): + @parameterized.expand( + [ + ("wav", None), + ] + ) + def test_fileobj(self, ext, compression): + """Applying effects via file object works""" + sample_rate = 16000 + channels_first = True + effects = [["band", "300", "10"]] + input_path = self.get_temp_path(f"input.{ext}") + reference_path = self.get_temp_path("reference.wav") + + #sox_utils.gen_audio_file(input_path, sample_rate, num_channels=2, compression=compression) + data = get_wav_data("int32", 2, channels_first=channels_first) + save_wav(input_path, data, sample_rate, channels_first=channels_first) + + sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32) + expected, expected_sr = load_wav(reference_path) + + with open(input_path, "rb") as fileobj: + found, sr = sox_effects.apply_effects_file(fileobj, effects, channels_first=channels_first) + save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first) + assert sr == expected_sr + #self.assertEqual(found, expected) + np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) + + @parameterized.expand( + [ + ("wav", None), + ] + ) + def test_bytesio(self, ext, compression): + """Applying effects via BytesIO object works""" + sample_rate = 16000 + channels_first = True + effects = [["band", "300", "10"]] + input_path = self.get_temp_path(f"input.{ext}") + reference_path = self.get_temp_path("reference.wav") + + #sox_utils.gen_audio_file(input_path, sample_rate, num_channels=2, compression=compression) + data = get_wav_data("int32", 2, channels_first=channels_first) + save_wav(input_path, data, sample_rate, channels_first=channels_first) + sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32) + expected, expected_sr = load_wav(reference_path) + + with open(input_path, "rb") as file_: + fileobj = io.BytesIO(file_.read()) + found, sr = sox_effects.apply_effects_file(fileobj, effects, channels_first=channels_first) + save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first) + assert sr == expected_sr + #self.assertEqual(found, expected) + print("found") + print(found) + print("expected") + print(expected) + np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) + + @parameterized.expand( + [ + ("wav", None), + ] + ) + def test_tarfile(self, ext, compression): + """Applying effects to compressed audio via file-like file works""" + sample_rate = 16000 + channels_first = True + effects = [["band", "300", "10"]] + audio_file = f"input.{ext}" + + input_path = self.get_temp_path(audio_file) + reference_path = self.get_temp_path("reference.wav") + archive_path = self.get_temp_path("archive.tar.gz") + data = get_wav_data("int32", 2, channels_first=channels_first) + save_wav(input_path, data, sample_rate, channels_first=channels_first) + + # sox_utils.gen_audio_file(input_path, sample_rate, num_channels=2, compression=compression) + sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32) + + expected, expected_sr = load_wav(reference_path) + + with tarfile.TarFile(archive_path, "w") as tarobj: + tarobj.add(input_path, arcname=audio_file) + with tarfile.TarFile(archive_path, "r") as tarobj: + fileobj = 
tarobj.extractfile(audio_file) + found, sr = sox_effects.apply_effects_file(fileobj, effects, channels_first=channels_first) + save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first) + assert sr == expected_sr + #self.assertEqual(found, expected) + np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/unit/common_utils/__init__.py b/tests/unit/common_utils/__init__.py index 722a9789f..7bc718f38 100644 --- a/tests/unit/common_utils/__init__.py +++ b/tests/unit/common_utils/__init__.py @@ -1,7 +1,9 @@ from .wav_utils import get_wav_data, load_wav, save_wav, normalize_wav -from .parameterized_utils import load_params, nested_params +from .parameterized_utils import nested_params +from .data_utils import get_sinusoid, load_params, load_effects_params from .case_utils import ( - TempDirMixin + TempDirMixin, + name_func ) __all__ = [ @@ -11,4 +13,7 @@ __all__ = [ "normalize_wav", "load_params", "nested_params", + "get_sinusoid", + "name_func", + "load_effects_params" ] diff --git a/tests/unit/common_utils/case_utils.py b/tests/unit/common_utils/case_utils.py index cee2f29c8..6f4326f56 100644 --- a/tests/unit/common_utils/case_utils.py +++ b/tests/unit/common_utils/case_utils.py @@ -14,6 +14,9 @@ from paddlespeech.audio._internal.module_utils import ( is_sox_available, ) +def name_func(func, _, params): + return f'{func.__name__}_{"_".join(str(arg) for arg in params.args)}' + class TempDirMixin: """Mixin to provide easy access to temp dir""" diff --git a/tests/unit/common_utils/parameterized_utils.py b/tests/unit/common_utils/parameterized_utils.py index 95af65c84..46cef3127 100644 --- a/tests/unit/common_utils/parameterized_utils.py +++ b/tests/unit/common_utils/parameterized_utils.py @@ -1,15 +1,28 @@ import json from itertools import product +import os from parameterized import param, parameterized -def get_asset_path(*paths): - """Return full path of a test asset""" - return os.path.join(_TEST_DIR_PATH, "assets", *paths) - -def load_params(*paths): - with open(get_asset_path(*paths), "r") as file: - return [param(json.loads(line)) for line in file] +#def get_asset_path(*paths): + #"""Return full path of a test asset""" + #return os.path.join(_TEST_DIR_PATH, "assets", *paths) + +#def load_params(*paths): + #with open(get_asset_path(*paths), "r") as file: + #return [param(json.loads(line)) for line in file] + +#def load_effects_params(*paths): + #params = [] + #with open(get_asset_path(*paths), "r") as file: + #for line in file: + #data = json.loads(line) + #for effect in data["effects"]: + #for i, arg in enumerate(effect): + #if arg.startswith(""): + #effect[i] = arg.replace("", get_asset_path()) + #params.append(param(data)) + #return params def _name_func(func, _, params): strs = [] diff --git a/tests/unit/common_utils/wav_utils.py b/tests/unit/common_utils/wav_utils.py index dbdd453e0..25d0b1971 100644 --- a/tests/unit/common_utils/wav_utils.py +++ b/tests/unit/common_utils/wav_utils.py @@ -2,6 +2,7 @@ from typing import Optional import scipy.io.wavfile import paddle +import numpy as np def normalize_wav(tensor: paddle.Tensor) -> paddle.Tensor: if tensor.dtype == paddle.float32: @@ -52,8 +53,14 @@ def get_wav_data( # paddle linspace not support uint8, int8, int16 #if dtype == "uint8": # base = paddle.linspace(0, 255, num_frames, dtype=dtype_) + #dtype_np = getattr(np, dtype) + #base_np = np.linspace(0, 255, num_frames, dtype_np) + #base = 
paddle.to_tensor(base_np, dtype=dtype_) #elif dtype == "int8": # base = paddle.linspace(-128, 127, num_frames, dtype=dtype_) + #dtype_np = getattr(np, dtype) + #base_np = np.linspace(-128, 127, num_frames, dtype_np) + #base = paddle.to_tensor(base_np, dtype=dtype_) if dtype == "float32": base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) elif dtype == "float64": @@ -62,6 +69,9 @@ def get_wav_data( base = paddle.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_) #elif dtype == "int16": # base = paddle.linspace(-32768, 32767, num_frames, dtype=dtype_) + #dtype_np = getattr(np, dtype) + #base_np = np.linspace(-32768, 32767, num_frames, dtype_np) + #base = paddle.to_tensor(base_np, dtype=dtype_) else: raise NotImplementedError(f"Unsupported dtype {dtype}") data = base.tile([num_channels, 1]) From 59edc643694529ba9ebc3402225a52ab3ebc5e15 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Mon, 15 Aug 2022 14:51:55 +0800 Subject: [PATCH 09/11] add copyrigh --- paddlespeech/audio/sox_effects/sox_effects.py | 28 +++++++++---------- .../audio/backends/sox_io/sox_effect_test.py | 1 + 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/paddlespeech/audio/sox_effects/sox_effects.py b/paddlespeech/audio/sox_effects/sox_effects.py index 17d2d95af..a984d2925 100644 --- a/paddlespeech/audio/sox_effects/sox_effects.py +++ b/paddlespeech/audio/sox_effects/sox_effects.py @@ -63,8 +63,6 @@ def apply_effects_tensor( .. devices:: CPU - .. properties:: TorchScript - Note: This function only works on CPU Tensors. This function works in the way very similar to ``sox`` command, however there are slight @@ -74,7 +72,7 @@ def apply_effects_tensor( need to give ``rate`` effect with desired sampling rate.). Args: - tensor (torch.Tensor): Input 2D CPU Tensor. + tensor (paddle.Tensor): Input 2D CPU Tensor. sample_rate (int): Sample rate effects (List[List[str]]): List of effects. channels_first (bool, optional): Indicates if the input Tensor's dimension is @@ -98,9 +96,9 @@ def apply_effects_tensor( >>> # Generate pseudo wave: >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second >>> sample_rate = 16000 - >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1 + >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1 >>> waveform.shape - torch.Size([2, 16000]) + paddle.Size([2, 16000]) >>> waveform tensor([[ 0.3138, 0.7620, -0.9019, ..., -0.7495, -0.4935, 0.5442], [-0.0832, 0.0061, 0.8233, ..., -0.5176, -0.9140, -0.2434]]) @@ -113,7 +111,7 @@ def apply_effects_tensor( >>> # The new waveform is sampling rate 8000, 1 second. >>> # normalization and channel order are preserved >>> waveform.shape - torch.Size([2, 8000]) + paddle.Size([2, 8000]) >>> waveform tensor([[ 0.5054, -0.5518, -0.4800, ..., -0.0076, 0.0096, -0.0110], [ 0.1331, 0.0436, -0.3783, ..., -0.0035, 0.0012, 0.0008]]) @@ -122,17 +120,17 @@ def apply_effects_tensor( Example - Torchscript-able transform >>> - >>> # Use `apply_effects_tensor` in `torch.nn.Module` and dump it to file, + >>> # Use `apply_effects_tensor` in `paddle.nn.Module` and dump it to file, >>> # then run sox effect via Torchscript runtime. >>> - >>> class SoxEffectTransform(torch.nn.Module): + >>> class SoxEffectTransform(paddle.nn.Module): ... effects: List[List[str]] ... ... def __init__(self, effects: List[List[str]]): ... super().__init__() ... self.effects = effects ... - ... def forward(self, tensor: torch.Tensor, sample_rate: int): + ... def forward(self, tensor: paddle.Tensor, sample_rate: int): ... return sox_effects.apply_effects_tensor( ... 
tensor, sample_rate, self.effects) ... @@ -146,8 +144,8 @@ def apply_effects_tensor( >>> >>> # Dump it to file and load >>> path = 'sox_effect.zip' - >>> torch.jit.script(trans).save(path) - >>> transform = torch.jit.load(path) + >>> paddle.jit.script(trans).save(path) + >>> transform = paddle.jit.load(path) >>> >>>> # Run transform >>> waveform, input_sample_rate = paddleaudio.load("input.wav") @@ -186,7 +184,7 @@ def apply_effects_file( Args: path (path-like object or file-like object): Source of audio data. When the function is not compiled by TorchScript, - (e.g. ``torch.jit.script``), the following types are accepted: + (e.g. ``paddle.jit.script``), the following types are accepted: * ``path-like``: file path * ``file-like``: Object with ``read(size: int) -> bytes`` method, @@ -232,7 +230,7 @@ def apply_effects_file( >>> >>> # Check the result >>> waveform.shape - torch.Size([2, 8000]) + paddle.Size([2, 8000]) >>> waveform tensor([[ 5.1151e-03, 1.8073e-02, 2.2188e-02, ..., 1.0431e-07, -1.4761e-07, 1.8114e-07], @@ -244,7 +242,7 @@ def apply_effects_file( Example - Apply random speed perturbation to dataset >>> >>> # Load data from file, apply random speed perturbation - >>> class RandomPerturbationFile(torch.utils.data.Dataset): + >>> class RandomPerturbationFile(paddle.utils.data.Dataset): ... \"\"\"Given flist, apply random speed perturbation ... ... Suppose all the input files are at least one second long. @@ -272,7 +270,7 @@ def apply_effects_file( ... return len(self.flist) ... >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000) - >>> loader = torch.utils.data.DataLoader(dataset, batch_size=32) + >>> loader = paddle.utils.data.DataLoader(dataset, batch_size=32) >>> for batch in loader: >>> pass """ diff --git a/tests/unit/audio/backends/sox_io/sox_effect_test.py b/tests/unit/audio/backends/sox_io/sox_effect_test.py index 63c632ad1..d9c70bc5e 100644 --- a/tests/unit/audio/backends/sox_io/sox_effect_test.py +++ b/tests/unit/audio/backends/sox_io/sox_effect_test.py @@ -1,3 +1,4 @@ +#code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/sox_effect/sox_effect_test.py import io import itertools import tarfile From 7abfad804ec79f811f78a9a0ce46db18394782be Mon Sep 17 00:00:00 2001 From: YangZhou Date: Mon, 15 Aug 2022 16:06:17 +0800 Subject: [PATCH 10/11] fix typo --- paddlespeech/audio/sox_effects/sox_effects.py | 48 ------------------- paddlespeech/audio/src/pybind/sox/io.cpp | 4 ++ paddlespeech/audio/src/pybind/sox/types.h | 2 +- tests/unit/audio/backends/sox_io/save_test.py | 6 +-- tests/unit/common_utils/case_utils.py | 6 ++- 5 files changed, 12 insertions(+), 54 deletions(-) diff --git a/paddlespeech/audio/sox_effects/sox_effects.py b/paddlespeech/audio/sox_effects/sox_effects.py index a984d2925..e9b839c1a 100644 --- a/paddlespeech/audio/sox_effects/sox_effects.py +++ b/paddlespeech/audio/sox_effects/sox_effects.py @@ -118,39 +118,6 @@ def apply_effects_tensor( >>> sample_rate 8000 - Example - Torchscript-able transform - >>> - >>> # Use `apply_effects_tensor` in `paddle.nn.Module` and dump it to file, - >>> # then run sox effect via Torchscript runtime. - >>> - >>> class SoxEffectTransform(paddle.nn.Module): - ... effects: List[List[str]] - ... - ... def __init__(self, effects: List[List[str]]): - ... super().__init__() - ... self.effects = effects - ... - ... def forward(self, tensor: paddle.Tensor, sample_rate: int): - ... return sox_effects.apply_effects_tensor( - ... tensor, sample_rate, self.effects) - ... - ... 
- >>> # Create transform object - >>> effects = [ - ... ["lowpass", "-1", "300"], # apply single-pole lowpass filter - ... ["rate", "8000"], # change sample rate to 8000 - ... ] - >>> transform = SoxEffectTensorTransform(effects, input_sample_rate) - >>> - >>> # Dump it to file and load - >>> path = 'sox_effect.zip' - >>> paddle.jit.script(trans).save(path) - >>> transform = paddle.jit.load(path) - >>> - >>>> # Run transform - >>> waveform, input_sample_rate = paddleaudio.load("input.wav") - >>> waveform, sample_rate = transform(waveform, input_sample_rate) - >>> assert sample_rate == 8000 """ tensor_np = tensor.numpy() ret = paddleaudio.sox_effects_apply_effects_tensor(tensor_np, sample_rate, effects, channels_first) @@ -169,10 +136,6 @@ def apply_effects_file( ) -> Tuple[paddle.Tensor, int]: """Apply sox effects to the audio file and load the resulting data as Tensor - .. devices:: CPU - - .. properties:: TorchScript - Note: This function works in the way very similar to ``sox`` command, however there are slight differences. For example, ``sox`` commnad adds certain effects automatically (such as @@ -183,17 +146,6 @@ def apply_effects_file( Args: path (path-like object or file-like object): - Source of audio data. When the function is not compiled by TorchScript, - (e.g. ``paddle.jit.script``), the following types are accepted: - - * ``path-like``: file path - * ``file-like``: Object with ``read(size: int) -> bytes`` method, - which returns byte string of at most ``size`` length. - - When the function is compiled by TorchScript, only ``str`` type is allowed. - - Note: This argument is intentionally annotated as ``str`` only for - TorchScript compiler compatibility. effects (List[List[str]]): List of effects. normalize (bool, optional): When ``True``, this function always return ``float32``, and sample values are diff --git a/paddlespeech/audio/src/pybind/sox/io.cpp b/paddlespeech/audio/src/pybind/sox/io.cpp index 4c27e6aab..78b8af991 100644 --- a/paddlespeech/audio/src/pybind/sox/io.cpp +++ b/paddlespeech/audio/src/pybind/sox/io.cpp @@ -136,12 +136,16 @@ void save_audio_file(const std::string& path, const auto num_channels = tensor.shape(channels_first ? 0 : 1); //TORCH_CHECK(num_channels == 1, // "amr-nb format only supports single channel audio."); + assert(num_channels == 1); } else if (filetype == "htk") { const auto num_channels = tensor.shape(channels_first ? 0 : 1); // TORCH_CHECK(num_channels == 1, // "htk format only supports single channel audio."); + assert(num_channels == 1); } else if (filetype == "gsm") { const auto num_channels = tensor.shape(channels_first ? 
0 : 1); + assert(num_channels == 1); + assert(sample_rate == 8000); //TORCH_CHECK(num_channels == 1, // "gsm format only supports single channel audio."); //TORCH_CHECK(sample_rate == 8000, diff --git a/paddlespeech/audio/src/pybind/sox/types.h b/paddlespeech/audio/src/pybind/sox/types.h index 824c0f632..780840161 100644 --- a/paddlespeech/audio/src/pybind/sox/types.h +++ b/paddlespeech/audio/src/pybind/sox/types.h @@ -55,4 +55,4 @@ BitDepth get_bit_depth_from_option(const tl::optional bit_depth); std::string get_encoding(sox_encoding_t encoding); } // namespace sox_utils -} // namespace torchaudio \ No newline at end of file +} // namespace paddleaudio \ No newline at end of file diff --git a/tests/unit/audio/backends/sox_io/save_test.py b/tests/unit/audio/backends/sox_io/save_test.py index b07af70f2..7942f018d 100644 --- a/tests/unit/audio/backends/sox_io/save_test.py +++ b/tests/unit/audio/backends/sox_io/save_test.py @@ -64,7 +64,7 @@ class TestSaveBase(TempDirMixin): | | | 2.1. load with scipy | 3.1. Convert to the target | then save it into the target | format depth with sox - | format with torchaudio | + | format with paddleaudio | v v target format target format | | @@ -83,7 +83,7 @@ class TestSaveBase(TempDirMixin): cmp_bit_depth = 32 src_path = self.get_temp_path("1.source.wav") - tgt_path = self.get_temp_path(f"2.1.torchaudio.{format}") + tgt_path = self.get_temp_path(f"2.1.paddleaudio.{format}") tst_path = self.get_temp_path("2.2.result.wav") sox_path = self.get_temp_path(f"3.1.sox.{format}") ref_path = self.get_temp_path("3.2.ref.wav") @@ -92,7 +92,7 @@ class TestSaveBase(TempDirMixin): data = get_wav_data(src_dtype, num_channels, normalize=False, num_frames=num_frames) save_wav(src_path, data, sample_rate) - # 2.1. Convert the original wav to target format with torchaudio + # 2.1. Convert the original wav to target format with paddleaudio data = load_wav(src_path, normalize=False)[0] if test_mode == "path": sox_io_backend.save( diff --git a/tests/unit/common_utils/case_utils.py b/tests/unit/common_utils/case_utils.py index 6f4326f56..406d293b6 100644 --- a/tests/unit/common_utils/case_utils.py +++ b/tests/unit/common_utils/case_utils.py @@ -7,6 +7,8 @@ import tempfile import time import unittest +#code is from:https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/common_utils/case_utils.py + import paddle from paddlespeech.audio._internal.module_utils import ( is_kaldi_available, @@ -24,9 +26,9 @@ class TempDirMixin: @classmethod def get_base_temp_dir(cls): - # If TORCHAUDIO_TEST_TEMP_DIR is set, use it instead of temporary directory. + # If PADDLEAUDIO_TEST_TEMP_DIR is set, use it instead of temporary directory. # this is handy for debugging. 
- key = "TORCHAUDIO_TEST_TEMP_DIR" + key = "PADDLEAUDIO_TEST_TEMP_DIR" if key in os.environ: return os.environ[key] if cls.temp_dir_ is None: From a55592f7c2199be2fb9e20af0a6254a5fcf32342 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Mon, 15 Aug 2022 19:57:29 +0800 Subject: [PATCH 11/11] modify info test --- paddlespeech/audio/backends/sox_io_backend.py | 2 +- paddlespeech/audio/src/pybind/sox/io.cpp | 10 +- paddlespeech/audio/src/pybind/sox/io.h | 6 +- tests/unit/audio/backends/sox_io/info_test.py | 300 ++++++++++++++++-- tests/unit/audio/backends/sox_io/testdata | 1 - 5 files changed, 289 insertions(+), 30 deletions(-) delete mode 120000 tests/unit/audio/backends/sox_io/testdata diff --git a/paddlespeech/audio/backends/sox_io_backend.py b/paddlespeech/audio/backends/sox_io_backend.py index 2037ad81d..fff9e2069 100644 --- a/paddlespeech/audio/backends/sox_io_backend.py +++ b/paddlespeech/audio/backends/sox_io_backend.py @@ -88,7 +88,7 @@ def save(filepath: str, ) @_mod_utils.requires_sox() -def info(filepath: str, format: Optional[str] = "") -> None: +def info(filepath: str, format: Optional[str] = None,) -> AudioMetaData: if hasattr(filepath, "read"): sinfo = paddleaudio.get_info_fileobj(filepath, format) if sinfo is not None: diff --git a/paddlespeech/audio/src/pybind/sox/io.cpp b/paddlespeech/audio/src/pybind/sox/io.cpp index 78b8af991..60f9222ab 100644 --- a/paddlespeech/audio/src/pybind/sox/io.cpp +++ b/paddlespeech/audio/src/pybind/sox/io.cpp @@ -13,13 +13,14 @@ using namespace paddleaudio::sox_utils; namespace paddleaudio { namespace sox_io { -auto get_info_file(const std::string &path, const std::string &format) +auto get_info_file(const std::string &path, + const tl::optional &format) -> std::tuple { SoxFormat sf( sox_open_read(path.data(), /*signal=*/nullptr, /*encoding=*/nullptr, - /*filetype=*/format.empty() ? nullptr : format.data())); + /*filetype=*/format.has_value() ? format.value().c_str() : nullptr)); validate_input_file(sf, path); @@ -61,7 +62,8 @@ std::vector> get_effects( return effects; } -auto get_info_fileobj(py::object fileobj, const std::string &format) +auto get_info_fileobj(py::object fileobj, + const tl::optional &format) -> std::tuple { const auto capacity = [&]() { const auto bufsiz = get_buffer_size(); @@ -80,7 +82,7 @@ auto get_info_fileobj(py::object fileobj, const std::string &format) buf_size, /*signal=*/nullptr, /*encoding=*/nullptr, - /*filetype=*/format.empty() ? nullptr : format.data())); + /*filetype=*/format.has_value() ? 
format.value().c_str() : nullptr)); // In case of streamed data, length can be 0 validate_input_memfile(sf); diff --git a/paddlespeech/audio/src/pybind/sox/io.h b/paddlespeech/audio/src/pybind/sox/io.h index 94ce18f22..3734bcb34 100644 --- a/paddlespeech/audio/src/pybind/sox/io.h +++ b/paddlespeech/audio/src/pybind/sox/io.h @@ -10,10 +10,12 @@ namespace py = pybind11; namespace paddleaudio { namespace sox_io { -auto get_info_file(const std::string &path, const std::string &format) +auto get_info_file(const std::string &path, + const tl::optional &format) -> std::tuple; -auto get_info_fileobj(py::object fileobj, const std::string &format) +auto get_info_fileobj(py::object fileobj, + const tl::optional &format) -> std::tuple; tl::optional> load_audio_fileobj( diff --git a/tests/unit/audio/backends/sox_io/info_test.py b/tests/unit/audio/backends/sox_io/info_test.py index ae18a29ef..06aa54d25 100644 --- a/tests/unit/audio/backends/sox_io/info_test.py +++ b/tests/unit/audio/backends/sox_io/info_test.py @@ -1,34 +1,290 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import unittest +import itertools +import tarfile +from contextlib import contextmanager import numpy as np import paddle +import os +import io +from parameterized import parameterized from paddlespeech.audio.backends import sox_io_backend -class TestInfo(unittest.TestCase): +from tests.unit.common_utils import ( + get_wav_data, + load_wav, + save_wav, + TempDirMixin, + sox_utils, + data_utils +) - def test_wav(self, dtype, sample_rate, num_channels, sample_size): - """check wav file correctly """ - path = 'testdata/test.wav' - info = sox_io_backend.get_info_file(path) +from common import get_encoding, get_bits_per_sample + +#code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/info_test.py + +class TestInfo(TempDirMixin, unittest.TestCase): + @parameterized.expand( + list( + itertools.product( + ["float32", "int32",], + [8000, 16000], + [1, 2], + ) + ), + ) + def test_wav(self, dtype, sample_rate, num_channels): + """`sox_io_backend.info` can check wav file correctly""" + duration = 1 + path = self.get_temp_path("data.wav") + data = get_wav_data(dtype, num_channels, normalize=False, num_frames=duration * sample_rate) + save_wav(path, data, sample_rate) + info = sox_io_backend.info(path) + assert info.sample_rate == sample_rate + assert info.num_frames == sample_rate * duration + assert info.num_channels == num_channels + assert info.bits_per_sample == sox_utils.get_bit_depth(dtype) + assert info.encoding == get_encoding("wav", dtype) + + @parameterized.expand( + list( + itertools.product( + ["float32", "int32"], + [8000, 16000], + [4, 8, 16, 32], + ) + ), + ) + def test_wav_multiple_channels(self, dtype, sample_rate, num_channels): + """`sox_io_backend.info` can check wav file with channels more than 2 correctly""" + duration = 1 + path = self.get_temp_path("data.wav") + data = get_wav_data(dtype, num_channels, 
normalize=False, num_frames=duration * sample_rate) + save_wav(path, data, sample_rate) + info = sox_io_backend.info(path) + assert info.sample_rate == sample_rate + assert info.num_frames == sample_rate * duration + assert info.num_channels == num_channels + assert info.bits_per_sample == sox_utils.get_bit_depth(dtype) + + def test_ulaw(self): + """`sox_io_backend.info` can check ulaw file correctly""" + duration = 1 + num_channels = 1 + sample_rate = 8000 + path = self.get_temp_path("data.wav") + sox_utils.gen_audio_file( + path, sample_rate=sample_rate, num_channels=num_channels, bit_depth=8, encoding="u-law", duration=duration + ) + info = sox_io_backend.info(path) + assert info.sample_rate == sample_rate + assert info.num_frames == sample_rate * duration + assert info.num_channels == num_channels + assert info.bits_per_sample == 8 + assert info.encoding == "ULAW" + + def test_alaw(self): + """`sox_io_backend.info` can check alaw file correctly""" + duration = 1 + num_channels = 1 + sample_rate = 8000 + path = self.get_temp_path("data.wav") + sox_utils.gen_audio_file( + path, sample_rate=sample_rate, num_channels=num_channels, bit_depth=8, encoding="a-law", duration=duration + ) + info = sox_io_backend.info(path) assert info.sample_rate == sample_rate - assert info.num_frames == sample_size # duration*sample_rate + assert info.num_frames == sample_rate * duration assert info.num_channels == num_channels - assert info.bits_per_sample == get_bit_depth(dtype) - assert info.encoding == get_encoding('wav', dtype) - + assert info.bits_per_sample == 8 + assert info.encoding == "ALAW" + +#class TestInfoOpus(unittest.TestCase): + #@parameterized.expand( + #list( + #itertools.product( + #["96k"], + #[1, 2], + #[0, 5, 10], + #) + #), + #) + #def test_opus(self, bitrate, num_channels, compression_level): + #"""`sox_io_backend.info` can check opus file correcty""" + #path = data_utils.get_asset_path("io", f"{bitrate}_{compression_level}_{num_channels}ch.opus") + #info = sox_io_backend.info(path) + #assert info.sample_rate == 48000 + #assert info.num_frames == 32768 + #assert info.num_channels == num_channels + #assert info.bits_per_sample == 0 # bit_per_sample is irrelevant for compressed formats + #assert info.encoding == "OPUS" + +class FileObjTestBase(TempDirMixin): + def _gen_file(self, ext, dtype, sample_rate, num_channels, num_frames, *, comments=None): + path = self.get_temp_path(f"test.{ext}") + bit_depth = sox_utils.get_bit_depth(dtype) + duration = num_frames / sample_rate + comment_file = self._gen_comment_file(comments) if comments else None + + sox_utils.gen_audio_file( + path, + sample_rate, + num_channels=num_channels, + encoding=sox_utils.get_encoding(dtype), + bit_depth=bit_depth, + duration=duration, + comment_file=comment_file, + ) + return path + + def _gen_comment_file(self, comments): + comment_path = self.get_temp_path("comment.txt") + with open(comment_path, "w") as file_: + file_.writelines(comments) + return comment_path + +class Unseekable: + def __init__(self, fileobj): + self.fileobj = fileobj + + def read(self, n): + return self.fileobj.read(n) + +class TestFileObject(FileObjTestBase, unittest.TestCase): + def _query_fileobj(self, ext, dtype, sample_rate, num_channels, num_frames, *, comments=None): + path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames, comments=comments) + format_ = ext if ext in ["mp3"] else None + with open(path, "rb") as fileobj: + return sox_io_backend.info(fileobj, format_) + + def _query_bytesio(self, ext, dtype, sample_rate, 
num_channels, num_frames): + path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames) + format_ = ext if ext in ["mp3"] else None + with open(path, "rb") as file_: + fileobj = io.BytesIO(file_.read()) + return sox_io_backend.info(fileobj, format_) + + def _query_tarfile(self, ext, dtype, sample_rate, num_channels, num_frames): + audio_path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames) + audio_file = os.path.basename(audio_path) + archive_path = self.get_temp_path("archive.tar.gz") + with tarfile.TarFile(archive_path, "w") as tarobj: + tarobj.add(audio_path, arcname=audio_file) + format_ = ext if ext in ["mp3"] else None + with tarfile.TarFile(archive_path, "r") as tarobj: + fileobj = tarobj.extractfile(audio_file) + return sox_io_backend.info(fileobj, format_) + + @contextmanager + def _set_buffer_size(self, buffer_size): + try: + original_buffer_size = get_buffer_size() + set_buffer_size(buffer_size) + yield + finally: + set_buffer_size(original_buffer_size) + + @parameterized.expand( + [ + ("wav", "float32"), + ("wav", "int32"), + ("wav", "int16"), + ("wav", "uint8"), + ] + ) + def test_fileobj(self, ext, dtype): + """Querying audio via file object works""" + sample_rate = 16000 + num_frames = 3 * sample_rate + num_channels = 2 + sinfo = self._query_fileobj(ext, dtype, sample_rate, num_channels, num_frames) + + bits_per_sample = get_bits_per_sample(ext, dtype) + num_frames = 0 if ext in ["mp3", "vorbis"] else num_frames + + assert sinfo.sample_rate == sample_rate + assert sinfo.num_channels == num_channels + assert sinfo.num_frames == num_frames + assert sinfo.bits_per_sample == bits_per_sample + assert sinfo.encoding == get_encoding(ext, dtype) + + @parameterized.expand( + [ + ("wav", "float32"), + ("wav", "int32"), + ("wav", "int16"), + ("wav", "uint8"), + ] + ) + def test_bytesio(self, ext, dtype): + """Querying audio via ByteIO object works for small data""" + sample_rate = 16000 + num_frames = 3 * sample_rate + num_channels = 2 + sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels, num_frames) + + bits_per_sample = get_bits_per_sample(ext, dtype) + num_frames = 0 if ext in ["mp3", "vorbis"] else num_frames + + assert sinfo.sample_rate == sample_rate + assert sinfo.num_channels == num_channels + assert sinfo.num_frames == num_frames + assert sinfo.bits_per_sample == bits_per_sample + assert sinfo.encoding == get_encoding(ext, dtype) + + @parameterized.expand( + [ + ("wav", "float32"), + ("wav", "int32"), + ("wav", "int16"), + ("wav", "uint8"), + ] + ) + def test_bytesio_tiny(self, ext, dtype): + """Querying audio via ByteIO object works for small data""" + sample_rate = 8000 + num_frames = 4 + num_channels = 2 + sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels, num_frames) + + bits_per_sample = get_bits_per_sample(ext, dtype) + num_frames = 0 if ext in ["mp3", "vorbis"] else num_frames + + assert sinfo.sample_rate == sample_rate + assert sinfo.num_channels == num_channels + assert sinfo.num_frames == num_frames + assert sinfo.bits_per_sample == bits_per_sample + assert sinfo.encoding == get_encoding(ext, dtype) + + @parameterized.expand( + [ + ("wav", "float32"), + ("wav", "int32"), + ("wav", "int16"), + ("wav", "uint8"), + ("flac", "float32"), + ("vorbis", "float32"), + ("amb", "int16"), + ] + ) + def test_tarfile(self, ext, dtype): + """Querying compressed audio via file-like object works""" + sample_rate = 16000 + num_frames = 3.0 * sample_rate + num_channels = 2 + sinfo = self._query_tarfile(ext, dtype, 
sample_rate, num_channels, num_frames) + + bits_per_sample = get_bits_per_sample(ext, dtype) + num_frames = 0 if ext in ["vorbis"] else num_frames + + assert sinfo.sample_rate == sample_rate + assert sinfo.num_channels == num_channels + assert sinfo.num_frames == num_frames + assert sinfo.bits_per_sample == bits_per_sample + assert sinfo.encoding == get_encoding(ext, dtype) + + + if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/sox_io/testdata b/tests/unit/audio/backends/sox_io/testdata deleted file mode 120000 index 485a3dd63..000000000 --- a/tests/unit/audio/backends/sox_io/testdata +++ /dev/null @@ -1 +0,0 @@ -../../features/testdata \ No newline at end of file
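
A minimal usage sketch of the interfaces exercised by the tests in this series (sox_io_backend.save/info/load and sox_effects.apply_effects_tensor/apply_effects_file). It is illustrative only and not part of any patch above; the file name "example.wav" and the effect arguments are arbitrary, and it assumes paddlespeech was built with the sox extension enabled.

    import paddle
    from paddlespeech.audio import sox_effects
    from paddlespeech.audio.backends import sox_io_backend

    # Generate a 1-second, 2-channel, channels-first float32 signal in [-1, 1].
    sample_rate = 16000
    waveform = 2 * paddle.rand([2, sample_rate]) - 1

    # Save it as wav, then query its metadata and load it back.
    sox_io_backend.save("example.wav", waveform, sample_rate)
    meta = sox_io_backend.info("example.wav")
    print(meta.sample_rate, meta.num_channels, meta.num_frames,
          meta.bits_per_sample, meta.encoding)
    loaded, sr = sox_io_backend.load("example.wav", normalize=True, channels_first=True)

    # Apply a band filter and resample to 8 kHz, once on the in-memory tensor
    # and once directly from the file.
    effects = [["band", "300", "10"], ["rate", "8000"]]
    out_mem, out_sr = sox_effects.apply_effects_tensor(loaded, sr, effects, channels_first=True)
    out_file, out_sr2 = sox_effects.apply_effects_file("example.wav", effects, channels_first=True)
    # Both results report 8000 Hz because of the explicit "rate" effect.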