[engine] rename speechx (#2892)

* rename speechx

* fix wfst decode error

* replace reset with make_unique
pull/2923/head
YangZhou 1 year ago committed by GitHub
parent 21183d48b6
commit 8e1b4cd513
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -8,7 +8,7 @@ repos:
entry: yapf
args: [-i, -vv]
types: [python]
exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: a11d9314b22d8f8c7556443875b731ef05965464
@ -35,7 +35,7 @@ repos:
- --ignore=E501,E228,E226,E261,E266,E128,E402,W503
- --builtins=G,request
- --jobs=1
exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
- repo : https://github.com/Lucas-C/pre-commit-hooks
rev: v1.0.1
@ -57,16 +57,16 @@ repos:
entry: bash .pre-commit-hooks/clang-format.hook -i
language: system
files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders|speechx/speechx/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders|runtime/engine/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
- id: cpplint
name: cpplint
description: Static code analysis of C/C++ files
language: python
files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders|speechx/speechx/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
exclude: (?=runtime/engine/kaldi|runtime/engine/common/matrix|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders|runtime/engine/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
- repo: https://github.com/asottile/reorder_python_imports
rev: v2.4.0
hooks:
- id: reorder-python-imports
exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h\.hpp|\.py)$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h\.hpp|\.py)$

@ -164,7 +164,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 👑 2022.11.18: Add [Whisper CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), support multi language recognition and translation.
- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), Support ASR and Feature Extraction.
- 🎉 2022.11.17: Add [male voice for TTS](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660).
- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/speechx/examples/u2pp_ol/wenetspeech).
- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/runtime/examples/u2pp_ol/wenetspeech).
- 👑 2022.11.01: Add [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) for [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
- 🔥 2022.10.26: Add [Prosody Prediction](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/rhy) for TTS.
- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.

@ -1,2 +1,3 @@
tools/valgrind*
*log
fc_patch/*

@ -93,7 +93,7 @@ endif()
# paddle libpaddle.so
# paddle include and link option
# -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so
# -L/workspace/DeepSpeech-2.x/engine/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/engine/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so
execute_process(
COMMAND python -c "\
import os;\
@ -112,7 +112,7 @@ message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS})
string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS)
# paddle compile option
# -I/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/include
# -I/workspace/DeepSpeech-2.x/engine/venv/lib/python3.7/site-packages/paddle/include
execute_process(
COMMAND python -c "\
import paddle; \
@ -143,6 +143,6 @@ message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS})
###############################################################################
# Add local library
###############################################################################
set(SPEECHX_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/speechx)
set(ENGINE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/engine)
add_subdirectory(speechx)
add_subdirectory(engine)

@ -1,4 +1,3 @@
# SpeechX -- All in One Speech Task Inference
## Environment
@ -9,7 +8,7 @@ We develop under:
* gcc/g++/gfortran - 8.2.0
* cmake - 3.16.0
> Please use `tools/env.sh` to create python `venv`, then `source venv/bin/activate` to build speechx.
> Please use `tools/env.sh` to create python `venv`, then `source venv/bin/activate` to build engine.
> We make sure everything works well under Docker, and recommend using it to develop and deploy.
@ -33,7 +32,7 @@ docker run --privileged --net=host --ipc=host -it --rm -v /path/to/paddlespeech
bash tools/venv.sh
```
2. Build `speechx` and `examples`.
2. Build `engine` and `examples`.
For now we are using feature under `develop` branch of paddle, so we need to install `paddlepaddle` nightly build version.
For example:

@ -2,10 +2,10 @@ include(FetchContent)
FetchContent_Declare(
gflags
URL https://github.com/gflags/gflags/archive/v2.2.2.zip
URL https://paddleaudio.bj.bcebos.com/build/gflag-2.2.2.zip
URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5
)
FetchContent_MakeAvailable(gflags)
# openfst need
include_directories(${gflags_BINARY_DIR}/include)
include_directories(${gflags_BINARY_DIR}/include)

@ -1,7 +1,7 @@
include(FetchContent)
FetchContent_Declare(
glog
URL https://github.com/google/glog/archive/v0.4.0.zip
URL https://paddleaudio.bj.bcebos.com/build/glog-0.4.0.zip
URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc
)
FetchContent_MakeAvailable(glog)

@ -2,7 +2,7 @@
include(FetchContent)
FetchContent_Declare(
gtest
URL https://github.com/google/googletest/archive/release-1.11.0.zip
URL https://paddleaudio.bj.bcebos.com/build/gtest-release-1.11.0.zip
URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a
)
FetchContent_MakeAvailable(gtest)
@ -12,4 +12,4 @@ include_directories(${gtest_BINARY_DIR} ${gtest_SOURCE_DIR}/src)
if(WITH_TESTING)
enable_testing()
endif()
endif()

@ -1,8 +1,8 @@
include(FetchContent)
set(openfst_PREFIX_DIR ${fc_patch}/openfst)
set(openfst_SOURCE_DIR ${fc_patch}/openfst-src)
set(openfst_BINARY_DIR ${fc_patch}/openfst-build)
include(FetchContent)
# openfst Acknowledgments:
#Cyril Allauzen, Michael Riley, Johan Schalkwyk, Wojciech Skut and Mehryar Mohri,
#"OpenFst: A General and Efficient Weighted Finite-State Transducer Library",
@ -25,5 +25,7 @@ ExternalProject_Add(openfst
)
link_directories(${openfst_PREFIX_DIR}/lib)
include_directories(${openfst_PREFIX_DIR}/include)
message(STATUS "OpenFST inc dir: ${openfst_PREFIX_DIR}/include")
message(STATUS "OpenFST lib dir: ${openfst_PREFIX_DIR}/lib")
message(STATUS "OpenFST lib dir: ${openfst_PREFIX_DIR}/lib")

@ -63,9 +63,7 @@ void CTCPrefixBeamSearch::Reset() {
times_.emplace_back(empty);
}
void CTCPrefixBeamSearch::InitDecoder() {
Reset();
}
void CTCPrefixBeamSearch::InitDecoder() { Reset(); }
void CTCPrefixBeamSearch::AdvanceDecode(
const std::shared_ptr<kaldi::DecodableInterface>& decodable) {

@ -29,6 +29,11 @@ TLGDecoder::TLGDecoder(TLGDecoderOptions opts) : opts_(opts) {
// Resets this decoder: re-initializes the wrapped decoder_ and empties every
// cached result container along with the decoded-frame counter.
void TLGDecoder::Reset() {
// Re-run InitDecoding() on the wrapped decoder.
decoder_->InitDecoding();
// Discard results stored by earlier decoding; these are the containers
// exposed through the accessors (hypotheses, likelihoods, output labels,
// times).
hypotheses_.clear();
likelihood_.clear();
olabels_.clear();
times_.clear();
// Start counting decoded frames from zero again.
num_frame_decoded_ = 0;
return;
}
@ -103,7 +108,7 @@ void TLGDecoder::FinalizeSearch() {
time.push_back(idx); // fake time, todo later
hypotheses_.push_back(hypothese);
times_.push_back(time);
olabels.push_back(words_id);
olabels_.push_back(words_id);
likelihood_.push_back(-(weight.Value2() + weight.Value1()));
}
}

@ -24,6 +24,7 @@ DECLARE_string(graph_path);
DECLARE_int32(max_active);
DECLARE_double(beam);
DECLARE_double(lattice_beam);
DECLARE_int32(nbest);
namespace ppspeech {
@ -46,7 +47,7 @@ struct TLGDecoderOptions {
decoder_opts.opts.max_active = FLAGS_max_active;
decoder_opts.opts.beam = FLAGS_beam;
decoder_opts.opts.lattice_beam = FLAGS_lattice_beam;
// decoder_opts.nbest = FLAGS_lattice_nbest;
decoder_opts.nbest = FLAGS_nbest;
LOG(INFO) << "LatticeFasterDecoder max active: "
<< decoder_opts.opts.max_active;
LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam;
@ -85,7 +86,7 @@ class TLGDecoder : public DecoderBase {
return hypotheses_;
}
const std::vector<std::vector<int>>& Outputs() const override {
return olabels;
return olabels_;
} // outputs_; }
const std::vector<float>& Likelihood() const override {
return likelihood_;
@ -111,8 +112,9 @@ class TLGDecoder : public DecoderBase {
private:
void AdvanceDecoding(kaldi::DecodableInterface* decodable);
int num_frame_decoded_;
std::vector<std::vector<int>> hypotheses_;
std::vector<std::vector<int>> olabels;
std::vector<std::vector<int>> olabels_;
std::vector<float> likelihood_;
std::vector<std::vector<int>> times_;
@ -123,4 +125,4 @@ class TLGDecoder : public DecoderBase {
};
} // namespace ppspeech
} // namespace ppspeech

@ -15,7 +15,6 @@
#pragma once
#include "base/common.h"
//#include "decoder/ctc_tlg_decoder.h"
// feature
// Feature-type switch: true selects fbank, false (the default) selects the
// linear feature. The previous help text stated the opposite polarity.
DEFINE_bool(use_fbank, false, "True for fbank; False for linear feature");

@ -13,13 +13,13 @@
// limitations under the License.
#include "nnet/u2_nnet.h"
#include "base/common.h"
#include "decoder/param.h"
#include "frontend/assembler.h"
#include "frontend/data_cache.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/u2_nnet.h"
DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
@ -93,9 +93,9 @@ int main(int argc, char* argv[]) {
ori_feature_len - chunk_idx * chunk_stride, chunk_size);
}
if (this_chunk_size < receptive_field_length) {
LOG(WARNING)
<< "utt: " << utt << " skip last " << this_chunk_size
<< " frames, expect is " << receptive_field_length;
LOG(WARNING) << "utt: " << utt << " skip last "
<< this_chunk_size << " frames, expect is "
<< receptive_field_length;
break;
}

@ -13,13 +13,13 @@
// limitations under the License.
#include "nnet/u2_nnet.h"
#include "base/common.h"
#include "decoder/param.h"
#include "frontend/wave-reader.h"
#include "frontend/feature_pipeline.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/u2_nnet.h"
#include "nnet/nnet_producer.h"
DEFINE_string(wav_rspecifier, "", "test wav rspecifier");
@ -104,7 +104,7 @@ int main(int argc, char* argv[]) {
CHECK(sample_offset == tot_samples);
std::vector<std::vector<kaldi::BaseFloat>> prob_vec;
while(1) {
while (1) {
std::vector<kaldi::BaseFloat> logprobs;
bool isok = nnet_producer->Read(&logprobs);
if (nnet_producer->IsFinished()) break;

@ -33,12 +33,12 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource)
decodable_.reset(new Decodable(nnet_producer_, am_scale));
CHECK_NE(resource.vocab_path, "");
if (resource.decoder_opts.tlg_decoder_opts.fst_path == "") {
if (resource.decoder_opts.tlg_decoder_opts.fst_path.empty()) {
LOG(INFO) << resource.decoder_opts.tlg_decoder_opts.fst_path;
decoder_.reset(new CTCPrefixBeamSearch(
resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts));
decoder_ = std::make_unique<CTCPrefixBeamSearch>(
resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts);
} else {
decoder_.reset(new TLGDecoder(resource.decoder_opts.tlg_decoder_opts));
decoder_ = std::make_unique<TLGDecoder>(resource.decoder_opts.tlg_decoder_opts);
}
symbol_table_ = decoder_->WordSymbolTable();
@ -268,4 +268,4 @@ void U2Recognizer::SetInputFinished() {
}
} // namespace ppspeech
} // namespace ppspeech

@ -31,11 +31,9 @@ DECLARE_double(rescoring_weight);
DECLARE_double(reverse_weight);
DECLARE_int32(nbest);
DECLARE_int32(blank);
DECLARE_double(acoustic_scale);
DECLARE_string(vocab_path);
DECLARE_string(word_symbol_table);
// DECLARE_string(fst_path);
namespace ppspeech {
@ -74,10 +72,6 @@ struct DecodeOptions {
decoder_opts.ctc_prefix_search_opts.blank = FLAGS_blank;
decoder_opts.ctc_prefix_search_opts.first_beam_size = FLAGS_nbest;
decoder_opts.ctc_prefix_search_opts.second_beam_size = FLAGS_nbest;
// decoder_opts.tlg_decoder_opts.fst_path = "";//FLAGS_fst_path;
// decoder_opts.tlg_decoder_opts.word_symbol_table =
// FLAGS_word_symbol_table;
// decoder_opts.tlg_decoder_opts.nbest = FLAGS_nbest;
decoder_opts.tlg_decoder_opts =
ppspeech::TLGDecoderOptions::InitFromFlags();
@ -183,4 +177,4 @@ class U2Recognizer {
std::thread thread_;
};
} // namespace ppspeech
} // namespace ppspeech

@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "recognizer/u2_recognizer.h"
#include "decoder/param.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/table-types.h"
#include "recognizer/u2_recognizer.h"
DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier");

@ -100,7 +100,7 @@ int main(int argc, char* argv[]) {
continue;
}
tot_decode_time += local_timer.Elapsed();
tot_decode_time += local_timer.Elapsed();
LOG(INFO) << utt << " " << result;
LOG(INFO) << " RTF: " << local_timer.Elapsed() / dur << " dur: " << dur
<< " cost: " << local_timer.Elapsed();

@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "websocket/websocket_client.h"
#include "kaldi/feat/wave-reader.h"
#include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h"
#include "websocket/websocket_client.h"
DEFINE_string(host, "127.0.0.1", "host of websocket server");
DEFINE_int32(port, 8082, "port of websocket server");

@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "decoder/param.h"
#include "websocket/websocket_server.h"
#include "decoder/param.h"
DEFINE_int32(port, 8082, "websocket listening port");

@ -28,7 +28,7 @@ typedef int int32; // NOLINT
#if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
typedef long int64; // NOLINT
#else
typedef long long int64; // NOLINT
typedef long long int64; // NOLINT
#endif
typedef unsigned char uint8; // NOLINT

@ -21,6 +21,8 @@
#include <cstring>
#include <deque>
#include <fstream>
#include <functional>
#include <future>
#include <iomanip>
#include <iostream>
#include <istream>
@ -42,8 +44,6 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include <future>
#include <functional>
#include "base/basic_types.h"
#include "base/flags.h"

@ -97,8 +97,8 @@ bool Assembler::Compute(vector<BaseFloat>* feats) {
CHECK(val.size() == dim_) << val.size();
int32 start = counter * dim_;
std::memcpy(feats->data() + start,
val.data(), val.size() * sizeof(BaseFloat));
std::memcpy(
feats->data() + start, val.data(), val.size() * sizeof(BaseFloat));
if (this_chunk_size - counter <= cache_size_) {
feature_cache_.push(val);

@ -84,11 +84,12 @@ void CMVN::Compute(vector<BaseFloat>* feats) const {
KALDI_ASSERT(feats != NULL);
if (feats->size() % dim_ != 0) {
LOG(ERROR)<< "Dim mismatch: cmvn " << mean_stats_.size() << ','
<< var_stats_.size() - 1 << ", feats " << feats->size() << 'x';
LOG(ERROR) << "Dim mismatch: cmvn " << mean_stats_.size() << ','
<< var_stats_.size() - 1 << ", feats " << feats->size()
<< 'x';
}
if (var_stats_.size() == 0 && var_norm_) {
LOG(ERROR)
LOG(ERROR)
<< "You requested variance normalization but no variance stats_ "
<< "are supplied.";
}
@ -98,8 +99,8 @@ void CMVN::Compute(vector<BaseFloat>* feats) const {
// computing an offset and representing it as stats_, we use a count of one.
if (count < 1.0)
LOG(ERROR) << "Insufficient stats_ for cepstral mean and variance "
"normalization: "
<< "count = " << count;
"normalization: "
<< "count = " << count;
if (!var_norm_) {
vector<BaseFloat> offset(feats->size());
@ -112,11 +113,12 @@ void CMVN::Compute(vector<BaseFloat>* feats) const {
// with the dim_ of feature.
// the dim_ of feats = dim_ * num_frames;
for (int32 idx = 0; idx < feats->size() / dim_; ++idx) {
std::memcpy(mean_stats_apply.data() + dim_ * idx,
mean_stats.data(), dim_* sizeof(double));
std::memcpy(mean_stats_apply.data() + dim_ * idx,
mean_stats.data(),
dim_ * sizeof(double));
}
for (size_t idx = 0; idx < feats->size(); ++idx) {
feats->at(idx) += offset[idx];
feats->at(idx) += offset[idx];
}
return;
}
@ -130,7 +132,7 @@ void CMVN::Compute(vector<BaseFloat>* feats) const {
double var = (var_stats_[d] / count) - mean * mean, floor = 1.0e-20;
if (var < floor) {
LOG(WARNING) << "Flooring cepstral variance from " << var << " to "
<< floor;
<< floor;
var = floor;
}
scale = 1.0 / sqrt(var);
@ -146,7 +148,7 @@ void CMVN::Compute(vector<BaseFloat>* feats) const {
}
// Apply the normalization.
for (size_t idx = 0; idx < feats->size(); ++idx) {
feats->at(idx) *= norm1[idx];
feats->at(idx) *= norm1[idx];
}
for (size_t idx = 0; idx < feats->size(); ++idx) {

@ -15,8 +15,8 @@
#pragma once
#include "base/common.h"
#include "frontend/feature_common.h"
#include "frontend/feature-fbank.h"
#include "frontend/feature_common.h"
namespace ppspeech {

@ -67,7 +67,7 @@ bool FeatureCache::Compute() {
for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
int32 start = chunk_idx * dim_;
vector<BaseFloat> feature_chunk(feature.data() + start,
vector<BaseFloat> feature_chunk(feature.data() + start,
feature.data() + start + dim_);
// feed cache
cache_.push(feature_chunk);

@ -57,7 +57,7 @@ class FeatureCache : public FrontendInterface {
bool Compute();
int32 dim_;
size_t max_size_; // cache capacity
size_t max_size_; // cache capacity
std::unique_ptr<FrontendInterface> base_extractor_;
std::queue<std::vector<BaseFloat>> cache_; // feature cache

@ -14,8 +14,8 @@
#pragma once
#include "frontend_itf.h"
#include "frontend/feature-window.h"
#include "frontend_itf.h"
namespace ppspeech {

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save