From 6e0044be582ee12846a74d70e1ba024b11b561a3 Mon Sep 17 00:00:00 2001
From: YangZhou <56786796+SmileGoat@users.noreply.github.com>
Date: Mon, 24 Apr 2023 19:23:21 +0800
Subject: [PATCH 1/2] [engine] merge develop into speechx (#3198)
*merge develop into speechx
---
.github/CODE_OF_CONDUCT.md | 77 +
.github/CONTRIBUTING.md | 30 +
.github/ISSUE_TEMPLATE/bug-report-tts.md | 1 -
.github/stale.yml | 5 +-
.gitignore | 1 +
.pre-commit-hooks/copyright-check.hook | 4 +-
README.md | 88 +-
README_cn.md | 91 +-
audio/CMakeLists.txt | 10 +-
audio/README.md | 21 +-
audio/paddleaudio/CMakeLists.txt | 16 -
audio/paddleaudio/_internal/module_utils.py | 13 +-
.../paddleaudio/backends/soundfile_backend.py | 2 +-
audio/paddleaudio/kaldi/__init__.py | 2 +-
audio/paddleaudio/kaldi/kaldi.py | 95 +-
audio/paddleaudio/src/CMakeLists.txt | 18 +-
.../src/pybind/kaldi/feature_common.h | 16 +-
.../src/pybind/kaldi/feature_common_inl.h | 89 +-
.../src/pybind/kaldi/kaldi_feature.cc | 40 +-
.../src/pybind/kaldi/kaldi_feature.h | 16 +-
.../src/pybind/kaldi/kaldi_feature_wrapper.cc | 21 +-
.../src/pybind/kaldi/kaldi_feature_wrapper.h | 8 +-
audio/paddleaudio/src/pybind/pybind.cpp | 90 +-
audio/paddleaudio/third_party/CMakeLists.txt | 5 +-
.../kaldi-native-fbank/csrc/CMakeLists.txt | 22 +
.../kaldi-native-fbank/csrc/feature-fbank.cc | 117 +
.../kaldi-native-fbank/csrc/feature-fbank.h | 132 +
.../csrc/feature-functions.cc | 49 +
.../csrc/feature-functions.h | 38 +
.../kaldi-native-fbank/csrc/feature-window.cc | 236 ++
.../kaldi-native-fbank/csrc/feature-window.h | 178 +
.../kaldi-native-fbank/csrc/fftsg.c | 3271 +++++++++++++++++
.../kaldi-native-fbank/csrc/log.cc | 143 +
.../third_party/kaldi-native-fbank/csrc/log.h | 347 ++
.../csrc/mel-computations.cc | 256 ++
.../csrc/mel-computations.h | 115 +
.../kaldi-native-fbank/csrc/rfft.cc | 66 +
.../kaldi-native-fbank/csrc/rfft.h | 56 +
.../third_party/kaldi/CMakeLists.txt | 111 -
audio/setup.py | 33 +-
dataset/aidatatang_200zh/aidatatang_200zh.py | 136 +-
dataset/aishell/README.md | 3 -
dataset/aishell/aishell.py | 140 +-
dataset/librispeech/librispeech.py | 4 +-
dataset/mini_librispeech/mini_librispeech.py | 4 +-
dataset/musan/musan.py | 4 +-
dataset/rir_noise/rir_noise.py | 4 +-
dataset/tal_cs/README.md | 13 +
dataset/tal_cs/tal_cs.py | 116 +
dataset/thchs30/thchs30.py | 4 +-
dataset/timit/timit.py | 2 +-
dataset/voxceleb/voxceleb1.py | 6 +-
dataset/voxceleb/voxceleb2.py | 6 +-
dataset/voxforge/voxforge.py | 6 +-
demos/TTSAndroid/README.md | 9 +-
demos/TTSArmLinux/.gitignore | 8 +
demos/TTSArmLinux/README.md | 91 +
demos/TTSArmLinux/build-depends.sh | 1 +
demos/TTSArmLinux/build.sh | 29 +
demos/TTSArmLinux/clean.sh | 23 +
demos/TTSArmLinux/config.sh | 15 +
demos/TTSArmLinux/download.sh | 70 +
demos/TTSArmLinux/front.conf | 21 +
demos/TTSArmLinux/run.sh | 19 +
demos/TTSArmLinux/src/CMakeLists.txt | 80 +
demos/TTSArmLinux/src/Predictor.hpp | 320 ++
demos/TTSArmLinux/src/TTSCppFrontend | 1 +
demos/TTSArmLinux/src/main.cc | 162 +
demos/TTSArmLinux/src/third-party | 1 +
demos/TTSCppFrontend/.gitignore | 2 +
demos/TTSCppFrontend/CMakeLists.txt | 63 +
demos/TTSCppFrontend/README.md | 56 +
demos/TTSCppFrontend/build-depends.sh | 20 +
demos/TTSCppFrontend/build.sh | 21 +
demos/TTSCppFrontend/clean.sh | 10 +
demos/TTSCppFrontend/download.sh | 62 +
demos/TTSCppFrontend/front_demo/front.conf | 21 +
.../TTSCppFrontend/front_demo/front_demo.cpp | 79 +
.../gentools/gen_dict_paddlespeech.py | 111 +
.../front_demo/gentools/genid.py | 35 +
.../front_demo/gentools/word2phones.py | 55 +
demos/TTSCppFrontend/run_front_demo.sh | 7 +
demos/TTSCppFrontend/src/base/type_conv.cpp | 28 +
demos/TTSCppFrontend/src/base/type_conv.h | 31 +
.../src/front/front_interface.cpp | 1130 ++++++
.../src/front/front_interface.h | 198 +
.../src/front/text_normalize.cpp | 542 +++
.../TTSCppFrontend/src/front/text_normalize.h | 77 +
.../TTSCppFrontend/third-party/CMakeLists.txt | 64 +
.../audio_searching/src/test_audio_search.py | 4 +-
demos/audio_searching/src/test_vpr_search.py | 4 +-
demos/speech_recognition/README.md | 28 +-
demos/speech_recognition/README_cn.md | 29 +-
demos/speech_recognition/run.sh | 6 +
demos/speech_web/README.md | 2 +-
demos/speech_web/speech_server/main.py | 2 +-
.../speech_web/speech_server/requirements.txt | 4 +-
demos/streaming_asr_server/README.md | 357 +-
demos/streaming_asr_server/README_cn.md | 353 +-
.../local/websocket_client_srt.py | 162 +
demos/text_to_speech/README.md | 29 +-
demos/text_to_speech/README_cn.md | 29 +-
docker/ubuntu18-cpu/Dockerfile | 2 +-
docs/images/note_map.png | Bin 0 -> 301021 bytes
docs/requirements.txt | 23 +-
docs/source/released_model.md | 41 +-
docs/source/tts/quick_start.md | 4 +-
docs/source/tts/quick_start_cn.md | 4 +-
docs/source/tts/svs_music_score.md | 183 +
docs/tutorial/st/st_tutorial.ipynb | 2 +-
docs/tutorial/tts/tts_tutorial.ipynb | 2 +-
examples/aishell/asr0/local/train.sh | 2 +-
.../asr1/conf/chunk_squeezeformer.yaml | 98 +
examples/aishell/asr1/conf/squeezeformer.yaml | 93 +
examples/aishell/asr1/local/test.sh | 15 +-
examples/aishell/asr1/local/train.sh | 2 +-
examples/aishell/asr3/README.md | 198 +
examples/aishell/asr3/RESULT.md | 18 +
examples/aishell/asr3/cmd.sh | 89 +
examples/aishell/asr3/conf/preprocess.yaml | 3 +
.../aishell/asr3/conf/train_with_wav2vec.yaml | 101 +
examples/aishell/asr3/conf/tuning/decode.yaml | 4 +
examples/aishell/asr3/conf/wav2vec2ASR.yaml | 167 +
.../asr3/conf/wav2vec2ASR_adadelta.yaml | 168 +
.../aishell/asr3/local/aishell_prepare.py | 129 +
examples/aishell/asr3/local/data.sh | 101 +
examples/aishell/asr3/local/test.sh | 82 +
examples/aishell/asr3/local/test_wav.sh | 58 +
examples/aishell/asr3/local/train.sh | 59 +
examples/aishell/asr3/path.sh | 15 +
examples/aishell/asr3/run.sh | 48 +
examples/aishell/asr3/utils | 1 +
examples/aishell3/tts3/path.sh | 14 +-
examples/aishell3/tts3/run.sh | 5 +-
examples/aishell3/vc0/path.sh | 14 +-
examples/aishell3/vc1/local/train.sh | 14 +-
examples/aishell3/vc1/path.sh | 14 +-
examples/aishell3/vc2/local/synthesize.sh | 21 +-
examples/aishell3/vc2/local/train.sh | 14 +-
examples/aishell3/vc2/path.sh | 14 +-
.../aishell3/vits/local/synthesize_e2e.sh | 1 +
examples/aishell3/voc1/local/preprocess.sh | 10 +-
examples/aishell3/voc1/local/synthesize.sh | 15 +-
examples/aishell3/voc1/local/train.sh | 14 +-
examples/aishell3/voc1/path.sh | 14 +-
examples/aishell3/voc5/local/preprocess.sh | 56 +-
examples/aishell3/voc5/local/synthesize.sh | 15 +-
examples/aishell3/voc5/local/train.sh | 14 +-
examples/aishell3/voc5/path.sh | 14 +-
.../ernie_sat/local/synthesize.sh | 26 +-
.../aishell3_vctk/ernie_sat/local/train.sh | 13 +-
examples/aishell3_vctk/ernie_sat/path.sh | 14 +-
examples/canton/tts3/README.md | 127 +
examples/canton/tts3/conf/default.yaml | 107 +
examples/canton/tts3/local/inference.sh | 63 +
examples/canton/tts3/local/ort_predict.sh | 49 +
examples/canton/tts3/local/paddle2onnx.sh | 1 +
examples/canton/tts3/local/preprocess.sh | 75 +
examples/canton/tts3/local/synthesize.sh | 1 +
examples/canton/tts3/local/synthesize_e2e.sh | 53 +
examples/canton/tts3/local/train.sh | 1 +
examples/canton/tts3/path.sh | 1 +
examples/canton/tts3/run.sh | 62 +
examples/csmsc/jets/README.md | 108 +
examples/csmsc/jets/conf/default.yaml | 224 ++
examples/csmsc/jets/local/inference.sh | 15 +
examples/csmsc/jets/local/preprocess.sh | 77 +
examples/csmsc/jets/local/synthesize.sh | 18 +
examples/csmsc/jets/local/synthesize_e2e.sh | 22 +
examples/csmsc/jets/local/train.sh | 12 +
examples/csmsc/jets/path.sh | 13 +
examples/csmsc/jets/run.sh | 41 +
examples/csmsc/tts2/run.sh | 5 +-
examples/csmsc/tts3/local/PTQ_static.sh | 2 +-
examples/csmsc/tts3/run.sh | 5 +-
examples/csmsc/tts3/run_cnndecoder.sh | 10 +-
examples/csmsc/vits/README.md | 16 +-
examples/csmsc/vits/local/export2lite.sh | 1 +
examples/csmsc/vits/local/inference.sh | 17 +
examples/csmsc/vits/local/lite_predict.sh | 18 +
examples/csmsc/vits/local/paddle2onnx.sh | 1 +
examples/csmsc/vits/local/synthesize_e2e.sh | 4 +-
examples/csmsc/vits/run.sh | 32 +
examples/csmsc/voc1/local/PTQ_static.sh | 2 +-
examples/csmsc/voc1/local/preprocess.sh | 10 +-
examples/csmsc/voc3/finetune.sh | 65 +-
examples/csmsc/voc3/local/preprocess.sh | 56 +-
examples/csmsc/voc3/local/train.sh | 14 +-
examples/csmsc/voc4/local/preprocess.sh | 56 +-
examples/csmsc/voc4/local/train.sh | 14 +-
examples/csmsc/voc5/conf/iSTFT.yaml | 174 +
examples/csmsc/voc5/finetune.sh | 9 +-
examples/csmsc/voc5/iSTFTNet.md | 145 +
examples/csmsc/voc5/local/preprocess.sh | 56 +-
examples/csmsc/voc5/local/train.sh | 14 +-
examples/csmsc/voc6/local/preprocess.sh | 10 +-
examples/csmsc/voc6/local/train.sh | 14 +-
examples/librispeech/asr2/README.md | 2 +-
examples/librispeech/asr3/local/data.sh | 0
examples/librispeech/asr3/local/test.sh | 0
examples/librispeech/asr3/local/test_wav.sh | 0
examples/librispeech/asr3/local/train.sh | 0
examples/librispeech/asr3/run.sh | 2 +-
examples/ljspeech/tts0/local/train.sh | 13 +-
examples/ljspeech/tts0/path.sh | 14 +-
examples/ljspeech/tts3/local/train.sh | 13 +-
examples/ljspeech/tts3/path.sh | 14 +-
examples/ljspeech/tts3/run.sh | 5 +-
examples/ljspeech/voc1/local/preprocess.sh | 10 +-
examples/ljspeech/voc1/local/synthesize.sh | 15 +-
examples/ljspeech/voc1/local/train.sh | 14 +-
examples/ljspeech/voc1/path.sh | 14 +-
examples/ljspeech/voc5/local/preprocess.sh | 56 +-
examples/ljspeech/voc5/local/synthesize.sh | 15 +-
examples/ljspeech/voc5/local/train.sh | 14 +-
examples/ljspeech/voc5/path.sh | 14 +-
examples/opencpop/README.md | 6 +
examples/opencpop/svs1/README.md | 276 ++
examples/opencpop/svs1/README_cn.md | 280 ++
examples/opencpop/svs1/conf/default.yaml | 159 +
.../opencpop/svs1/local/pinyin_to_phone.txt | 418 +++
examples/opencpop/svs1/local/preprocess.sh | 74 +
examples/opencpop/svs1/local/synthesize.sh | 27 +
.../opencpop/svs1/local/synthesize_e2e.sh | 53 +
examples/opencpop/svs1/local/train.sh | 13 +
examples/opencpop/svs1/path.sh | 13 +
examples/opencpop/svs1/run.sh | 37 +
examples/opencpop/voc1/README.md | 139 +
examples/opencpop/voc1/conf/default.yaml | 119 +
examples/opencpop/voc1/local/PTQ_static.sh | 1 +
.../opencpop/voc1/local/dygraph_to_static.sh | 15 +
examples/opencpop/voc1/local/preprocess.sh | 47 +
examples/opencpop/voc1/local/synthesize.sh | 1 +
examples/opencpop/voc1/local/train.sh | 1 +
examples/opencpop/voc1/path.sh | 1 +
examples/opencpop/voc1/run.sh | 42 +
examples/opencpop/voc5/conf/default.yaml | 167 +
examples/opencpop/voc5/conf/finetune.yaml | 168 +
examples/opencpop/voc5/finetune.sh | 74 +
examples/opencpop/voc5/local/PTQ_static.sh | 1 +
.../opencpop/voc5/local/dygraph_to_static.sh | 15 +
examples/opencpop/voc5/local/prepare_env.py | 1 +
examples/opencpop/voc5/local/preprocess.sh | 1 +
examples/opencpop/voc5/local/synthesize.sh | 1 +
examples/opencpop/voc5/local/train.sh | 1 +
examples/opencpop/voc5/path.sh | 1 +
examples/opencpop/voc5/run.sh | 42 +
examples/other/mfa/README.md | 7 +
.../local/generate_canton_lexicon_wavlabs.py | 82 +
examples/other/mfa/local/generate_lexicon.py | 2 +-
examples/other/mfa/run_canton.sh | 34 +
.../other/tn/data/textnorm_test_cases.txt | 4 +-
examples/other/tts_finetune/tts3/run.sh | 1 -
examples/other/tts_finetune/tts3/run_en.sh | 1 -
examples/other/tts_finetune/tts3/run_mix.sh | 1 -
examples/tal_cs/asr1/README.md | 190 +
examples/tal_cs/asr1/RESULTS.md | 12 +
examples/tal_cs/asr1/conf/conformer.yaml | 91 +
examples/tal_cs/asr1/conf/preprocess.yaml | 29 +
.../tal_cs/asr1/conf/tuning/chunk_decode.yaml | 12 +
examples/tal_cs/asr1/conf/tuning/decode.yaml | 12 +
examples/tal_cs/asr1/local/data.sh | 88 +
examples/tal_cs/asr1/local/test.sh | 72 +
examples/tal_cs/asr1/local/test_wav.sh | 58 +
examples/tal_cs/asr1/local/train.sh | 72 +
examples/tal_cs/asr1/path.sh | 15 +
examples/tal_cs/asr1/run.sh | 51 +
examples/tal_cs/asr1/utils | 1 +
examples/tess/cls0/local/train.py | 2 +-
examples/tiny/asr1/README.md | 2 +-
examples/vctk/README.md | 1 +
examples/vctk/ernie_sat/local/train.sh | 13 +-
examples/vctk/ernie_sat/path.sh | 14 +-
examples/vctk/tts3/local/train.sh | 14 +-
examples/vctk/tts3/path.sh | 14 +-
examples/vctk/tts3/run.sh | 5 +-
examples/vctk/vc3/README.md | 10 +
examples/vctk/vc3/conf/default.yaml | 135 +
examples/vctk/vc3/local/preprocess.sh | 37 +
examples/vctk/vc3/local/train.sh | 12 +
examples/vctk/vc3/local/voice_conversion.sh | 10 +
examples/vctk/vc3/path.sh | 13 +
examples/vctk/vc3/run.sh | 36 +
examples/vctk/voc1/local/preprocess.sh | 10 +-
examples/vctk/voc1/local/synthesize.sh | 15 +-
examples/vctk/voc1/local/train.sh | 14 +-
examples/vctk/voc1/path.sh | 14 +-
examples/vctk/voc5/local/preprocess.sh | 56 +-
examples/vctk/voc5/local/synthesize.sh | 15 +-
examples/vctk/voc5/local/train.sh | 14 +-
examples/vctk/voc5/path.sh | 14 +-
examples/zh_en_tts/tts3/local/train.sh | 14 +-
examples/zh_en_tts/tts3/path.sh | 14 +-
examples/zh_en_tts/tts3/run.sh | 5 +-
paddlespeech/__init__.py | 4 +
paddlespeech/cli/asr/infer.py | 28 +-
paddlespeech/cli/base_commands.py | 19 +-
paddlespeech/cli/download.py | 4 +-
paddlespeech/cli/text/infer.py | 6 +-
paddlespeech/cli/tts/infer.py | 54 +-
paddlespeech/cli/whisper/infer.py | 9 +-
paddlespeech/dataset/__init__.py | 0
.../dataset}/aidatatang_200zh/README.md | 0
.../dataset/aidatatang_200zh/__init__.py | 14 +
.../aidatatang_200zh/aidatatang_200zh.py | 158 +
paddlespeech/dataset/aishell/README.md | 58 +
paddlespeech/dataset/aishell/__init__.py | 18 +
paddlespeech/dataset/aishell/aishell.py | 230 ++
.../dataset/download.py | 89 +-
paddlespeech/dataset/s2t/__init__.py | 20 +
paddlespeech/dataset/s2t/avg_model.py | 125 +
paddlespeech/dataset/s2t/build_vocab.py | 166 +
paddlespeech/dataset/s2t/compute_mean_std.py | 106 +
paddlespeech/dataset/s2t/compute_wer.py | 558 +++
paddlespeech/dataset/s2t/format_data.py | 154 +
paddlespeech/dataset/s2t/format_rsl.py | 143 +
paddlespeech/resource/pretrained_models.py | 387 +-
paddlespeech/s2t/__init__.py | 2 +-
.../exps/deepspeech2/bin/deploy/runtime.py | 4 +-
.../s2t/exps/deepspeech2/bin/deploy/server.py | 4 +-
.../s2t/exps/deepspeech2/bin/export.py | 2 +-
paddlespeech/s2t/exps/deepspeech2/bin/test.py | 2 +-
.../s2t/exps/deepspeech2/bin/test_export.py | 2 +-
.../s2t/exps/deepspeech2/bin/test_wav.py | 2 +-
.../s2t/exps/deepspeech2/bin/train.py | 2 +-
paddlespeech/s2t/exps/u2/bin/alignment.py | 2 +-
paddlespeech/s2t/exps/u2/bin/export.py | 2 +-
paddlespeech/s2t/exps/u2/bin/test.py | 2 +-
paddlespeech/s2t/exps/u2/bin/train.py | 2 +-
paddlespeech/s2t/exps/u2_kaldi/bin/test.py | 2 +-
paddlespeech/s2t/exps/u2_kaldi/bin/train.py | 2 +-
paddlespeech/s2t/exps/u2_st/bin/export.py | 2 +-
paddlespeech/s2t/exps/u2_st/bin/test.py | 2 +-
paddlespeech/s2t/exps/u2_st/bin/train.py | 2 +-
paddlespeech/s2t/exps/wav2vec2/bin/test.py | 2 +-
.../s2t/exps/wav2vec2/bin/test_wav.py | 15 +-
paddlespeech/s2t/exps/wav2vec2/bin/train.py | 2 +-
paddlespeech/s2t/exps/wav2vec2/model.py | 408 +-
.../s2t/frontend/augmentor/augmentation.py | 2 +-
.../frontend/featurizer/text_featurizer.py | 17 +-
paddlespeech/s2t/io/dataloader.py | 2 +-
paddlespeech/s2t/io/speechbrain/__init__.py | 13 +
paddlespeech/s2t/io/speechbrain/batch.py | 107 +
.../s2t/io/speechbrain/data_pipeline.py | 488 +++
paddlespeech/s2t/io/speechbrain/data_utils.py | 177 +
paddlespeech/s2t/io/speechbrain/dataio.py | 845 +++++
paddlespeech/s2t/io/speechbrain/dataloader.py | 172 +
paddlespeech/s2t/io/speechbrain/dataset.py | 371 ++
paddlespeech/s2t/io/speechbrain/depgraph.py | 237 ++
.../s2t/io/speechbrain/make_dataloader.py | 118 +
paddlespeech/s2t/io/speechbrain/sampler.py | 503 +++
.../s2t/io/speechbrain/sb_pipeline.py | 156 +
paddlespeech/s2t/models/u2/u2.py | 8 +-
paddlespeech/s2t/models/u2_st/u2_st.py | 2 +-
paddlespeech/s2t/models/wav2vec2/__init__.py | 2 +-
.../wav2vec2/processing/signal_processing.py | 20 +-
.../processing/speech_augmentation.py | 44 +-
.../s2t/models/wav2vec2/wav2vec2_ASR.py | 61 +-
paddlespeech/s2t/models/whisper/__init__.py | 2 +-
paddlespeech/s2t/models/whisper/tokenizer.py | 6 +-
paddlespeech/s2t/models/whisper/utils.py | 2 +-
paddlespeech/s2t/models/whisper/whipser.py | 18 +-
paddlespeech/s2t/modules/attention.py | 45 +-
.../s2t/modules/conformer_convolution.py | 42 +-
paddlespeech/s2t/modules/conv2d.py | 62 +
paddlespeech/s2t/modules/encoder.py | 377 +-
paddlespeech/s2t/modules/encoder_layer.py | 127 +-
.../s2t/modules/positionwise_feed_forward.py | 32 +-
paddlespeech/s2t/modules/subsampling.py | 66 +-
paddlespeech/s2t/modules/time_reduction.py | 263 ++
paddlespeech/s2t/training/gradclip.py | 21 +-
.../{optimizer.py => optimizer/__init__.py} | 1 -
.../s2t/training/optimizer/adadelta.py | 239 ++
paddlespeech/s2t/training/scheduler.py | 1 -
paddlespeech/s2t/utils/utility.py | 57 +-
.../server/bin/paddlespeech_server.py | 30 +-
.../engine/asr/online/python/asr_engine.py | 2 +-
.../engine/tts/paddleinference/tts_engine.py | 6 +-
paddlespeech/server/ws/asr_api.py | 2 +-
paddlespeech/t2s/datasets/am_batch_fn.py | 338 +-
paddlespeech/t2s/datasets/data_table.py | 53 +
paddlespeech/t2s/datasets/get_feats.py | 26 +-
paddlespeech/t2s/datasets/preprocess_utils.py | 91 +
paddlespeech/t2s/exps/PTQ_static.py | 2 +
paddlespeech/t2s/exps/diffsinger/__init__.py | 13 +
.../t2s/exps/diffsinger/gen_gta_mel.py | 240 ++
.../t2s/exps/diffsinger/get_minmax.py | 82 +
paddlespeech/t2s/exps/diffsinger/normalize.py | 189 +
.../t2s/exps/diffsinger/preprocess.py | 377 ++
paddlespeech/t2s/exps/diffsinger/train.py | 257 ++
paddlespeech/t2s/exps/dygraph_to_static.py | 170 +
paddlespeech/t2s/exps/ernie_sat/preprocess.py | 1 +
.../t2s/exps/ernie_sat/synthesize_e2e.py | 2 +-
paddlespeech/t2s/exps/ernie_sat/train.py | 2 +-
.../t2s/exps/fastspeech2/preprocess.py | 29 +-
paddlespeech/t2s/exps/fastspeech2/train.py | 4 +-
.../t2s/exps/gan_vocoder/preprocess.py | 39 +-
paddlespeech/t2s/exps/inference.py | 33 +-
paddlespeech/t2s/exps/jets/__init__.py | 13 +
paddlespeech/t2s/exps/jets/inference.py | 172 +
paddlespeech/t2s/exps/jets/normalize.py | 163 +
paddlespeech/t2s/exps/jets/preprocess.py | 451 +++
paddlespeech/t2s/exps/jets/synthesize.py | 153 +
paddlespeech/t2s/exps/jets/synthesize_e2e.py | 189 +
paddlespeech/t2s/exps/jets/train.py | 305 ++
paddlespeech/t2s/exps/lite_syn_utils.py | 21 +-
paddlespeech/t2s/exps/ort_predict_e2e.py | 10 +-
paddlespeech/t2s/exps/sentences_canton.txt | 21 +
paddlespeech/t2s/exps/sentences_sing.txt | 2 +
.../t2s/exps/speedyspeech/preprocess.py | 1 +
.../t2s/exps/speedyspeech/synthesize_e2e.py | 6 +-
paddlespeech/t2s/exps/speedyspeech/train.py | 6 +-
.../t2s/exps/starganv2_vc/__init__.py | 13 +
.../t2s/exps/starganv2_vc/normalize.py | 101 +
.../t2s/exps/starganv2_vc/preprocess.py | 214 ++
paddlespeech/t2s/exps/starganv2_vc/train.py | 274 ++
paddlespeech/t2s/exps/starganv2_vc/vc.py | 264 ++
paddlespeech/t2s/exps/syn_utils.py | 271 +-
paddlespeech/t2s/exps/synthesize.py | 53 +-
paddlespeech/t2s/exps/synthesize_e2e.py | 68 +-
paddlespeech/t2s/exps/tacotron2/preprocess.py | 1 +
paddlespeech/t2s/exps/tacotron2/train.py | 2 +-
.../t2s/exps/transformer_tts/train.py | 2 +-
paddlespeech/t2s/exps/vits/inference.py | 174 +
paddlespeech/t2s/exps/vits/lite_predict.py | 148 +
paddlespeech/t2s/exps/vits/normalize.py | 2 +-
paddlespeech/t2s/exps/vits/preprocess.py | 3 +-
paddlespeech/t2s/exps/vits/synthesize_e2e.py | 28 +-
paddlespeech/t2s/exps/vits/train.py | 18 +-
paddlespeech/t2s/frontend/canton_frontend.py | 113 +
paddlespeech/t2s/frontend/g2pw/onnx_api.py | 2 +-
paddlespeech/t2s/frontend/generate_lexicon.py | 2 +-
paddlespeech/t2s/frontend/mix_frontend.py | 70 +-
paddlespeech/t2s/frontend/phonectic.py | 2 +-
paddlespeech/t2s/frontend/sing_frontend.py | 175 +
paddlespeech/t2s/frontend/zh_frontend.py | 6 +-
.../t2s/models/diffsinger/__init__.py | 15 +
.../t2s/models/diffsinger/diffsinger.py | 399 ++
.../models/diffsinger/diffsinger_updater.py | 302 ++
.../t2s/models/diffsinger/fastspeech2midi.py | 654 ++++
.../t2s/models/fastspeech2/fastspeech2.py | 15 +-
paddlespeech/t2s/models/hifigan/hifigan.py | 82 +-
paddlespeech/t2s/models/jets/__init__.py | 15 +
paddlespeech/t2s/models/jets/alignments.py | 182 +
paddlespeech/t2s/models/jets/generator.py | 897 +++++
paddlespeech/t2s/models/jets/jets.py | 582 +++
paddlespeech/t2s/models/jets/jets_updater.py | 437 +++
.../t2s/models/jets/length_regulator.py | 67 +
.../starganv2_vc/AuxiliaryASR/__init__.py | 13 +
.../starganv2_vc/AuxiliaryASR/config.yml | 29 +
.../starganv2_vc/AuxiliaryASR/layers.py | 262 ++
.../models/starganv2_vc/AuxiliaryASR/model.py | 247 ++
.../models/starganv2_vc/JDCNet/__init__.py | 13 +
.../t2s/models/starganv2_vc/JDCNet/model.py | 210 ++
.../t2s/models/starganv2_vc/__init__.py | 17 +
.../t2s/models/starganv2_vc/losses.py | 257 ++
.../t2s/models/starganv2_vc/starganv2_vc.py | 633 ++++
.../starganv2_vc/starganv2_vc_updater.py | 308 ++
.../t2s/models/starganv2_vc/transforms.py | 143 +
.../t2s/models/tacotron2/tacotron2_updater.py | 12 +-
.../t2s/models/vits/duration_predictor.py | 14 +-
paddlespeech/t2s/models/vits/flow.py | 9 +-
paddlespeech/t2s/models/vits/generator.py | 25 +-
paddlespeech/t2s/models/vits/text_encoder.py | 13 +-
paddlespeech/t2s/models/vits/transform.py | 84 +-
paddlespeech/t2s/models/vits/vits.py | 66 +-
paddlespeech/t2s/models/waveflow.py | 8 +-
paddlespeech/t2s/modules/activation.py | 3 +-
.../t2s/modules/conformer/encoder_layer.py | 4 -
paddlespeech/t2s/modules/diffnet.py | 244 ++
paddlespeech/t2s/modules/diffusion.py | 322 ++
paddlespeech/t2s/modules/losses.py | 198 +
paddlespeech/t2s/modules/masked_fill.py | 2 -
paddlespeech/t2s/modules/nets_utils.py | 42 +-
.../modules/predictor/variance_predictor.py | 2 +-
.../t2s/modules/tacotron2/attentions.py | 16 +-
paddlespeech/t2s/modules/tacotron2/decoder.py | 6 +-
.../t2s/modules/transformer/attention.py | 12 +-
.../t2s/modules/transformer/embedding.py | 3 +-
.../t2s/modules/transformer/encoder.py | 17 +-
.../t2s/modules/transformer/lightconv.py | 2 +-
.../modules/transformer/multi_layer_conv.py | 4 +-
paddlespeech/t2s/modules/wavenet_denoiser.py | 185 +
paddlespeech/t2s/ssml/xml_processor.py | 34 +
paddlespeech/text/models/ernie_crf/model.py | 2 +-
.../text/models/ernie_linear/ernie_linear.py | 4 +-
paddlespeech/utils/argparse.py | 100 +
paddlespeech/utils/initialize.py | 321 ++
paddlespeech/vector/exps/ecapa_tdnn/train.py | 4 +-
paddlespeech/vector/exps/ge2e/preprocess.py | 2 +-
.../asr/server/websocket/websocket_server.cc | 8 +-
runtime/engine/common/frontend/db_norm.cc | 2 +-
runtime/engine/kaldi/base/kaldi-types.h | 2 +-
runtime/engine/kaldi/lat/lattice-functions.h | 16 +-
runtime/engine/kaldi/util/kaldi-table-inl.h | 2 +-
setup.py | 49 +-
.../conformer/scripts/aishell_tiny.py | 4 +-
tests/test_tipc/prepare.sh | 3 +
tests/unit/cli/aishell_test_prepare.py | 4 +-
tests/unit/cli/test_cli.sh | 10 +-
third_party/ctc_decoders/setup.py | 7 +-
tools/extras/install_mkl.sh | 2 +-
utils/avg_model.py | 102 +-
utils/build_vocab.py | 131 +-
utils/compute-wer.py | 553 +--
utils/compute_mean_std.py | 72 +-
utils/format_data.py | 127 +-
utils/format_rsl.py | 95 +-
utils/format_triplet_data.py | 4 +-
utils/fst/ctc_token_fst.py | 2 +-
utils/manifest_key_value.py | 4 +-
utils/tokenizer.perl | 2 +-
512 files changed, 36596 insertions(+), 3615 deletions(-)
create mode 100644 .github/CODE_OF_CONDUCT.md
create mode 100644 .github/CONTRIBUTING.md
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.cc
create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h
delete mode 100644 audio/paddleaudio/third_party/kaldi/CMakeLists.txt
delete mode 100644 dataset/aishell/README.md
create mode 100644 dataset/tal_cs/README.md
create mode 100644 dataset/tal_cs/tal_cs.py
create mode 100644 demos/TTSArmLinux/.gitignore
create mode 100644 demos/TTSArmLinux/README.md
create mode 120000 demos/TTSArmLinux/build-depends.sh
create mode 100755 demos/TTSArmLinux/build.sh
create mode 100755 demos/TTSArmLinux/clean.sh
create mode 100644 demos/TTSArmLinux/config.sh
create mode 100755 demos/TTSArmLinux/download.sh
create mode 100644 demos/TTSArmLinux/front.conf
create mode 100755 demos/TTSArmLinux/run.sh
create mode 100644 demos/TTSArmLinux/src/CMakeLists.txt
create mode 100644 demos/TTSArmLinux/src/Predictor.hpp
create mode 120000 demos/TTSArmLinux/src/TTSCppFrontend
create mode 100644 demos/TTSArmLinux/src/main.cc
create mode 120000 demos/TTSArmLinux/src/third-party
create mode 100644 demos/TTSCppFrontend/.gitignore
create mode 100644 demos/TTSCppFrontend/CMakeLists.txt
create mode 100644 demos/TTSCppFrontend/README.md
create mode 100755 demos/TTSCppFrontend/build-depends.sh
create mode 100755 demos/TTSCppFrontend/build.sh
create mode 100755 demos/TTSCppFrontend/clean.sh
create mode 100755 demos/TTSCppFrontend/download.sh
create mode 100644 demos/TTSCppFrontend/front_demo/front.conf
create mode 100644 demos/TTSCppFrontend/front_demo/front_demo.cpp
create mode 100644 demos/TTSCppFrontend/front_demo/gentools/gen_dict_paddlespeech.py
create mode 100644 demos/TTSCppFrontend/front_demo/gentools/genid.py
create mode 100644 demos/TTSCppFrontend/front_demo/gentools/word2phones.py
create mode 100755 demos/TTSCppFrontend/run_front_demo.sh
create mode 100644 demos/TTSCppFrontend/src/base/type_conv.cpp
create mode 100644 demos/TTSCppFrontend/src/base/type_conv.h
create mode 100644 demos/TTSCppFrontend/src/front/front_interface.cpp
create mode 100644 demos/TTSCppFrontend/src/front/front_interface.h
create mode 100644 demos/TTSCppFrontend/src/front/text_normalize.cpp
create mode 100644 demos/TTSCppFrontend/src/front/text_normalize.h
create mode 100644 demos/TTSCppFrontend/third-party/CMakeLists.txt
create mode 100644 demos/streaming_asr_server/local/websocket_client_srt.py
create mode 100644 docs/images/note_map.png
create mode 100644 docs/source/tts/svs_music_score.md
create mode 100644 examples/aishell/asr1/conf/chunk_squeezeformer.yaml
create mode 100644 examples/aishell/asr1/conf/squeezeformer.yaml
create mode 100644 examples/aishell/asr3/README.md
create mode 100644 examples/aishell/asr3/RESULT.md
create mode 100755 examples/aishell/asr3/cmd.sh
create mode 100755 examples/aishell/asr3/conf/preprocess.yaml
create mode 100755 examples/aishell/asr3/conf/train_with_wav2vec.yaml
create mode 100755 examples/aishell/asr3/conf/tuning/decode.yaml
create mode 100644 examples/aishell/asr3/conf/wav2vec2ASR.yaml
create mode 100755 examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml
create mode 100644 examples/aishell/asr3/local/aishell_prepare.py
create mode 100755 examples/aishell/asr3/local/data.sh
create mode 100755 examples/aishell/asr3/local/test.sh
create mode 100755 examples/aishell/asr3/local/test_wav.sh
create mode 100755 examples/aishell/asr3/local/train.sh
create mode 100755 examples/aishell/asr3/path.sh
create mode 100755 examples/aishell/asr3/run.sh
create mode 120000 examples/aishell/asr3/utils
mode change 100755 => 120000 examples/aishell3/tts3/path.sh
mode change 100755 => 120000 examples/aishell3/vc0/path.sh
mode change 100755 => 120000 examples/aishell3/vc1/local/train.sh
mode change 100755 => 120000 examples/aishell3/vc1/path.sh
mode change 100755 => 120000 examples/aishell3/vc2/local/synthesize.sh
mode change 100755 => 120000 examples/aishell3/vc2/local/train.sh
mode change 100755 => 120000 examples/aishell3/vc2/path.sh
mode change 100755 => 120000 examples/aishell3/voc1/local/synthesize.sh
mode change 100755 => 120000 examples/aishell3/voc1/local/train.sh
mode change 100755 => 120000 examples/aishell3/voc1/path.sh
mode change 100755 => 120000 examples/aishell3/voc5/local/preprocess.sh
mode change 100755 => 120000 examples/aishell3/voc5/local/synthesize.sh
mode change 100755 => 120000 examples/aishell3/voc5/local/train.sh
mode change 100755 => 120000 examples/aishell3/voc5/path.sh
mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/synthesize.sh
mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/train.sh
mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/path.sh
create mode 100644 examples/canton/tts3/README.md
create mode 100644 examples/canton/tts3/conf/default.yaml
create mode 100755 examples/canton/tts3/local/inference.sh
create mode 100755 examples/canton/tts3/local/ort_predict.sh
create mode 120000 examples/canton/tts3/local/paddle2onnx.sh
create mode 100755 examples/canton/tts3/local/preprocess.sh
create mode 120000 examples/canton/tts3/local/synthesize.sh
create mode 100755 examples/canton/tts3/local/synthesize_e2e.sh
create mode 120000 examples/canton/tts3/local/train.sh
create mode 120000 examples/canton/tts3/path.sh
create mode 100755 examples/canton/tts3/run.sh
create mode 100644 examples/csmsc/jets/README.md
create mode 100644 examples/csmsc/jets/conf/default.yaml
create mode 100755 examples/csmsc/jets/local/inference.sh
create mode 100755 examples/csmsc/jets/local/preprocess.sh
create mode 100755 examples/csmsc/jets/local/synthesize.sh
create mode 100755 examples/csmsc/jets/local/synthesize_e2e.sh
create mode 100755 examples/csmsc/jets/local/train.sh
create mode 100755 examples/csmsc/jets/path.sh
create mode 100755 examples/csmsc/jets/run.sh
create mode 120000 examples/csmsc/vits/local/export2lite.sh
create mode 100755 examples/csmsc/vits/local/inference.sh
create mode 100755 examples/csmsc/vits/local/lite_predict.sh
create mode 120000 examples/csmsc/vits/local/paddle2onnx.sh
mode change 100755 => 120000 examples/csmsc/voc3/finetune.sh
mode change 100755 => 120000 examples/csmsc/voc3/local/preprocess.sh
mode change 100755 => 120000 examples/csmsc/voc3/local/train.sh
mode change 100755 => 120000 examples/csmsc/voc4/local/preprocess.sh
mode change 100755 => 120000 examples/csmsc/voc4/local/train.sh
create mode 100644 examples/csmsc/voc5/conf/iSTFT.yaml
create mode 100644 examples/csmsc/voc5/iSTFTNet.md
mode change 100755 => 120000 examples/csmsc/voc5/local/preprocess.sh
mode change 100755 => 120000 examples/csmsc/voc5/local/train.sh
mode change 100755 => 120000 examples/csmsc/voc6/local/train.sh
mode change 100644 => 100755 examples/librispeech/asr3/local/data.sh
mode change 100644 => 100755 examples/librispeech/asr3/local/test.sh
mode change 100644 => 100755 examples/librispeech/asr3/local/test_wav.sh
mode change 100644 => 100755 examples/librispeech/asr3/local/train.sh
mode change 100755 => 120000 examples/ljspeech/tts0/local/train.sh
mode change 100755 => 120000 examples/ljspeech/tts0/path.sh
mode change 100755 => 120000 examples/ljspeech/tts3/local/train.sh
mode change 100755 => 120000 examples/ljspeech/tts3/path.sh
mode change 100755 => 120000 examples/ljspeech/voc1/local/synthesize.sh
mode change 100755 => 120000 examples/ljspeech/voc1/local/train.sh
mode change 100755 => 120000 examples/ljspeech/voc1/path.sh
mode change 100755 => 120000 examples/ljspeech/voc5/local/preprocess.sh
mode change 100755 => 120000 examples/ljspeech/voc5/local/synthesize.sh
mode change 100755 => 120000 examples/ljspeech/voc5/local/train.sh
mode change 100755 => 120000 examples/ljspeech/voc5/path.sh
create mode 100644 examples/opencpop/README.md
create mode 100644 examples/opencpop/svs1/README.md
create mode 100644 examples/opencpop/svs1/README_cn.md
create mode 100644 examples/opencpop/svs1/conf/default.yaml
create mode 100644 examples/opencpop/svs1/local/pinyin_to_phone.txt
create mode 100755 examples/opencpop/svs1/local/preprocess.sh
create mode 100755 examples/opencpop/svs1/local/synthesize.sh
create mode 100755 examples/opencpop/svs1/local/synthesize_e2e.sh
create mode 100755 examples/opencpop/svs1/local/train.sh
create mode 100755 examples/opencpop/svs1/path.sh
create mode 100755 examples/opencpop/svs1/run.sh
create mode 100644 examples/opencpop/voc1/README.md
create mode 100644 examples/opencpop/voc1/conf/default.yaml
create mode 120000 examples/opencpop/voc1/local/PTQ_static.sh
create mode 100755 examples/opencpop/voc1/local/dygraph_to_static.sh
create mode 100755 examples/opencpop/voc1/local/preprocess.sh
create mode 120000 examples/opencpop/voc1/local/synthesize.sh
create mode 120000 examples/opencpop/voc1/local/train.sh
create mode 120000 examples/opencpop/voc1/path.sh
create mode 100755 examples/opencpop/voc1/run.sh
create mode 100644 examples/opencpop/voc5/conf/default.yaml
create mode 100644 examples/opencpop/voc5/conf/finetune.yaml
create mode 100755 examples/opencpop/voc5/finetune.sh
create mode 120000 examples/opencpop/voc5/local/PTQ_static.sh
create mode 100755 examples/opencpop/voc5/local/dygraph_to_static.sh
create mode 120000 examples/opencpop/voc5/local/prepare_env.py
create mode 120000 examples/opencpop/voc5/local/preprocess.sh
create mode 120000 examples/opencpop/voc5/local/synthesize.sh
create mode 120000 examples/opencpop/voc5/local/train.sh
create mode 120000 examples/opencpop/voc5/path.sh
create mode 100755 examples/opencpop/voc5/run.sh
create mode 100644 examples/other/mfa/local/generate_canton_lexicon_wavlabs.py
create mode 100755 examples/other/mfa/run_canton.sh
create mode 100644 examples/tal_cs/asr1/README.md
create mode 100644 examples/tal_cs/asr1/RESULTS.md
create mode 100644 examples/tal_cs/asr1/conf/conformer.yaml
create mode 100644 examples/tal_cs/asr1/conf/preprocess.yaml
create mode 100644 examples/tal_cs/asr1/conf/tuning/chunk_decode.yaml
create mode 100644 examples/tal_cs/asr1/conf/tuning/decode.yaml
create mode 100644 examples/tal_cs/asr1/local/data.sh
create mode 100755 examples/tal_cs/asr1/local/test.sh
create mode 100755 examples/tal_cs/asr1/local/test_wav.sh
create mode 100755 examples/tal_cs/asr1/local/train.sh
create mode 100755 examples/tal_cs/asr1/path.sh
create mode 100644 examples/tal_cs/asr1/run.sh
create mode 120000 examples/tal_cs/asr1/utils
mode change 100755 => 120000 examples/vctk/ernie_sat/local/train.sh
mode change 100755 => 120000 examples/vctk/ernie_sat/path.sh
mode change 100755 => 120000 examples/vctk/tts3/local/train.sh
mode change 100755 => 120000 examples/vctk/tts3/path.sh
create mode 100644 examples/vctk/vc3/README.md
create mode 100644 examples/vctk/vc3/conf/default.yaml
create mode 100755 examples/vctk/vc3/local/preprocess.sh
create mode 100755 examples/vctk/vc3/local/train.sh
create mode 100755 examples/vctk/vc3/local/voice_conversion.sh
create mode 100755 examples/vctk/vc3/path.sh
create mode 100755 examples/vctk/vc3/run.sh
mode change 100755 => 120000 examples/vctk/voc1/local/synthesize.sh
mode change 100755 => 120000 examples/vctk/voc1/local/train.sh
mode change 100755 => 120000 examples/vctk/voc1/path.sh
mode change 100755 => 120000 examples/vctk/voc5/local/preprocess.sh
mode change 100755 => 120000 examples/vctk/voc5/local/synthesize.sh
mode change 100755 => 120000 examples/vctk/voc5/local/train.sh
mode change 100755 => 120000 examples/vctk/voc5/path.sh
mode change 100755 => 120000 examples/zh_en_tts/tts3/local/train.sh
mode change 100755 => 120000 examples/zh_en_tts/tts3/path.sh
create mode 100644 paddlespeech/dataset/__init__.py
rename {dataset => paddlespeech/dataset}/aidatatang_200zh/README.md (100%)
create mode 100644 paddlespeech/dataset/aidatatang_200zh/__init__.py
create mode 100644 paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py
create mode 100644 paddlespeech/dataset/aishell/README.md
create mode 100644 paddlespeech/dataset/aishell/__init__.py
create mode 100644 paddlespeech/dataset/aishell/aishell.py
rename utils/utility.py => paddlespeech/dataset/download.py (59%)
create mode 100644 paddlespeech/dataset/s2t/__init__.py
create mode 100755 paddlespeech/dataset/s2t/avg_model.py
create mode 100755 paddlespeech/dataset/s2t/build_vocab.py
create mode 100755 paddlespeech/dataset/s2t/compute_mean_std.py
create mode 100755 paddlespeech/dataset/s2t/compute_wer.py
create mode 100755 paddlespeech/dataset/s2t/format_data.py
create mode 100644 paddlespeech/dataset/s2t/format_rsl.py
create mode 100644 paddlespeech/s2t/io/speechbrain/__init__.py
create mode 100755 paddlespeech/s2t/io/speechbrain/batch.py
create mode 100755 paddlespeech/s2t/io/speechbrain/data_pipeline.py
create mode 100755 paddlespeech/s2t/io/speechbrain/data_utils.py
create mode 100755 paddlespeech/s2t/io/speechbrain/dataio.py
create mode 100755 paddlespeech/s2t/io/speechbrain/dataloader.py
create mode 100755 paddlespeech/s2t/io/speechbrain/dataset.py
create mode 100755 paddlespeech/s2t/io/speechbrain/depgraph.py
create mode 100755 paddlespeech/s2t/io/speechbrain/make_dataloader.py
create mode 100755 paddlespeech/s2t/io/speechbrain/sampler.py
create mode 100755 paddlespeech/s2t/io/speechbrain/sb_pipeline.py
mode change 100644 => 100755 paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
create mode 100644 paddlespeech/s2t/modules/conv2d.py
create mode 100644 paddlespeech/s2t/modules/time_reduction.py
rename paddlespeech/s2t/training/{optimizer.py => optimizer/__init__.py} (99%)
create mode 100644 paddlespeech/s2t/training/optimizer/adadelta.py
create mode 100644 paddlespeech/t2s/exps/diffsinger/__init__.py
create mode 100644 paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py
create mode 100644 paddlespeech/t2s/exps/diffsinger/get_minmax.py
create mode 100644 paddlespeech/t2s/exps/diffsinger/normalize.py
create mode 100644 paddlespeech/t2s/exps/diffsinger/preprocess.py
create mode 100644 paddlespeech/t2s/exps/diffsinger/train.py
create mode 100644 paddlespeech/t2s/exps/dygraph_to_static.py
create mode 100644 paddlespeech/t2s/exps/jets/__init__.py
create mode 100644 paddlespeech/t2s/exps/jets/inference.py
create mode 100644 paddlespeech/t2s/exps/jets/normalize.py
create mode 100644 paddlespeech/t2s/exps/jets/preprocess.py
create mode 100644 paddlespeech/t2s/exps/jets/synthesize.py
create mode 100644 paddlespeech/t2s/exps/jets/synthesize_e2e.py
create mode 100644 paddlespeech/t2s/exps/jets/train.py
create mode 100644 paddlespeech/t2s/exps/sentences_canton.txt
create mode 100644 paddlespeech/t2s/exps/sentences_sing.txt
create mode 100644 paddlespeech/t2s/exps/starganv2_vc/__init__.py
create mode 100644 paddlespeech/t2s/exps/starganv2_vc/normalize.py
create mode 100644 paddlespeech/t2s/exps/starganv2_vc/preprocess.py
create mode 100644 paddlespeech/t2s/exps/starganv2_vc/train.py
create mode 100644 paddlespeech/t2s/exps/starganv2_vc/vc.py
create mode 100644 paddlespeech/t2s/exps/vits/inference.py
create mode 100644 paddlespeech/t2s/exps/vits/lite_predict.py
create mode 100644 paddlespeech/t2s/frontend/canton_frontend.py
create mode 100644 paddlespeech/t2s/frontend/sing_frontend.py
create mode 100644 paddlespeech/t2s/models/diffsinger/__init__.py
create mode 100644 paddlespeech/t2s/models/diffsinger/diffsinger.py
create mode 100644 paddlespeech/t2s/models/diffsinger/diffsinger_updater.py
create mode 100644 paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
create mode 100644 paddlespeech/t2s/models/jets/__init__.py
create mode 100644 paddlespeech/t2s/models/jets/alignments.py
create mode 100644 paddlespeech/t2s/models/jets/generator.py
create mode 100644 paddlespeech/t2s/models/jets/jets.py
create mode 100644 paddlespeech/t2s/models/jets/jets_updater.py
create mode 100644 paddlespeech/t2s/models/jets/length_regulator.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/__init__.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/config.yml
create mode 100644 paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/JDCNet/__init__.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/JDCNet/model.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/__init__.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/losses.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/starganv2_vc_updater.py
create mode 100644 paddlespeech/t2s/models/starganv2_vc/transforms.py
create mode 100644 paddlespeech/t2s/modules/diffnet.py
create mode 100644 paddlespeech/t2s/modules/diffusion.py
create mode 100644 paddlespeech/t2s/modules/wavenet_denoiser.py
create mode 100644 paddlespeech/utils/argparse.py
create mode 100644 paddlespeech/utils/initialize.py
diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md
new file mode 100644
index 000000000..33d53d9f5
--- /dev/null
+++ b/.github/CODE_OF_CONDUCT.md
@@ -0,0 +1,77 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Racial or political allusions
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at paddlespeech@baidu.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 000000000..1ff473308
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,30 @@
+# 💡 paddlespeech 提交代码须知
+
+### Discussed in https://github.com/PaddlePaddle/PaddleSpeech/discussions/1326
+
+
+
+Originally posted by **yt605155624** January 12, 2022
+1. 写完代码之后可以用我们的 pre-commit 检查一下代码格式,注意只改自己修改的代码的格式即可,其他的代码有可能也被改了格式,不要 add 就好
+```
+pip install pre-commit
+pre-commit run --file 你修改的代码
+```
+2. 提交 commit 中增加必要信息跳过不必要的 CI
+- 提交 asr 相关代码
+```text
+git commit -m "xxxxxx, test=asr"
+```
+- 提交 tts 相关代码
+```text
+git commit -m "xxxxxx, test=tts"
+```
+- 仅修改文档
+```text
+git commit -m "xxxxxx, test=doc"
+```
+注意:
+1. 虽然跳过了 CI,但是还要先排队排到才能跳过,所以非自己方向看到 pending 不要着急 🤣
+2. 在 `git commit --amend` 的时候才加 `test=xxx` 可能不太有效
+3. 一个 pr 多次提交 commit 注意每次都要加 `test=xxx`,因为每个 commit 都会触发 CI
+4. 删除 python 环境中已经安装好的 paddlespeech,否则可能会影响 import paddlespeech 的顺序
diff --git a/.github/ISSUE_TEMPLATE/bug-report-tts.md b/.github/ISSUE_TEMPLATE/bug-report-tts.md
index 64b33c32e..e2322c239 100644
--- a/.github/ISSUE_TEMPLATE/bug-report-tts.md
+++ b/.github/ISSUE_TEMPLATE/bug-report-tts.md
@@ -3,7 +3,6 @@ name: "\U0001F41B TTS Bug Report"
about: Create a report to help us improve
title: "[TTS]XXXX"
labels: Bug, T2S
-assignees: yt605155624
---
diff --git a/.github/stale.yml b/.github/stale.yml
index da19b6606..6b0da9b98 100644
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -6,7 +6,8 @@ daysUntilClose: 30
exemptLabels:
- Roadmap
- Bug
- - New Feature
+ - feature request
+ - Tips
# Label to use when marking an issue as stale
staleLabel: Stale
# Comment to post when marking an issue as stale. Set to `false` to disable
@@ -17,4 +18,4 @@ markComment: >
unmarkComment: false
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: >
- This issue is closed. Please re-open if needed.
\ No newline at end of file
+ This issue is closed. Please re-open if needed.
diff --git a/.gitignore b/.gitignore
index 75f56b604..4a0c43312 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@
*.egg-info
build
*output/
+.history
audio/dist/
audio/fc_patch/
diff --git a/.pre-commit-hooks/copyright-check.hook b/.pre-commit-hooks/copyright-check.hook
index 761edbc01..5a409e062 100644
--- a/.pre-commit-hooks/copyright-check.hook
+++ b/.pre-commit-hooks/copyright-check.hook
@@ -19,7 +19,7 @@ import subprocess
import platform
COPYRIGHT = '''
-Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -128,4 +128,4 @@ def main(argv=None):
if __name__ == '__main__':
- exit(main())
\ No newline at end of file
+ exit(main())
diff --git a/README.md b/README.md
index 0a12ec049..9ed823116 100644
--- a/README.md
+++ b/README.md
@@ -97,26 +97,47 @@
- Life was like a box of chocolates, you never know what you're gonna get. |
+ Life was like a box of chocolates, you never know what you're gonna get. |
|
- 早上好,今天是2020/10/29,最低温度是-3°C。 |
+ 早上好,今天是2020/10/29,最低温度是-3°C。 |
|
- 季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 |
+ 季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 |
|
+
+ 大家好,我是 parrot 虚拟老师,我们来读一首诗,我与春风皆过客,I and the spring breeze are passing by,你携秋水揽星河,you take the autumn water to take the galaxy。 |
+
+
+
+ |
+
+
+ 宜家唔系事必要你讲,但系你所讲嘅说话将会变成呈堂证供。 |
+
+
+
+ |
+
+
+ 各个国家有各个国家嘅国歌 |
+
+
+
+ |
+
@@ -157,16 +178,24 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
### Recent Update
-- 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
-- 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid).
+- 🔥 2023.04.06: Add [subtitle file (.srt format) generation example](./demos/streaming_asr_server).
+- 🔥 2023.03.14: Add SVS(Singing Voice Synthesis) examples with Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5), the effect is continuously optimized.
+- 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
+- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo (with C++ Chinese Text Frontend)](./demos/TTSArmLinux).
+- 🔥 2023.03.03 Add Voice Conversion [StarGANv2-VC synthesize pipeline](./examples/vctk/vc3).
+- 🎉 2023.02.16: Add [Cantonese TTS](./examples/canton/tts3).
+- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](./demos/speech_recognition).
+- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](./examples/tal_cs/asr1/).
+- 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](./examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
+- 🎉 2022.11.30: Add [TTS Android Demo](./demos/TTSAndroid).
- 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website
of paddlepaddle](https://www.paddlepaddle.org.cn/models).
- 👑 2022.11.18: Add [Whisper CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), support multi language recognition and translation.
-- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), Support ASR and Feature Extraction.
+- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](./demos/speech_ssl), Support ASR and Feature Extraction.
- 🎉 2022.11.17: Add [male voice for TTS](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660).
- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/runtime/examples/u2pp_ol/wenetspeech).
- 👑 2022.11.01: Add [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) for [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
-- 🔥 2022.10.26: Add [Prosody Prediction](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/rhy) for TTS.
+- 🔥 2022.10.26: Add [Prosody Prediction](./examples/other/rhy) for TTS.
- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.
- 👑 2022.10.11: Add [Wav2vec2ASR-en](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech.
- 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and [ERNIE-SAT](https://arxiv.org/abs/2211.03545) in [PaddleSpeech Web Demo](./demos/speech_web).
@@ -180,16 +209,16 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🎉 2022.06.22: All TTS models support ONNX format.
- 🍀 2022.06.17: Add [PaddleSpeech Web Demo](./demos/speech_web).
- 👑 2022.05.13: Release [PP-ASR](./docs/source/asr/PPASR.md)、[PP-TTS](./docs/source/tts/PPTTS.md)、[PP-VPR](docs/source/vpr/PPVPR.md).
-- 👏🏻 2022.05.06: `PaddleSpeech Streaming Server` is available for `Streaming ASR` with `Punctuation Restoration` and `Token Timestamp` and `Text-to-Speech`.
-- 👏🏻 2022.05.06: `PaddleSpeech Server` is available for `Audio Classification`, `Automatic Speech Recognition` and `Text-to-Speech`, `Speaker Verification` and `Punctuation Restoration`.
-- 👏🏻 2022.03.28: `PaddleSpeech CLI` is available for `Speaker Verification`.
-- 👏🏻 2021.12.10: `PaddleSpeech CLI` is available for `Audio Classification`, `Automatic Speech Recognition`, `Speech Translation (English to Chinese)` and `Text-to-Speech`.
+- 👏🏻 2022.05.06: `PaddleSpeech Streaming Server` is available for `Streaming ASR` with `Punctuation Restoration` and `Token Timestamp` and `Text-to-Speech`.
+- 👏🏻 2022.05.06: `PaddleSpeech Server` is available for `Audio Classification`, `Automatic Speech Recognition` and `Text-to-Speech`, `Speaker Verification` and `Punctuation Restoration`.
+- 👏🏻 2022.03.28: `PaddleSpeech CLI` is available for `Speaker Verification`.
+- 👏🏻 2021.12.10: `PaddleSpeech CLI` is available for `Audio Classification`, `Automatic Speech Recognition`, `Speech Translation (English to Chinese)` and `Text-to-Speech`.
### Community
- Scan the QR code below with your Wechat, you can access to official technical exchange group and get the bonus ( more than 20GB learning materials, such as papers, codes and videos ) and the live link of the lessons. Look forward to your participation.
-
+
## Installation
@@ -550,14 +579,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
- Text Frontend |
- |
-
- tn / g2p
- |
+ Text Frontend |
+ |
+
+ tn / g2p
+ |
- Acoustic Model |
+ Acoustic Model |
Tacotron2 |
LJSpeech / CSMSC |
@@ -592,6 +621,13 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en
|
+
+ DiffSinger |
+ Opencpop |
+
+ DiffSinger-opencpop
+ |
+
Vocoder |
WaveFlow |
@@ -602,9 +638,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
Parallel WaveGAN |
- LJSpeech / VCTK / CSMSC / AISHELL-3 |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop |
- PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3
+ PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop
|
@@ -623,9 +659,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
HiFiGAN |
- LJSpeech / VCTK / CSMSC / AISHELL-3 |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop |
- HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3
+ HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop
|
@@ -985,10 +1021,16 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
- Many thanks to [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) for developing a rasa chatbot,which is able to speak and listen thanks to PaddleSpeech.
- Many thanks to [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) for the C++ inference implementation of PaddleSpeech ASR.
- Many thanks to [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) for the real-time voice typing tool implementation of PaddleSpeech ASR streaming services.
-
+- Many thanks to [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) for the python3.9 prebuilt wheel for PaddleSpeech installation in Windows without Visual Studio.
Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.
+- Many thanks to [chinobing](https://github.com/chinobing)/[FastAPI-PaddleSpeech-Audio-To-Text](https://github.com/chinobing/FastAPI-PaddleSpeech-Audio-To-Text) for converting audio to text based on FastAPI and PaddleSpeech.
+- Many thanks to [MistEO](https://github.com/MistEO)/[Pallas-Bot](https://github.com/MistEO/Pallas-Bot) for QQ bot based on PaddleSpeech TTS.
## License
PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE).
+
+## Stargazers over time
+
+[![Stargazers over time](https://starchart.cc/PaddlePaddle/PaddleSpeech.svg)](https://starchart.cc/PaddlePaddle/PaddleSpeech)
diff --git a/README_cn.md b/README_cn.md
index 5cc156c9f..8b98b61ce 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -122,6 +122,27 @@
+
+ 大家好,我是 parrot 虚拟老师,我们来读一首诗,我与春风皆过客,I and the spring breeze are passing by,你携秋水揽星河,you take the autumn water to take the galaxy。 |
+
+
+
+ |
+
+
+ 宜家唔系事必要你讲,但系你所讲嘅说话将会变成呈堂证供。 |
+
+
+
+ |
+
+
+ 各个国家有各个国家嘅国歌 |
+
+
+
+ |
+
@@ -161,18 +182,24 @@
- 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。
- 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。
-
-
### 近期更新
-- 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
-- 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。
+- 👑 2023.04.06: 新增 [srt格式字幕生成功能](./demos/streaming_asr_server)。
+- 🔥 2023.03.14: 新增基于 Opencpop 数据集的 SVS (歌唱合成) 示例,包含 [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) 和 [HiFiGAN](./examples/opencpop/voc5),效果持续优化中。
+- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。
+- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例 (包含 C++ 中文文本前端模块)](./demos/TTSArmLinux)。
+- 🔥 2023.03.03: 新增声音转换模型 [StarGANv2-VC 合成流程](./examples/vctk/vc3)。
+- 🎉 2023.02.16: 新增[粤语语音合成](./examples/canton/tts3)。
+- 🔥 2023.01.10: 新增[中英混合 ASR CLI 和 Demos](./demos/speech_recognition)。
+- 👑 2023.01.06: 新增 [ASR 中英混合 tal_cs 训练推理流程](./examples/tal_cs/asr1/)。
+- 🎉 2022.12.02: 新增[端到端韵律预测全流程](./examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
+- 🎉 2022.11.30: 新增 [TTS Android 部署示例](./demos/TTSAndroid)。
- 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验!
- 👑 2022.11.18: 新增 [Whisper CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), 支持多种语言的识别与翻译。
-- 🔥 2022.11.18: 新增 [Wav2vec2 CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), 支持 ASR 和 特征提取.
+- 🔥 2022.11.18: 新增 [Wav2vec2 CLI 和 Demos](./demos/speech_ssl), 支持 ASR 和特征提取。
- 🎉 2022.11.17: TTS 新增[高质量男性音色](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660)。
-- 🔥 2022.11.07: 新增 [U2/U2++ 高性能流式 ASR C++ 部署](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/speechx/examples/u2pp_ol/wenetspeech)。
+- 🔥 2022.11.07: 新增 [U2/U2++ 高性能流式 ASR C++ 部署](./speechx/examples/u2pp_ol/wenetspeech)。
- 👑 2022.11.01: [中英文混合 TTS](./examples/zh_en_tts/tts3) 新增 [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) 模块。
-- 🔥 2022.10.26: TTS 新增[韵律预测](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/rhy)功能。
+- 🔥 2022.10.26: TTS 新增[韵律预测](./examples/other/rhy)功能。
- 🎉 2022.10.21: TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。
- 👑 2022.10.11: 新增 [Wav2vec2ASR-en](./examples/librispeech/asr3), 在 LibriSpeech 上针对 ASR 任务对 wav2vec2.0 的 finetuning。
- 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 [ERNIE-SAT](https://arxiv.org/abs/2211.03545) 到 [PaddleSpeech 网页应用](./demos/speech_web)。
@@ -200,7 +227,7 @@
微信扫描二维码关注公众号,点击“马上报名”填写问卷加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
-
+
@@ -551,43 +578,50 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
tn / g2p
|
-
-
- 声学模型 |
+
+
+ 声学模型 |
Tacotron2 |
LJSpeech / CSMSC |
tacotron2-ljspeech / tacotron2-csmsc
|
-
-
+
+
Transformer TTS |
LJSpeech |
transformer-ljspeech
|
-
-
+
+
SpeedySpeech |
CSMSC |
speedyspeech-csmsc
|
-
-
+
+
FastSpeech2 |
LJSpeech / VCTK / CSMSC / AISHELL-3 / ZH_EN / finetune |
fastspeech2-ljspeech / fastspeech2-vctk / fastspeech2-csmsc / fastspeech2-aishell3 / fastspeech2-zh_en / fastspeech2-finetune
|
-
-
+
+
ERNIE-SAT |
VCTK / AISHELL-3 / ZH_EN |
ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en
|
-
+
+
+ DiffSinger |
+ Opencpop |
+
+ DiffSinger-opencpop
+ |
+
声码器 |
WaveFlow |
@@ -598,9 +632,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
Parallel WaveGAN |
- LJSpeech / VCTK / CSMSC / AISHELL-3 |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop |
- PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3
+ PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop
|
@@ -619,9 +653,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
HiFiGAN |
- LJSpeech / VCTK / CSMSC / AISHELL-3 |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop |
- HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3
+ HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop
|
@@ -678,6 +712,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
+
**声音分类**
@@ -986,13 +1021,19 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
- 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。
- 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。
- 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。
-
- 非常感谢 [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) 基于 PaddleSpeech 的 ASR 与 TTS 设计的可听、说对话机器人。
- 非常感谢 [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) 对 PaddleSpeech 的 ASR 进行 C++ 推理实现。
- 非常感谢 [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) 基于 PaddleSpeech 的 ASR 流式服务实现的实时语音输入法工具。
+- 非常感谢 [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) 对PaddleSpeech在Windows下的安装提供了无需Visual Studio,基于python3.9的预编译依赖安装包。
+- 非常感谢 [chinobing](https://github.com/chinobing)/[FastAPI-PaddleSpeech-Audio-To-Text](https://github.com/chinobing/FastAPI-PaddleSpeech-Audio-To-Text) 利用 FastAPI 实现 PaddleSpeech 语音转文字,文件上传、分割、转换进度显示、后台更新任务并以 csv 格式输出。
+- 非常感谢 [MistEO](https://github.com/MistEO)/[Pallas-Bot](https://github.com/MistEO/Pallas-Bot) 基于 PaddleSpeech TTS 的 QQ Bot 项目。
此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。
## License
PaddleSpeech 在 [Apache-2.0 许可](./LICENSE) 下提供。
+
+## Stargazers over time
+
+[![Stargazers over time](https://starchart.cc/PaddlePaddle/PaddleSpeech.svg)](https://starchart.cc/PaddlePaddle/PaddleSpeech)
diff --git a/audio/CMakeLists.txt b/audio/CMakeLists.txt
index d9ae63cd2..021e24477 100644
--- a/audio/CMakeLists.txt
+++ b/audio/CMakeLists.txt
@@ -41,24 +41,18 @@ option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON)
# cmake
set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake;${PROJECT_SOURCE_DIR}/cmake/external")
-if (NOT MSVC)
- find_package(GFortranLibs REQUIRED)
- include(FortranCInterface)
- include(FindGFortranLibs REQUIRED)
-endif()
-
# fc_patch dir
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})
set(THIRD_PARTY_PATH ${fc_patch})
-include(openblas)
-
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
include(cmake/pybind.cmake)
include_directories(${PYTHON_INCLUDE_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/paddleaudio/third_party/)
+
# packages
find_package(Python3 COMPONENTS Interpreter Development)
diff --git a/audio/README.md b/audio/README.md
index bfd8625f0..d42d41229 100644
--- a/audio/README.md
+++ b/audio/README.md
@@ -2,33 +2,22 @@
安装方式: pip install paddleaudio
-目前支持的平台:Linux:
+目前支持的平台:Linux, Mac, Windows
## Environment
## Build wheel
+cmd: python setup.py bdist_wheel
Linux test build whl environment:
-* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2`
* os - Ubuntu 16.04.7 LTS
-* gcc/g++/gfortran - 8.2.0
+* gcc/g++ - 8.2.0
* cmake - 3.18.0 (need install)
-* [How to Install Docker](https://docs.docker.com/engine/install/)
-* [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
-
-1. First to launch docker container.
-
-```
-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash
-```
-2. python setup.py bdist_wheel
-
MAC:test build whl envrioment:
* os
-* gcc/g++/gfortran 12.2.0
+* gcc/g++ 12.2.0
* cpu Intel Xeon E5 x86_64
Windows:
-not support: paddleaudio C++ extension lib (sox io, kaldi native fbank)
-python setup.py bdist_wheel
+does not support paddleaudio C++ extension lib (sox io, kaldi native fbank)
diff --git a/audio/paddleaudio/CMakeLists.txt b/audio/paddleaudio/CMakeLists.txt
index dbf2bd3eb..c6b43c780 100644
--- a/audio/paddleaudio/CMakeLists.txt
+++ b/audio/paddleaudio/CMakeLists.txt
@@ -1,19 +1,3 @@
add_subdirectory(third_party)
add_subdirectory(src)
-
-if (APPLE)
- file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib
- DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib)
-endif(APPLE)
-
-if (UNIX AND NOT APPLE)
- file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgfortran.so.5
- DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
-
- file(COPY ${GFORTRAN_LIBRARIES_DIR}/libquadmath.so.0
- DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
-
- file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.so.1
- DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
-endif()
diff --git a/audio/paddleaudio/_internal/module_utils.py b/audio/paddleaudio/_internal/module_utils.py
index 7b3230de9..becd23cd8 100644
--- a/audio/paddleaudio/_internal/module_utils.py
+++ b/audio/paddleaudio/_internal/module_utils.py
@@ -67,8 +67,11 @@ def deprecated(direction: str, version: Optional[str]=None):
def is_kaldi_available():
- return is_module_available("paddleaudio._paddleaudio")
-
+ try:
+ from paddleaudio import _paddleaudio
+ return True
+ except Exception:
+ return False
def requires_kaldi():
if is_kaldi_available():
@@ -128,9 +131,11 @@ def requires_soundfile():
def is_sox_available():
- if platform.system() == "Windows": # not support sox in windows
+ try:
+ from paddleaudio import _paddleaudio
+ return True
+ except Exception:
return False
- return is_module_available("paddleaudio._paddleaudio")
def requires_sox():
diff --git a/audio/paddleaudio/backends/soundfile_backend.py b/audio/paddleaudio/backends/soundfile_backend.py
index ae7b5b52d..9195ea097 100644
--- a/audio/paddleaudio/backends/soundfile_backend.py
+++ b/audio/paddleaudio/backends/soundfile_backend.py
@@ -191,7 +191,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
if sr <= 0:
raise ParameterError(
- f'Sample rate should be larger than 0, recieved sr = {sr}')
+ f'Sample rate should be larger than 0, received sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(
diff --git a/audio/paddleaudio/kaldi/__init__.py b/audio/paddleaudio/kaldi/__init__.py
index f951e280a..a0ae644d1 100644
--- a/audio/paddleaudio/kaldi/__init__.py
+++ b/audio/paddleaudio/kaldi/__init__.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .kaldi import fbank
-from .kaldi import pitch
+#from .kaldi import pitch
diff --git a/audio/paddleaudio/kaldi/kaldi.py b/audio/paddleaudio/kaldi/kaldi.py
index 16969d772..0f080de04 100644
--- a/audio/paddleaudio/kaldi/kaldi.py
+++ b/audio/paddleaudio/kaldi/kaldi.py
@@ -16,7 +16,6 @@ from paddleaudio._internal import module_utils
__all__ = [
'fbank',
- 'pitch',
]
@@ -33,8 +32,6 @@ def fbank(
round_to_power_of_two: bool=True,
blackman_coeff: float=0.42,
snip_edges: bool=True,
- allow_downsample: bool=False,
- allow_upsample: bool=False,
max_feature_vectors: int=-1,
num_bins: int=23,
low_freq: float=20,
@@ -62,8 +59,6 @@ def fbank(
frame_opts.round_to_power_of_two = round_to_power_of_two
frame_opts.blackman_coeff = blackman_coeff
frame_opts.snip_edges = snip_edges
- frame_opts.allow_downsample = allow_downsample
- frame_opts.allow_upsample = allow_upsample
frame_opts.max_feature_vectors = max_feature_vectors
mel_opts.num_bins = num_bins
@@ -85,48 +80,48 @@ def fbank(
return feat
-@module_utils.requires_kaldi()
-def pitch(wav,
- samp_freq: int=16000,
- frame_shift_ms: float=10.0,
- frame_length_ms: float=25.0,
- preemph_coeff: float=0.0,
- min_f0: int=50,
- max_f0: int=400,
- soft_min_f0: float=10.0,
- penalty_factor: float=0.1,
- lowpass_cutoff: int=1000,
- resample_freq: int=4000,
- delta_pitch: float=0.005,
- nccf_ballast: int=7000,
- lowpass_filter_width: int=1,
- upsample_filter_width: int=5,
- max_frames_latency: int=0,
- frames_per_chunk: int=0,
- simulate_first_pass_online: bool=False,
- recompute_frame: int=500,
- nccf_ballast_online: bool=False,
- snip_edges: bool=True):
- pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
- pitch_opts.samp_freq = samp_freq
- pitch_opts.frame_shift_ms = frame_shift_ms
- pitch_opts.frame_length_ms = frame_length_ms
- pitch_opts.preemph_coeff = preemph_coeff
- pitch_opts.min_f0 = min_f0
- pitch_opts.max_f0 = max_f0
- pitch_opts.soft_min_f0 = soft_min_f0
- pitch_opts.penalty_factor = penalty_factor
- pitch_opts.lowpass_cutoff = lowpass_cutoff
- pitch_opts.resample_freq = resample_freq
- pitch_opts.delta_pitch = delta_pitch
- pitch_opts.nccf_ballast = nccf_ballast
- pitch_opts.lowpass_filter_width = lowpass_filter_width
- pitch_opts.upsample_filter_width = upsample_filter_width
- pitch_opts.max_frames_latency = max_frames_latency
- pitch_opts.frames_per_chunk = frames_per_chunk
- pitch_opts.simulate_first_pass_online = simulate_first_pass_online
- pitch_opts.recompute_frame = recompute_frame
- pitch_opts.nccf_ballast_online = nccf_ballast_online
- pitch_opts.snip_edges = snip_edges
- pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
- return pitch
+#@module_utils.requires_kaldi()
+#def pitch(wav,
+#samp_freq: int=16000,
+#frame_shift_ms: float=10.0,
+#frame_length_ms: float=25.0,
+#preemph_coeff: float=0.0,
+#min_f0: int=50,
+#max_f0: int=400,
+#soft_min_f0: float=10.0,
+#penalty_factor: float=0.1,
+#lowpass_cutoff: int=1000,
+#resample_freq: int=4000,
+#delta_pitch: float=0.005,
+#nccf_ballast: int=7000,
+#lowpass_filter_width: int=1,
+#upsample_filter_width: int=5,
+#max_frames_latency: int=0,
+#frames_per_chunk: int=0,
+#simulate_first_pass_online: bool=False,
+#recompute_frame: int=500,
+#nccf_ballast_online: bool=False,
+#snip_edges: bool=True):
+#pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
+#pitch_opts.samp_freq = samp_freq
+#pitch_opts.frame_shift_ms = frame_shift_ms
+#pitch_opts.frame_length_ms = frame_length_ms
+#pitch_opts.preemph_coeff = preemph_coeff
+#pitch_opts.min_f0 = min_f0
+#pitch_opts.max_f0 = max_f0
+#pitch_opts.soft_min_f0 = soft_min_f0
+#pitch_opts.penalty_factor = penalty_factor
+#pitch_opts.lowpass_cutoff = lowpass_cutoff
+#pitch_opts.resample_freq = resample_freq
+#pitch_opts.delta_pitch = delta_pitch
+#pitch_opts.nccf_ballast = nccf_ballast
+#pitch_opts.lowpass_filter_width = lowpass_filter_width
+#pitch_opts.upsample_filter_width = upsample_filter_width
+#pitch_opts.max_frames_latency = max_frames_latency
+#pitch_opts.frames_per_chunk = frames_per_chunk
+#pitch_opts.simulate_first_pass_online = simulate_first_pass_online
+#pitch_opts.recompute_frame = recompute_frame
+#pitch_opts.nccf_ballast_online = nccf_ballast_online
+#pitch_opts.snip_edges = snip_edges
+#pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
+#return pitch
diff --git a/audio/paddleaudio/src/CMakeLists.txt b/audio/paddleaudio/src/CMakeLists.txt
index fb6f32092..21e0f170d 100644
--- a/audio/paddleaudio/src/CMakeLists.txt
+++ b/audio/paddleaudio/src/CMakeLists.txt
@@ -52,7 +52,7 @@ if(BUILD_KALDI)
list(
APPEND
LIBPADDLEAUDIO_LINK_LIBRARIES
- libkaldi
+ kaldi-native-fbank-core
)
list(
APPEND
@@ -92,14 +92,6 @@ define_library(
"${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
)
-if (APPLE)
- add_custom_command(TARGET libpaddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/libgcc_s.1.1.dylib" libpaddleaudio.so)
-endif(APPLE)
-
-if (UNIX AND NOT APPLE)
- set_target_properties(libpaddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN")
-endif()
-
if (APPLE)
set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
else()
@@ -207,11 +199,3 @@ define_extension(
# )
# endif()
endif()
-
-if (APPLE)
- add_custom_command(TARGET _paddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/lib/libgcc_s.1.1.dylib" _paddleaudio.so)
-endif(APPLE)
-
-if (UNIX AND NOT APPLE)
- set_target_properties(_paddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN/lib")
-endif()
diff --git a/audio/paddleaudio/src/pybind/kaldi/feature_common.h b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
index 05522bb7e..6571fa3eb 100644
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
@@ -16,7 +16,7 @@
#include "pybind11/pybind11.h"
#include "pybind11/numpy.h"
-#include "feat/feature-window.h"
+#include "kaldi-native-fbank/csrc/feature-window.h"
namespace paddleaudio {
namespace kaldi {
@@ -28,18 +28,18 @@ class StreamingFeatureTpl {
public:
typedef typename F::Options Options;
StreamingFeatureTpl(const Options& opts);
- bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
- ::kaldi::Vector<::kaldi::BaseFloat>* feats);
- void Reset() { remained_wav_.Resize(0); }
+ bool ComputeFeature(const std::vector& wav,
+ std::vector* feats);
+ void Reset() { remained_wav_.resize(0); }
int Dim() { return computer_.Dim(); }
private:
- bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
- ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+ bool Compute(const std::vector& waves,
+ std::vector* feats);
Options opts_;
- ::kaldi::FeatureWindowFunction window_function_;
- ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
+ knf::FeatureWindowFunction window_function_;
+ std::vector remained_wav_;
F computer_;
};
diff --git a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
index c894b9775..985d586fe 100644
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "base/kaldi-common.h"
namespace paddleaudio {
namespace kaldi {
@@ -25,24 +24,29 @@ StreamingFeatureTpl::StreamingFeatureTpl(const Options& opts)
template
bool StreamingFeatureTpl::ComputeFeature(
- const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
- ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+ const std::vector& wav,
+ std::vector* feats) {
// append remaned waves
- ::kaldi::int32 wav_len = wav.Dim();
+ int wav_len = wav.size();
if (wav_len == 0) return false;
- ::kaldi::int32 left_len = remained_wav_.Dim();
- ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
- waves.Range(0, left_len).CopyFromVec(remained_wav_);
- waves.Range(left_len, wav_len).CopyFromVec(wav);
+ int left_len = remained_wav_.size();
+ std::vector waves(left_len + wav_len);
+ std::memcpy(waves.data(),
+ remained_wav_.data(),
+ left_len * sizeof(float));
+ std::memcpy(waves.data() + left_len,
+ wav.data(),
+ wav_len * sizeof(float));
// cache remaned waves
- ::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
- ::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
- ::kaldi::int32 frame_shift = frame_opts.WindowShift();
- ::kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames;
- remained_wav_.Resize(left_samples);
- remained_wav_.CopyFromVec(
- waves.Range(frame_shift * num_frames, left_samples));
+ knf::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
+ int num_frames = knf::NumFrames(waves.size(), frame_opts);
+ int frame_shift = frame_opts.WindowShift();
+ int left_samples = waves.size() - frame_shift * num_frames;
+ remained_wav_.resize(left_samples);
+ std::memcpy(remained_wav_.data(),
+ waves.data() + frame_shift * num_frames,
+ left_samples * sizeof(float));
// compute speech feature
Compute(waves, feats);
@@ -51,40 +55,39 @@ bool StreamingFeatureTpl::ComputeFeature(
// Compute feat
template
-bool StreamingFeatureTpl::Compute(
- const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
- ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
- ::kaldi::BaseFloat vtln_warp = 1.0;
- const ::kaldi::FrameExtractionOptions& frame_opts =
- computer_.GetFrameOptions();
- ::kaldi::int32 num_samples = waves.Dim();
- ::kaldi::int32 frame_length = frame_opts.WindowSize();
- ::kaldi::int32 sample_rate = frame_opts.samp_freq;
+bool StreamingFeatureTpl::Compute(const std::vector& waves,
+ std::vector* feats) {
+ const knf::FrameExtractionOptions& frame_opts = computer_.GetFrameOptions();
+ int num_samples = waves.size();
+ int frame_length = frame_opts.WindowSize();
+ int sample_rate = frame_opts.samp_freq;
if (num_samples < frame_length) {
- return false;
+ return true;
}
- ::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts);
- feats->Resize(num_frames * Dim());
+ int num_frames = knf::NumFrames(num_samples, frame_opts);
+ feats->resize(num_frames * Dim());
- ::kaldi::Vector<::kaldi::BaseFloat> window;
+ std::vector window;
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
- for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
- ::kaldi::BaseFloat raw_log_energy = 0.0;
- ::kaldi::ExtractWindow(0,
- waves,
- frame,
- frame_opts,
- window_function_,
- &window,
- need_raw_log_energy ? &raw_log_energy : NULL);
+ for (int frame = 0; frame < num_frames; frame++) {
+ std::fill(window.begin(), window.end(), 0);
+ float raw_log_energy = 0.0;
+ float vtln_warp = 1.0;
+ knf::ExtractWindow(0,
+ waves,
+ frame,
+ frame_opts,
+ window_function_,
+ &window,
+ need_raw_log_energy ? &raw_log_energy : NULL);
- ::kaldi::Vector<::kaldi::BaseFloat> this_feature(computer_.Dim(),
- ::kaldi::kUndefined);
- computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
- ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
- feats->Data() + frame * Dim(), Dim());
- output_row.CopyFromVec(this_feature);
+ std::vector this_feature(computer_.Dim());
+ computer_.Compute(
+ raw_log_energy, vtln_warp, &window, this_feature.data());
+ std::memcpy(feats->data() + frame * Dim(),
+ this_feature.data(),
+ sizeof(float) * Dim());
}
return true;
}
diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
index 40e3786e8..83df454c5 100644
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
@@ -13,16 +13,16 @@
// limitations under the License.
#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
-#include "feat/pitch-functions.h"
+//#include "feat/pitch-functions.h"
namespace paddleaudio {
namespace kaldi {
bool InitFbank(
- ::kaldi::FrameExtractionOptions frame_opts,
- ::kaldi::MelBanksOptions mel_opts,
+ knf::FrameExtractionOptions frame_opts,
+ knf::MelBanksOptions mel_opts,
FbankOptions fbank_opts) {
- ::kaldi::FbankOptions opts;
+ knf::FbankOptions opts;
opts.frame_opts = frame_opts;
opts.mel_opts = mel_opts;
opts.use_energy = fbank_opts.use_energy;
@@ -41,8 +41,8 @@ py::array_t ComputeFbankStreaming(const py::array_t& wav) {
}
py::array_t ComputeFbank(
- ::kaldi::FrameExtractionOptions frame_opts,
- ::kaldi::MelBanksOptions mel_opts,
+ knf::FrameExtractionOptions frame_opts,
+ knf::MelBanksOptions mel_opts,
FbankOptions fbank_opts,
const py::array_t& wav) {
InitFbank(frame_opts, mel_opts, fbank_opts);
@@ -55,21 +55,21 @@ void ResetFbank() {
paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ResetFbank();
}
-py::array_t ComputeKaldiPitch(
- const ::kaldi::PitchExtractionOptions& opts,
- const py::array_t& wav) {
- py::buffer_info info = wav.request();
- ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+//py::array_t ComputeKaldiPitch(
+ //const ::kaldi::PitchExtractionOptions& opts,
+ //const py::array_t& wav) {
+ //py::buffer_info info = wav.request();
+ //::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
- ::kaldi::Matrix<::kaldi::BaseFloat> features;
- ::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
- auto result = py::array_t({features.NumRows(), features.NumCols()});
- for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
- std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
- sizeof(float)*features.NumCols());
- }
- return result;
-}
+ //::kaldi::Matrix<::kaldi::BaseFloat> features;
+ //::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
+ //auto result = py::array_t({features.NumRows(), features.NumCols()});
+ //for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
+ //std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
+ //sizeof(float)*features.NumCols());
+ //}
+ //return result;
+//}
} // namespace kaldi
} // namespace paddleaudio
diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
index e059c52c1..031ec863b 100644
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
@@ -19,7 +19,7 @@
#include
#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
-#include "feat/pitch-functions.h"
+//#include "feat/pitch-functions.h"
namespace py = pybind11;
@@ -42,13 +42,13 @@ struct FbankOptions{
};
bool InitFbank(
- ::kaldi::FrameExtractionOptions frame_opts,
- ::kaldi::MelBanksOptions mel_opts,
+ knf::FrameExtractionOptions frame_opts,
+ knf::MelBanksOptions mel_opts,
FbankOptions fbank_opts);
py::array_t ComputeFbank(
- ::kaldi::FrameExtractionOptions frame_opts,
- ::kaldi::MelBanksOptions mel_opts,
+ knf::FrameExtractionOptions frame_opts,
+ knf::MelBanksOptions mel_opts,
FbankOptions fbank_opts,
const py::array_t& wav);
@@ -56,9 +56,9 @@ py::array_t ComputeFbankStreaming(const py::array_t& wav);
void ResetFbank();
-py::array_t ComputeKaldiPitch(
- const ::kaldi::PitchExtractionOptions& opts,
- const py::array_t& wav);
+//py::array_t ComputeKaldiPitch(
+ //const ::kaldi::PitchExtractionOptions& opts,
+ //const py::array_t& wav);
} // namespace kaldi
} // namespace paddleaudio
diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
index 79558046b..8b8ff18be 100644
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
@@ -22,7 +22,7 @@ KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
return &instance;
}
-bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
+bool KaldiFeatureWrapper::InitFbank(knf::FbankOptions opts) {
fbank_.reset(new Fbank(opts));
return true;
}
@@ -30,21 +30,18 @@ bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
py::array_t KaldiFeatureWrapper::ComputeFbank(
const py::array_t wav) {
py::buffer_info info = wav.request();
- ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+ std::vector input_wav((float*)info.ptr, (float*)info.ptr + info.size);
- ::kaldi::Vector<::kaldi::BaseFloat> feats;
+ std::vector feats;
bool flag = fbank_->ComputeFeature(input_wav, &feats);
- if (flag == false || feats.Dim() == 0) return py::array_t();
- auto result = py::array_t(feats.Dim());
+ if (flag == false || feats.size() == 0) return py::array_t();
+ auto result = py::array_t(feats.size());
py::buffer_info xs = result.request();
- std::cout << std::endl;
float* res_ptr = (float*)xs.ptr;
- for (int idx = 0; idx < feats.Dim(); ++idx) {
- *res_ptr = feats(idx);
- res_ptr++;
- }
-
- return result.reshape({feats.Dim() / Dim(), Dim()});
+ std::memcpy(res_ptr, feats.data(), sizeof(float)*feats.size());
+ std::vector shape{static_cast(feats.size() / Dim()),
+ static_cast(Dim())};
+ return result.reshape(shape);
}
} // namesapce kaldi
diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
index bee1eee02..daad2d587 100644
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
@@ -14,20 +14,18 @@
#pragma once
-#include "base/kaldi-common.h"
-#include "feat/feature-fbank.h"
-
+#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h"
#include "paddleaudio/src/pybind/kaldi/feature_common.h"
namespace paddleaudio {
namespace kaldi {
-typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
+typedef StreamingFeatureTpl Fbank;
class KaldiFeatureWrapper {
public:
static KaldiFeatureWrapper* GetInstance();
- bool InitFbank(::kaldi::FbankOptions opts);
+ bool InitFbank(knf::FbankOptions opts);
py::array_t ComputeFbank(const py::array_t wav);
int Dim() { return fbank_->Dim(); }
void ResetFbank() { fbank_->Reset(); }
diff --git a/audio/paddleaudio/src/pybind/pybind.cpp b/audio/paddleaudio/src/pybind/pybind.cpp
index 692e80995..510712034 100644
--- a/audio/paddleaudio/src/pybind/pybind.cpp
+++ b/audio/paddleaudio/src/pybind/pybind.cpp
@@ -2,7 +2,7 @@
#ifdef INCLUDE_KALDI
#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
-#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
+#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h"
#endif
#ifdef INCLUDE_SOX
@@ -89,53 +89,51 @@ PYBIND11_MODULE(_paddleaudio, m) {
#ifdef INCLUDE_KALDI
m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
- py::class_(m, "PitchExtractionOptions")
- .def(py::init<>())
- .def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
- .def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
- .def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
- .def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
- .def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
- .def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
- .def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
- .def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
- .def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
- .def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
- .def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
- .def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
- .def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
- .def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
- .def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
- .def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
- .def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
- .def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
- .def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
- .def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
- m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
- py::class_(m, "FrameExtractionOptions")
+ //py::class_(m, "PitchExtractionOptions")
+ //.def(py::init<>())
+ //.def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
+ //.def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
+ //.def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
+ //.def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
+ //.def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
+ //.def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
+ //.def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
+ //.def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
+ //.def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
+ //.def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
+ //.def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
+ //.def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
+ //.def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
+ //.def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
+ //.def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
+ //.def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
+ //.def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
+ //.def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
+ //.def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
+ //.def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
+ //m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
+ py::class_(m, "FrameExtractionOptions")
.def(py::init<>())
- .def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
- .def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)
- .def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
- .def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)
- .def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)
- .def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)
- .def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
- .def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)
- .def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)
- .def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
- .def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
- .def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
- .def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
- py::class_(m, "MelBanksOptions")
+ .def_readwrite("samp_freq", &knf::FrameExtractionOptions::samp_freq)
+ .def_readwrite("frame_shift_ms", &knf::FrameExtractionOptions::frame_shift_ms)
+ .def_readwrite("frame_length_ms", &knf::FrameExtractionOptions::frame_length_ms)
+ .def_readwrite("dither", &knf::FrameExtractionOptions::dither)
+ .def_readwrite("preemph_coeff", &knf::FrameExtractionOptions::preemph_coeff)
+ .def_readwrite("remove_dc_offset", &knf::FrameExtractionOptions::remove_dc_offset)
+ .def_readwrite("window_type", &knf::FrameExtractionOptions::window_type)
+ .def_readwrite("round_to_power_of_two", &knf::FrameExtractionOptions::round_to_power_of_two)
+ .def_readwrite("blackman_coeff", &knf::FrameExtractionOptions::blackman_coeff)
+ .def_readwrite("snip_edges", &knf::FrameExtractionOptions::snip_edges)
+ .def_readwrite("max_feature_vectors", &knf::FrameExtractionOptions::max_feature_vectors);
+ py::class_(m, "MelBanksOptions")
.def(py::init<>())
- .def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
- .def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
- .def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
- .def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
- .def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
- .def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
- .def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);
+ .def_readwrite("num_bins", &knf::MelBanksOptions::num_bins)
+ .def_readwrite("low_freq", &knf::MelBanksOptions::low_freq)
+ .def_readwrite("high_freq", &knf::MelBanksOptions::high_freq)
+ .def_readwrite("vtln_low", &knf::MelBanksOptions::vtln_low)
+ .def_readwrite("vtln_high", &knf::MelBanksOptions::vtln_high)
+ .def_readwrite("debug_mel", &knf::MelBanksOptions::debug_mel)
+ .def_readwrite("htk_mode", &knf::MelBanksOptions::htk_mode);
py::class_(m, "FbankOptions")
.def(py::init<>())
diff --git a/audio/paddleaudio/third_party/CMakeLists.txt b/audio/paddleaudio/third_party/CMakeLists.txt
index 43288f39b..4b85bada0 100644
--- a/audio/paddleaudio/third_party/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/CMakeLists.txt
@@ -11,5 +11,6 @@ endif()
# kaldi
################################################################################
if (BUILD_KALDI)
- add_subdirectory(kaldi)
-endif()
\ No newline at end of file
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+ add_subdirectory(kaldi-native-fbank/csrc)
+endif()
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt
new file mode 100644
index 000000000..176607fc0
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt
@@ -0,0 +1,22 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../)
+add_library(kaldi-native-fbank-core
+ feature-fbank.cc
+ feature-functions.cc
+ feature-window.cc
+ fftsg.c
+ log.cc
+ mel-computations.cc
+ rfft.cc
+)
+# We are using std::call_once() in log.h, which requires us to link with -pthread
+if(NOT WIN32)
+ target_link_libraries(kaldi-native-fbank-core -pthread)
+endif()
+
+if(KNF_HAVE_EXECINFO_H)
+ target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_EXECINFO_H=1)
+endif()
+
+if(KNF_HAVE_CXXABI_H)
+ target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_CXXABI_H=1)
+endif()
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc
new file mode 100644
index 000000000..740ee17e9
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-fbank.cc
+//
+#include "kaldi-native-fbank/csrc/feature-fbank.h"
+
+#include <cmath>
+
+#include "kaldi-native-fbank/csrc/feature-functions.h"
+
+namespace knf {
+
+static void Sqrt(float *in_out, int32_t n) {
+ for (int32_t i = 0; i != n; ++i) {
+ in_out[i] = std::sqrt(in_out[i]);
+ }
+}
+
+std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) {
+ os << opts.ToString();
+ return os;
+}
+
+FbankComputer::FbankComputer(const FbankOptions &opts)
+ : opts_(opts), rfft_(opts.frame_opts.PaddedWindowSize()) {
+ if (opts.energy_floor > 0.0f) {
+ log_energy_floor_ = logf(opts.energy_floor);
+ }
+
+ // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
+ // [note: this call caches it.]
+ GetMelBanks(1.0f);
+}
+
+FbankComputer::~FbankComputer() {
+ for (auto iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter)
+ delete iter->second;
+}
+
+const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) {
+ MelBanks *this_mel_banks = nullptr;
+
+ // std::map::iterator iter = mel_banks_.find(vtln_warp);
+ auto iter = mel_banks_.find(vtln_warp);
+ if (iter == mel_banks_.end()) {
+ this_mel_banks = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp);
+ mel_banks_[vtln_warp] = this_mel_banks;
+ } else {
+ this_mel_banks = iter->second;
+ }
+ return this_mel_banks;
+}
+
+void FbankComputer::Compute(float signal_raw_log_energy, float vtln_warp,
+ std::vector<float> *signal_frame, float *feature) {
+ const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
+
+ KNF_CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize());
+
+ // Compute energy after window function (not the raw one).
+ if (opts_.use_energy && !opts_.raw_energy) {
+ signal_raw_log_energy = std::log(
+ std::max(InnerProduct(signal_frame->data(), signal_frame->data(),
+ signal_frame->size()),
+ std::numeric_limits<float>::epsilon()));
+ }
+ rfft_.Compute(signal_frame->data()); // signal_frame is modified in-place
+ ComputePowerSpectrum(signal_frame);
+
+ // Use magnitude instead of power if requested.
+ if (!opts_.use_power) {
+ Sqrt(signal_frame->data(), signal_frame->size() / 2 + 1);
+ }
+
+ int32_t mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
+
+ // Its length is opts_.mel_opts.num_bins
+ float *mel_energies = feature + mel_offset;
+
+ // Sum with mel filter banks over the power spectrum
+ mel_banks.Compute(signal_frame->data(), mel_energies);
+
+ if (opts_.use_log_fbank) {
+ // Avoid log of zero (which should be prevented anyway by dithering).
+ for (int32_t i = 0; i != opts_.mel_opts.num_bins; ++i) {
+ auto t = std::max(mel_energies[i], std::numeric_limits<float>::epsilon());
+ mel_energies[i] = std::log(t);
+ }
+ }
+
+ // Copy energy as first value (or the last, if htk_compat == true).
+ if (opts_.use_energy) {
+ if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
+ signal_raw_log_energy = log_energy_floor_;
+ }
+ int32_t energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
+ feature[energy_index] = signal_raw_log_energy;
+ }
+}
+
+} // namespace knf
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h
new file mode 100644
index 000000000..0ef3fac0d
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h
@@ -0,0 +1,132 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-fbank.h
+
+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
+
+#include <map>