From e66d1b7d9655bb44a588f0e1c1269f07bc4c6dd9 Mon Sep 17 00:00:00 2001
From: Yang Zhou
Date: Sun, 18 Sep 2022 12:43:55 +0800
Subject: [PATCH] update audio api in app

---
 audio/docs/Makefile | 19 +
 audio/docs/README.md | 24 +
 audio/docs/images/paddle.png | Bin 0 -> 5043 bytes
 audio/docs/make.bat | 35 +
 audio/docs/source/_static/custom.css | 5 +
 audio/docs/source/_templates/module.rst_t | 9 +
 audio/docs/source/_templates/package.rst_t | 57 ++
 audio/docs/source/_templates/toc.rst_t | 8 +
 audio/docs/source/conf.py | 181 ++++
 audio/docs/source/index.rst | 22 +
 .../source/paddleaudio.backends.common.rst | 7 +
 .../paddleaudio.backends.no_backend.rst | 4 +-
 .../source/source/paddleaudio.backends.rst | 19 +
 ...paddleaudio.backends.soundfile_backend.rst | 7 +
 .../paddleaudio.backends.sox_io_backend.rst | 4 +-
 .../source/paddleaudio.backends.utils.rst | 7 +
 .../source/paddleaudio.compliance.kaldi.rst | 7 +
 .../source/paddleaudio.compliance.librosa.rst | 7 +
 .../source/source/paddleaudio.compliance.rst | 16 +
 .../source/paddleaudio.datasets.dataset.rst | 7 +
 .../source/paddleaudio.datasets.esc50.rst | 7 +
 .../source/paddleaudio.datasets.gtzan.rst | 7 +
 .../source/paddleaudio.datasets.hey_snips.rst | 7 +
 .../paddleaudio.datasets.rirs_noises.rst | 4 +-
 .../source/source/paddleaudio.datasets.rst | 22 +
 .../source/paddleaudio.datasets.tess.rst | 7 +
 .../paddleaudio.datasets.urban_sound.rst | 4 +-
 .../source/paddleaudio.datasets.voxceleb.rst | 4 +-
 .../source/paddleaudio.features.layers.rst | 7 +
 .../source/source/paddleaudio.features.rst | 15 +
 .../paddleaudio.functional.functional.rst | 7 +
 .../source/source/paddleaudio.functional.rst | 16 +
 .../source/paddleaudio.functional.window.rst | 7 +
 audio/docs/source/source/paddleaudio.io.rst | 7 +
 .../source/source/paddleaudio.metric.eer.rst | 7 +
 .../docs/source/source/paddleaudio.metric.rst | 15 +
 audio/docs/source/source/paddleaudio.rst | 22 +
 .../source/source/paddleaudio.sox_effects.rst | 7 +
 .../api/paddlespeech.audio.backends.rst | 16 -
 ...peech.audio.backends.soundfile_backend.rst | 7 -
 ...addlespeech.audio.backends.sox_backend.rst | 7 -
 .../api/paddlespeech.audio.compliance.rst | 16 -
 ...addlespeech.audio.datasets.rirs_noises.rst | 7 -
 .../api/paddlespeech.audio.datasets.rst | 22 -
 ...addlespeech.audio.datasets.urban_sound.rst | 7 -
 ...ddlespeech.audio.functional.functional.rst | 7 -
 .../api/paddlespeech.audio.functional.rst | 16 -
 .../api/paddlespeech.audio.kaldi.kaldi.rst | 7 +
 docs/source/api/paddlespeech.audio.kaldi.rst | 15 +
 docs/source/api/paddlespeech.audio.metric.rst | 15 -
 docs/source/api/paddlespeech.audio.rst | 7 +-
 .../api/paddlespeech.audio.sox_effects.rst | 8 +
 ...lespeech.audio.sox_effects.sox_effects.rst | 7 +
 ...ddlespeech.audio.streamdata.autodecode.rst | 7 +
 ...> paddlespeech.audio.streamdata.cache.rst} | 4 +-
 ... paddlespeech.audio.streamdata.compat.rst} | 4 +-
 ...espeech.audio.streamdata.extradatasets.rst | 7 +
 .../paddlespeech.audio.streamdata.filters.rst | 7 +
 ...> paddlespeech.audio.streamdata.gopen.rst} | 4 +-
 ...paddlespeech.audio.streamdata.handlers.rst | 7 +
 .../api/paddlespeech.audio.streamdata.mix.rst | 7 +
 ...lespeech.audio.streamdata.paddle_utils.rst | 7 +
 ...paddlespeech.audio.streamdata.pipeline.rst | 7 +
 .../api/paddlespeech.audio.streamdata.rst | 28 +
 ...ddlespeech.audio.streamdata.shardlists.rst | 7 +
 ...lespeech.audio.streamdata.tariterators.rst | 7 +
 .../paddlespeech.audio.streamdata.utils.rst | 7 +
 ... paddlespeech.audio.streamdata.writer.rst} | 4 +-
 docs/source/api/paddlespeech.audio.text.rst | 16 +
 ...addlespeech.audio.text.text_featurizer.rst | 7 +
 .../api/paddlespeech.audio.text.utility.rst | 7 +
 ...addlespeech.audio.transform.add_deltas.rst | 7 +
 ...peech.audio.transform.channel_selector.rst | 7 +
 .../api/paddlespeech.audio.transform.cmvn.rst | 7 +
 ...addlespeech.audio.transform.functional.rst | 7 +
 .../paddlespeech.audio.transform.perturb.rst | 7 +
 .../api/paddlespeech.audio.transform.rst | 24 +
 ...dlespeech.audio.transform.spec_augment.rst | 7 +
 ...ddlespeech.audio.transform.spectrogram.rst | 7 +
 ...ch.audio.transform.transform_interface.rst | 7 +
 ...espeech.audio.transform.transformation.rst | 7 +
 .../api/paddlespeech.audio.transform.wpe.rst | 7 +
 ...paddlespeech.audio.utils.check_kwargs.rst} | 4 +-
 ...addlespeech.audio.utils.dynamic_import.rst | 7 +
 docs/source/api/paddlespeech.audio.utils.rst | 4 +
 .../paddlespeech.audio.utils.sox_utils.rst | 7 +
 .../paddlespeech.audio.utils.tensor_utils.rst | 7 +
 .../api/paddlespeech.cls.exps.panns.rst | 4 +
 .../paddlespeech.kws.exps.mdtc.collate.rst | 7 +
 ...paddlespeech.kws.exps.mdtc.compute_det.rst | 7 +
 ...dlespeech.kws.exps.mdtc.plot_det_curve.rst | 7 +
 .../source/api/paddlespeech.kws.exps.mdtc.rst | 19 +
 .../api/paddlespeech.kws.exps.mdtc.score.rst | 7 +
 .../api/paddlespeech.kws.exps.mdtc.train.rst | 7 +
 docs/source/api/paddlespeech.kws.exps.rst | 15 +
 docs/source/api/paddlespeech.kws.rst | 1 +
 .../api/paddlespeech.resource.model_alias.rst | 7 +
 ...addlespeech.resource.pretrained_models.rst | 7 +
 .../api/paddlespeech.resource.resource.rst | 7 +
 docs/source/api/paddlespeech.resource.rst | 17 +
 docs/source/api/paddlespeech.rst | 10 +
 docs/source/api/paddlespeech.s2t.rst | 1 -
 docs/source/api/paddlespeech.server.utils.rst | 1 -
 docs/source/api/paddlespeech.t2s.datasets.rst | 1 +
 .../api/paddlespeech.t2s.datasets.sampler.rst | 7 +
 .../paddlespeech.t2s.exps.ernie_sat.align.rst | 7 +
 ...dlespeech.t2s.exps.ernie_sat.normalize.rst | 7 +
 ...lespeech.t2s.exps.ernie_sat.preprocess.rst | 7 +
 .../api/paddlespeech.t2s.exps.ernie_sat.rst | 21 +
 ...lespeech.t2s.exps.ernie_sat.synthesize.rst | 7 +
 ...eech.t2s.exps.ernie_sat.synthesize_e2e.rst | 7 +
 .../paddlespeech.t2s.exps.ernie_sat.train.rst | 7 +
 .../paddlespeech.t2s.exps.ernie_sat.utils.rst | 7 +
 .../api/paddlespeech.t2s.exps.fastspeech2.rst | 1 +
 ...espeech.t2s.exps.fastspeech2.vc2_infer.rst | 7 +
 docs/source/api/paddlespeech.t2s.exps.rst | 3 +
 .../paddlespeech.t2s.exps.stream_play_tts.rst | 7 +
 .../paddlespeech.t2s.exps.vits.normalize.rst | 7 +
 .../paddlespeech.t2s.exps.vits.preprocess.rst | 7 +
 .../source/api/paddlespeech.t2s.exps.vits.rst | 20 +
 .../paddlespeech.t2s.exps.vits.synthesize.rst | 7 +
 ...dlespeech.t2s.exps.vits.synthesize_e2e.rst | 7 +
 .../api/paddlespeech.t2s.exps.vits.train.rst | 7 +
 ...ddlespeech.t2s.exps.vits.voice_cloning.rst | 7 +
 ...paddlespeech.t2s.frontend.g2pw.dataset.rst | 7 +
 ...addlespeech.t2s.frontend.g2pw.onnx_api.rst | 7 +
 .../api/paddlespeech.t2s.frontend.g2pw.rst | 17 +
 .../paddlespeech.t2s.frontend.g2pw.utils.rst | 7 +
 ...paddlespeech.t2s.frontend.mix_frontend.rst | 7 +
 docs/source/api/paddlespeech.t2s.frontend.rst | 2 +
 ...espeech.t2s.models.ernie_sat.ernie_sat.rst | 7 +
 ...t2s.models.ernie_sat.ernie_sat_updater.rst | 7 +
 .../api/paddlespeech.t2s.models.ernie_sat.rst | 3 +-
 ...h.t2s.models.vits.monotonic_align.core.rst | 7 +
 ...speech.t2s.models.vits.monotonic_align.rst | 16 +
 ....t2s.models.vits.monotonic_align.setup.rst | 7 +
 .../api/paddlespeech.utils.dynamic_import.rst | 7 +
 docs/source/api/paddlespeech.utils.env.rst | 7 +
 docs/source/api/paddlespeech.utils.rst | 16 +
 docs/source/api/paddlespeech.version.rst | 7 +
 docs/source/cls/custom_dataset.md | 4 +-
 examples/esc50/cls0/conf/panns.yaml | 2 +-
 examples/hey_snips/kws0/conf/mdtc.yaml | 2 +-
 examples/voxceleb/sv0/local/data_prepare.py | 2 +-
 .../make_rirs_noise_csv_dataset_from_json.py | 2 +-
 .../local/make_vox_csv_dataset_from_json.py | 2 +-
 paddlespeech/audio/backends/no_backend.py | 32 -
 .../audio/backends/soundfile_backend.py | 662 ---------------
 paddlespeech/audio/compliance/__init__.py | 15 -
 paddlespeech/audio/compliance/kaldi.py | 638 --------------
 paddlespeech/audio/compliance/librosa.py | 788 ------------------
 paddlespeech/audio/datasets/__init__.py | 20 -
 paddlespeech/audio/datasets/dataset.py | 100 ---
 paddlespeech/audio/datasets/esc50.py | 152 ----
 paddlespeech/audio/datasets/gtzan.py | 115 ---
 paddlespeech/audio/datasets/hey_snips.py | 74 --
 paddlespeech/audio/datasets/rirs_noises.py | 200 -----
 paddlespeech/audio/datasets/tess.py | 126 ---
 paddlespeech/audio/datasets/urban_sound.py | 104 ---
 paddlespeech/audio/datasets/voxceleb.py | 355 --------
 paddlespeech/audio/features/__init__.py | 17 -
 paddlespeech/audio/features/layers.py | 328 --------
 paddlespeech/audio/functional/__init__.py | 20 -
 paddlespeech/audio/functional/functional.py | 266 ------
 paddlespeech/audio/functional/window.py | 337 --------
 paddlespeech/audio/io/__init__.py | 13 -
 paddlespeech/audio/metric/__init__.py | 15 -
 paddlespeech/audio/metric/eer.py | 100 ---
 paddlespeech/audio/streamdata/autodecode.py | 2 +-
 paddlespeech/audio/streamdata/tariterators.py | 4 +-
 paddlespeech/cli/cls/infer.py | 4 +-
 paddlespeech/cli/kws/infer.py | 4 +-
 paddlespeech/cli/vector/infer.py | 4 +-
 paddlespeech/cls/exps/panns/deploy/predict.py | 6 +-
 paddlespeech/cls/exps/panns/export_model.py | 2 +-
 paddlespeech/cls/exps/panns/predict.py | 6 +-
 paddlespeech/cls/exps/panns/train.py | 141 ++--
 paddlespeech/cls/models/panns/panns.py | 2 +-
 paddlespeech/kws/exps/mdtc/train.py | 4 +-
 .../frontend/featurizer/audio_featurizer.py | 2 +-
 paddlespeech/s2t/models/u2/u2.py | 6 +-
 paddlespeech/s2t/models/u2_st/u2_st.py | 4 +-
 .../engine/vector/python/vector_engine.py | 4 +-
 paddlespeech/server/util.py | 4 +-
 .../vector/exps/ecapa_tdnn/extract_emb.py | 4 +-
 paddlespeech/vector/exps/ecapa_tdnn/test.py | 2 +-
 paddlespeech/vector/exps/ecapa_tdnn/train.py | 2 +-
 paddlespeech/vector/io/dataset.py | 4 +-
 paddlespeech/vector/io/dataset_from_json.py | 6 +-
 tests/benchmark/audio/README.md | 38 -
 tests/benchmark/audio/log_melspectrogram.py | 125 ---
 tests/benchmark/audio/melspectrogram.py | 109 ---
 tests/benchmark/audio/mfcc.py | 123 ---
 .../unit/audio/backends/soundfile/__init__.py | 13 -
 tests/unit/audio/backends/soundfile/common.py | 57 --
 .../audio/backends/soundfile/info_test.py | 199 -----
 .../audio/backends/soundfile/load_test.py | 369 --------
 .../audio/backends/soundfile/save_test.py | 322 -------
 .../unit/audio/backends/soundfile/test_io.py | 73 --
 tests/unit/audio/features/base.py | 2 +-
 tests/unit/audio/features/test_istft.py | 2 +-
 tests/unit/audio/features/test_kaldi.py | 81 --
 tests/unit/audio/features/test_librosa.py | 281 ------
 .../audio/features/test_log_melspectrogram.py | 4 +-
 tests/unit/audio/features/test_spectrogram.py | 4 +-
 tests/unit/audio/features/test_stft.py | 2 +-
 206 files changed, 1440 insertions(+), 6533 deletions(-)
 create mode 100644 audio/docs/Makefile
 create mode 100644 audio/docs/README.md
 create mode 100644 audio/docs/images/paddle.png
 create mode 100644 audio/docs/make.bat
 create mode 100644 audio/docs/source/_static/custom.css
 create mode 100644 audio/docs/source/_templates/module.rst_t
 create mode 100644 audio/docs/source/_templates/package.rst_t
 create mode 100644 audio/docs/source/_templates/toc.rst_t
 create mode 100644 audio/docs/source/conf.py
 create mode 100644 audio/docs/source/index.rst
 create mode 100644 audio/docs/source/source/paddleaudio.backends.common.rst
 rename docs/source/api/paddlespeech.audio.datasets.tess.rst => audio/docs/source/source/paddleaudio.backends.no_backend.rst (51%)
 create mode 100644 audio/docs/source/source/paddleaudio.backends.rst
 create mode 100644 audio/docs/source/source/paddleaudio.backends.soundfile_backend.rst
 rename docs/source/api/paddlespeech.audio.compliance.librosa.rst => audio/docs/source/source/paddleaudio.backends.sox_io_backend.rst (50%)
 create mode 100644 audio/docs/source/source/paddleaudio.backends.utils.rst
 create mode 100644 audio/docs/source/source/paddleaudio.compliance.kaldi.rst
 create mode 100644 audio/docs/source/source/paddleaudio.compliance.librosa.rst
 create mode 100644 audio/docs/source/source/paddleaudio.compliance.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.dataset.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.esc50.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.gtzan.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.hey_snips.rst
 rename docs/source/api/paddlespeech.audio.datasets.gtzan.rst => audio/docs/source/source/paddleaudio.datasets.rirs_noises.rst (51%)
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.tess.rst
 rename docs/source/api/paddlespeech.audio.datasets.esc50.rst => audio/docs/source/source/paddleaudio.datasets.urban_sound.rst (51%)
 rename docs/source/api/paddlespeech.audio.metric.eer.rst => audio/docs/source/source/paddleaudio.datasets.voxceleb.rst (52%)
 create mode 100644 audio/docs/source/source/paddleaudio.features.layers.rst
 create mode 100644 audio/docs/source/source/paddleaudio.features.rst
 create mode 100644 audio/docs/source/source/paddleaudio.functional.functional.rst
 create mode 100644 audio/docs/source/source/paddleaudio.functional.rst
 create mode 100644 audio/docs/source/source/paddleaudio.functional.window.rst
 create mode 100644 audio/docs/source/source/paddleaudio.io.rst
 create mode 100644 audio/docs/source/source/paddleaudio.metric.eer.rst
 create mode 100644 audio/docs/source/source/paddleaudio.metric.rst
 create mode 100644 audio/docs/source/source/paddleaudio.rst
 create mode 100644 audio/docs/source/source/paddleaudio.sox_effects.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.backends.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.backends.sox_backend.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.compliance.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.datasets.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.datasets.urban_sound.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.functional.functional.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.functional.rst
 create mode 100644 docs/source/api/paddlespeech.audio.kaldi.kaldi.rst
 create mode 100644 docs/source/api/paddlespeech.audio.kaldi.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.metric.rst
 create mode 100644 docs/source/api/paddlespeech.audio.sox_effects.sox_effects.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.autodecode.rst
 rename docs/source/api/{paddlespeech.audio.datasets.dataset.rst => paddlespeech.audio.streamdata.cache.rst} (50%)
 rename docs/source/api/{paddlespeech.audio.datasets.voxceleb.rst => paddlespeech.audio.streamdata.compat.rst} (50%)
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.filters.rst
 rename docs/source/api/{paddlespeech.audio.compliance.kaldi.rst => paddlespeech.audio.streamdata.gopen.rst} (50%)
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.handlers.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.mix.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.pipeline.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.shardlists.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.tariterators.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.utils.rst
 rename docs/source/api/{paddlespeech.audio.functional.window.rst => paddlespeech.audio.streamdata.writer.rst} (50%)
 create mode 100644 docs/source/api/paddlespeech.audio.text.rst
 create mode 100644 docs/source/api/paddlespeech.audio.text.text_featurizer.rst
 create mode 100644 docs/source/api/paddlespeech.audio.text.utility.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.add_deltas.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.channel_selector.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.cmvn.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.functional.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.perturb.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.spec_augment.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.spectrogram.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.transform_interface.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.transformation.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.wpe.rst
 rename docs/source/api/{paddlespeech.audio.datasets.hey_snips.rst => paddlespeech.audio.utils.check_kwargs.rst} (50%)
 create mode 100644 docs/source/api/paddlespeech.audio.utils.dynamic_import.rst
 create mode 100644 docs/source/api/paddlespeech.audio.utils.sox_utils.rst
 create mode 100644 docs/source/api/paddlespeech.audio.utils.tensor_utils.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.score.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.train.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.rst
 create mode 100644 docs/source/api/paddlespeech.resource.model_alias.rst
 create mode 100644 docs/source/api/paddlespeech.resource.pretrained_models.rst
 create mode 100644 docs/source/api/paddlespeech.resource.resource.rst
 create mode 100644 docs/source/api/paddlespeech.resource.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.datasets.sampler.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.train.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
 create mode 100644 docs/source/api/paddlespeech.utils.dynamic_import.rst
 create mode 100644 docs/source/api/paddlespeech.utils.env.rst
 create mode 100644 docs/source/api/paddlespeech.utils.rst
 create mode 100644 docs/source/api/paddlespeech.version.rst
 delete mode 100644 paddlespeech/audio/backends/no_backend.py
 delete mode 100644 paddlespeech/audio/backends/soundfile_backend.py
 delete mode 100644 paddlespeech/audio/compliance/__init__.py
 delete mode 100644 paddlespeech/audio/compliance/kaldi.py
 delete mode 100644 paddlespeech/audio/compliance/librosa.py
 delete mode 100644 paddlespeech/audio/datasets/__init__.py
 delete mode 100644 paddlespeech/audio/datasets/dataset.py
 delete mode 100644 paddlespeech/audio/datasets/esc50.py
 delete mode 100644 paddlespeech/audio/datasets/gtzan.py
 delete mode 100644 paddlespeech/audio/datasets/hey_snips.py
 delete mode 100644 paddlespeech/audio/datasets/rirs_noises.py
 delete mode 100644 paddlespeech/audio/datasets/tess.py
 delete mode 100644 paddlespeech/audio/datasets/urban_sound.py
 delete mode 100644 paddlespeech/audio/datasets/voxceleb.py
 delete mode 100644 paddlespeech/audio/features/__init__.py
 delete mode 100644 paddlespeech/audio/features/layers.py
 delete mode 100644 paddlespeech/audio/functional/__init__.py
 delete mode 100644 paddlespeech/audio/functional/functional.py
 delete mode 100644 paddlespeech/audio/functional/window.py
 delete mode 100644 paddlespeech/audio/io/__init__.py
 delete mode 100644 paddlespeech/audio/metric/__init__.py
 delete mode 100644 paddlespeech/audio/metric/eer.py
 delete mode 100644 tests/benchmark/audio/README.md
 delete mode 100644 tests/benchmark/audio/log_melspectrogram.py
 delete mode 100644 tests/benchmark/audio/melspectrogram.py
 delete mode 100644 tests/benchmark/audio/mfcc.py
 delete mode 100644 tests/unit/audio/backends/soundfile/__init__.py
 delete mode 100644 tests/unit/audio/backends/soundfile/common.py
 delete mode 100644 tests/unit/audio/backends/soundfile/info_test.py
 delete mode 100644 tests/unit/audio/backends/soundfile/load_test.py
 delete mode 100644 tests/unit/audio/backends/soundfile/save_test.py
 delete mode 100644 tests/unit/audio/backends/soundfile/test_io.py
 delete mode 100644 tests/unit/audio/features/test_kaldi.py
 delete mode 100644 tests/unit/audio/features/test_librosa.py

diff --git a/audio/docs/Makefile b/audio/docs/Makefile
new file mode 100644
index 000000000..69fe55ecf
--- /dev/null
+++ b/audio/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/audio/docs/README.md b/audio/docs/README.md
new file mode 100644
index 000000000..20626f52b
--- /dev/null
+++ b/audio/docs/README.md
@@ -0,0 +1,24 @@
+# Build docs for PaddleAudio
+
+Execute the following steps in the **current directory**.
+
+## 1. Install
+
+`pip install Sphinx sphinx_rtd_theme`
+
+
+## 2. Generate API docs
+
+Generate API docs from docstrings.
+
+`sphinx-apidoc -fMeT -o source ../paddleaudio ../paddleaudio/utils --templatedir source/_templates`
+
+
+## 3. Build
+
+`sphinx-build source _html`
+
+
+## 4. Preview
+
+Open `_html/index.html` for page preview.
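The four README steps above can be chained into one script. What follows is a minimal sketch of that workflow, assuming it is run from `audio/docs/`; the `sphinx-apidoc` flags and the excluded `../paddleaudio/utils` path are taken verbatim from the README, while the final `python -m http.server` preview step is an optional convenience that the README does not prescribe.

```bash
#!/usr/bin/env bash
# Sketch: build the PaddleAudio docs end to end (run from audio/docs/).
set -e

# 1. Install the documentation toolchain.
pip install Sphinx sphinx_rtd_theme

# 2. Regenerate API pages from docstrings: -f overwrites stale pages,
#    -M lists modules before submodules, -e gives each module its own page,
#    -T skips the generated table-of-contents file. ../paddleaudio/utils is
#    excluded, and the custom templates in source/_templates are applied.
sphinx-apidoc -fMeT -o source ../paddleaudio ../paddleaudio/utils \
    --templatedir source/_templates

# 3. Render the HTML site into _html/.
sphinx-build source _html

# 4. (Optional) serve the result at http://localhost:8000/ for preview.
python -m http.server 8000 --directory _html
```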
diff --git a/audio/docs/images/paddle.png b/audio/docs/images/paddle.png
new file mode 100644
index 0000000000000000000000000000000000000000..bc1135abfab7aa48f29392da4bca614f688314af
GIT binary patch
literal 5043
[base85-encoded binary data for paddle.png omitted]
diff --git a/audio/docs/make.bat b/audio/docs/make.bat
new file mode 100644
--- /dev/null
+++ b/audio/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/audio/docs/source/_static/custom.css b/audio/docs/source/_static/custom.css
new file mode 100644
index 000000000..bb65c51a9
--- /dev/null
+++ b/audio/docs/source/_static/custom.css
@@ -0,0 +1,5 @@
+.wy-nav-content {
+    max-width: 80%;
+}
+.table table{ background:#b9b9b9}
+.table table td{ background:#FFF; }
diff --git a/audio/docs/source/_templates/module.rst_t b/audio/docs/source/_templates/module.rst_t
new file mode 100644
index 000000000..d9a50e6b9
--- /dev/null
+++ b/audio/docs/source/_templates/module.rst_t
@@ -0,0 +1,9 @@
+{%- if show_headings %}
+{{- basename | e | heading }}
+
+{% endif -%}
+.. automodule:: {{ qualname }}
+{%- for option in automodule_options %}
+   :{{ option }}:
+{%- endfor %}
+
diff --git a/audio/docs/source/_templates/package.rst_t b/audio/docs/source/_templates/package.rst_t
new file mode 100644
index 000000000..7239c11b7
--- /dev/null
+++ b/audio/docs/source/_templates/package.rst_t
@@ -0,0 +1,57 @@
+{%- macro automodule(modname, options) -%}
+.. automodule:: {{ modname }}
+{%- for option in options %}
+   :{{ option }}:
+{%- endfor %}
+{%- endmacro %}
+
+{%- macro toctree(docnames) -%}
+.. toctree::
+   :maxdepth: {{ maxdepth }}
+{% for docname in docnames %}
+   {{ docname }}
+{%- endfor %}
+{%- endmacro %}
+
+{%- if is_namespace %}
+{{- [pkgname, "namespace"] | join(" ") | e | heading }}
+{% else %}
+{{- pkgname | e | heading }}
+{% endif %}
+
+{%- if is_namespace %}
+.. py:module:: {{ pkgname }}
+{% endif %}
+
+{%- if modulefirst and not is_namespace %}
+{{ automodule(pkgname, automodule_options) }}
+{% endif %}
+
+{%- if subpackages %}
+Subpackages
+-----------
+
+{{ toctree(subpackages) }}
+{% endif %}
+
+{%- if submodules %}
+Submodules
+----------
+{% if separatemodules %}
+{{ toctree(submodules) }}
+{% else %}
+{%- for submodule in submodules %}
+{% if show_headings %}
+{{- submodule | e | heading(2) }}
+{% endif %}
+{{ automodule(submodule, automodule_options) }}
+{% endfor %}
+{%- endif %}
+{%- endif %}
+
+{%- if not modulefirst and not is_namespace %}
+Module contents
+---------------
+
+{{ automodule(pkgname, automodule_options) }}
+{% endif %}
diff --git a/audio/docs/source/_templates/toc.rst_t b/audio/docs/source/_templates/toc.rst_t
new file mode 100644
index 000000000..f0877eeb2
--- /dev/null
+++ b/audio/docs/source/_templates/toc.rst_t
@@ -0,0 +1,8 @@
+{{ header | heading }}
+
+.. toctree::
+   :maxdepth: {{ maxdepth }}
+{% for docname in docnames %}
+   {{ docname }}
+{%- endfor %}
+
diff --git a/audio/docs/source/conf.py b/audio/docs/source/conf.py
new file mode 100644
index 000000000..09c4f312f
--- /dev/null
+++ b/audio/docs/source/conf.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+# -- Path setup --------------------------------------------------------------
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../..'))
+
+# -- Project information -----------------------------------------------------
+
+project = 'PaddleAudio'
+copyright = '2022, PaddlePaddle'
+author = 'PaddlePaddle'
+
+# The short X.Y version
+version = ''
+# The full version, including alpha/beta/rc tags
+release = '0.2.0'
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.napoleon',
+]
+
+napoleon_google_docstring = True
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+
+import sphinx_rtd_theme
+html_theme = 'sphinx_rtd_theme'
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+smartquotes = False
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+html_logo = '../images/paddle.png'
+html_css_files = [
+    'custom.css',
+]
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself. Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'PaddleAudiodoc'
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'PaddleAudio.tex', 'PaddleAudio Documentation', 'PaddlePaddle',
+     'manual'),
+]
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, 'paddleaudio', 'PaddleAudio Documentation', [author],
+              1)]
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'PaddleAudio', 'PaddleAudio Documentation', author,
+     'PaddleAudio', 'One line description of project.', 'Miscellaneous'),
+]
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# -- Extension configuration -------------------------------------------------
+
+# -- Options for intersphinx extension ---------------------------------------
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {'https://docs.python.org/': None}
diff --git a/audio/docs/source/index.rst b/audio/docs/source/index.rst
new file mode 100644
index 000000000..26963308e
--- /dev/null
+++ b/audio/docs/source/index.rst
@@ -0,0 +1,22 @@
+.. PaddleAudio documentation master file, created by
+   sphinx-quickstart on Tue Mar 22 15:57:16 2022.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to PaddleAudio's documentation!
+=======================================
+
+.. toctree::
+   :maxdepth: 1
+
+   Index <self>
+
+
+API References
+--------------
+
+.. toctree::
+   :maxdepth: 2
+   :titlesonly:
+
+   paddleaudio
\ No newline at end of file
diff --git a/audio/docs/source/source/paddleaudio.backends.common.rst b/audio/docs/source/source/paddleaudio.backends.common.rst
new file mode 100644
index 000000000..c936645e6
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.backends.common.rst
@@ -0,0 +1,7 @@
+paddleaudio.backends.common module
+==================================
+
+.. automodule:: paddleaudio.backends.common
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.datasets.tess.rst b/audio/docs/source/source/paddleaudio.backends.no_backend.rst
similarity index 51%
rename from docs/source/api/paddlespeech.audio.datasets.tess.rst
rename to audio/docs/source/source/paddleaudio.backends.no_backend.rst
index d845e6d6a..bf01dab2e 100644
--- a/docs/source/api/paddlespeech.audio.datasets.tess.rst
+++ b/audio/docs/source/source/paddleaudio.backends.no_backend.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.datasets.tess module
+paddleaudio.backends.no\_backend module
 =======================================
 
-.. automodule:: paddlespeech.audio.datasets.tess
+.. automodule:: paddleaudio.backends.no_backend
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.backends.rst b/audio/docs/source/source/paddleaudio.backends.rst
new file mode 100644
index 000000000..79907dd2e
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.backends.rst
@@ -0,0 +1,19 @@
+paddleaudio.backends package
+============================
+
+.. automodule:: paddleaudio.backends
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.backends.common
+   paddleaudio.backends.no_backend
+   paddleaudio.backends.soundfile_backend
+   paddleaudio.backends.sox_io_backend
+   paddleaudio.backends.utils
diff --git a/audio/docs/source/source/paddleaudio.backends.soundfile_backend.rst b/audio/docs/source/source/paddleaudio.backends.soundfile_backend.rst
new file mode 100644
index 000000000..6146373cb
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.backends.soundfile_backend.rst
@@ -0,0 +1,7 @@
+paddleaudio.backends.soundfile\_backend module
+==============================================
+
+.. automodule:: paddleaudio.backends.soundfile_backend
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.compliance.librosa.rst b/audio/docs/source/source/paddleaudio.backends.sox_io_backend.rst
similarity index 50%
rename from docs/source/api/paddlespeech.audio.compliance.librosa.rst
rename to audio/docs/source/source/paddleaudio.backends.sox_io_backend.rst
index 85271bee4..04972706d 100644
--- a/docs/source/api/paddlespeech.audio.compliance.librosa.rst
+++ b/audio/docs/source/source/paddleaudio.backends.sox_io_backend.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.compliance.librosa module
+paddleaudio.backends.sox\_io\_backend module
 ============================================
 
-.. automodule:: paddlespeech.audio.compliance.librosa
+.. automodule:: paddleaudio.backends.sox_io_backend
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.backends.utils.rst b/audio/docs/source/source/paddleaudio.backends.utils.rst
new file mode 100644
index 000000000..c4cd5e1ed
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.backends.utils.rst
@@ -0,0 +1,7 @@
+paddleaudio.backends.utils module
+=================================
+
+.. automodule:: paddleaudio.backends.utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.compliance.kaldi.rst b/audio/docs/source/source/paddleaudio.compliance.kaldi.rst
new file mode 100644
index 000000000..81bb7d648
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.compliance.kaldi.rst
@@ -0,0 +1,7 @@
+paddleaudio.compliance.kaldi module
+===================================
+
+.. automodule:: paddleaudio.compliance.kaldi
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.compliance.librosa.rst b/audio/docs/source/source/paddleaudio.compliance.librosa.rst
new file mode 100644
index 000000000..553e4d3a5
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.compliance.librosa.rst
@@ -0,0 +1,7 @@
+paddleaudio.compliance.librosa module
+=====================================
+
+.. automodule:: paddleaudio.compliance.librosa
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.compliance.rst b/audio/docs/source/source/paddleaudio.compliance.rst
new file mode 100644
index 000000000..137599bb3
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.compliance.rst
@@ -0,0 +1,16 @@
+paddleaudio.compliance package
+==============================
+
+.. automodule:: paddleaudio.compliance
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.compliance.kaldi
+   paddleaudio.compliance.librosa
diff --git a/audio/docs/source/source/paddleaudio.datasets.dataset.rst b/audio/docs/source/source/paddleaudio.datasets.dataset.rst
new file mode 100644
index 000000000..ebf4ea18a
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.dataset.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.dataset module
+===================================
+
+.. automodule:: paddleaudio.datasets.dataset
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.datasets.esc50.rst b/audio/docs/source/source/paddleaudio.datasets.esc50.rst
new file mode 100644
index 000000000..2730fb919
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.esc50.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.esc50 module
+=================================
+
+.. automodule:: paddleaudio.datasets.esc50
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.datasets.gtzan.rst b/audio/docs/source/source/paddleaudio.datasets.gtzan.rst
new file mode 100644
index 000000000..da3600cb9
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.gtzan.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.gtzan module
+=================================
+
+.. automodule:: paddleaudio.datasets.gtzan
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.datasets.hey_snips.rst b/audio/docs/source/source/paddleaudio.datasets.hey_snips.rst
new file mode 100644
index 000000000..29da9fa88
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.hey_snips.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.hey\_snips module
+======================================
+
+.. automodule:: paddleaudio.datasets.hey_snips
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.datasets.gtzan.rst b/audio/docs/source/source/paddleaudio.datasets.rirs_noises.rst
similarity index 51%
rename from docs/source/api/paddlespeech.audio.datasets.gtzan.rst
rename to audio/docs/source/source/paddleaudio.datasets.rirs_noises.rst
index 47252e8d7..26f52346a 100644
--- a/docs/source/api/paddlespeech.audio.datasets.gtzan.rst
+++ b/audio/docs/source/source/paddleaudio.datasets.rirs_noises.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.datasets.gtzan module
+paddleaudio.datasets.rirs\_noises module
 ========================================
 
-.. automodule:: paddlespeech.audio.datasets.gtzan
+.. automodule:: paddleaudio.datasets.rirs_noises
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.datasets.rst b/audio/docs/source/source/paddleaudio.datasets.rst
new file mode 100644
index 000000000..7a0b6f3f7
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.rst
@@ -0,0 +1,22 @@
+paddleaudio.datasets package
+============================
+
+.. automodule:: paddleaudio.datasets
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.datasets.dataset
+   paddleaudio.datasets.esc50
+   paddleaudio.datasets.gtzan
+   paddleaudio.datasets.hey_snips
+   paddleaudio.datasets.rirs_noises
+   paddleaudio.datasets.tess
+   paddleaudio.datasets.urban_sound
+   paddleaudio.datasets.voxceleb
diff --git a/audio/docs/source/source/paddleaudio.datasets.tess.rst b/audio/docs/source/source/paddleaudio.datasets.tess.rst
new file mode 100644
index 000000000..7a4ad62a3
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.tess.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.tess module
+================================
+
+.. automodule:: paddleaudio.datasets.tess
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.datasets.esc50.rst b/audio/docs/source/source/paddleaudio.datasets.urban_sound.rst
similarity index 51%
rename from docs/source/api/paddlespeech.audio.datasets.esc50.rst
rename to audio/docs/source/source/paddleaudio.datasets.urban_sound.rst
index 80e4a4187..ee4ad47ec 100644
--- a/docs/source/api/paddlespeech.audio.datasets.esc50.rst
+++ b/audio/docs/source/source/paddleaudio.datasets.urban_sound.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.datasets.esc50 module
+paddleaudio.datasets.urban\_sound module
 ========================================
 
-.. automodule:: paddlespeech.audio.datasets.esc50
+.. automodule:: paddleaudio.datasets.urban_sound
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.metric.eer.rst b/audio/docs/source/source/paddleaudio.datasets.voxceleb.rst
similarity index 52%
rename from docs/source/api/paddlespeech.audio.metric.eer.rst
rename to audio/docs/source/source/paddleaudio.datasets.voxceleb.rst
index bbe881221..b8f903666 100644
--- a/docs/source/api/paddlespeech.audio.metric.eer.rst
+++ b/audio/docs/source/source/paddleaudio.datasets.voxceleb.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.metric.eer module
+paddleaudio.datasets.voxceleb module
 ====================================
 
-.. automodule:: paddlespeech.audio.metric.eer
+.. automodule:: paddleaudio.datasets.voxceleb
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.features.layers.rst b/audio/docs/source/source/paddleaudio.features.layers.rst
new file mode 100644
index 000000000..90833e0a8
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.features.layers.rst
@@ -0,0 +1,7 @@
+paddleaudio.features.layers module
+==================================
+
+.. automodule:: paddleaudio.features.layers
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.features.rst b/audio/docs/source/source/paddleaudio.features.rst
new file mode 100644
index 000000000..86ecb5c9c
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.features.rst
@@ -0,0 +1,15 @@
+paddleaudio.features package
+============================
+
+.. automodule:: paddleaudio.features
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.features.layers
diff --git a/audio/docs/source/source/paddleaudio.functional.functional.rst b/audio/docs/source/source/paddleaudio.functional.functional.rst
new file mode 100644
index 000000000..d1f72052d
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.functional.functional.rst
@@ -0,0 +1,7 @@
+paddleaudio.functional.functional module
+========================================
+
+.. automodule:: paddleaudio.functional.functional
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.functional.rst b/audio/docs/source/source/paddleaudio.functional.rst
new file mode 100644
index 000000000..be76de798
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.functional.rst
@@ -0,0 +1,16 @@
+paddleaudio.functional package
+==============================
+
+.. automodule:: paddleaudio.functional
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.functional.functional
+   paddleaudio.functional.window
diff --git a/audio/docs/source/source/paddleaudio.functional.window.rst b/audio/docs/source/source/paddleaudio.functional.window.rst
new file mode 100644
index 000000000..46d89f3fb
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.functional.window.rst
@@ -0,0 +1,7 @@
+paddleaudio.functional.window module
+====================================
+
+.. automodule:: paddleaudio.functional.window
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.io.rst b/audio/docs/source/source/paddleaudio.io.rst
new file mode 100644
index 000000000..9ef75f748
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.io.rst
@@ -0,0 +1,7 @@
+paddleaudio.io package
+======================
+
+.. automodule:: paddleaudio.io
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.metric.eer.rst b/audio/docs/source/source/paddleaudio.metric.eer.rst
new file mode 100644
index 000000000..e4b4f5f34
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.metric.eer.rst
@@ -0,0 +1,7 @@
+paddleaudio.metric.eer module
+=============================
+
+.. automodule:: paddleaudio.metric.eer
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.metric.rst b/audio/docs/source/source/paddleaudio.metric.rst
new file mode 100644
index 000000000..0074f0b5b
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.metric.rst
@@ -0,0 +1,15 @@
+paddleaudio.metric package
+==========================
+
+.. automodule:: paddleaudio.metric
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.metric.eer
diff --git a/audio/docs/source/source/paddleaudio.rst b/audio/docs/source/source/paddleaudio.rst
new file mode 100644
index 000000000..9defb2ea8
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.rst
@@ -0,0 +1,22 @@
+paddleaudio package
+===================
+
+.. automodule:: paddleaudio
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Subpackages
+-----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.backends
+   paddleaudio.compliance
+   paddleaudio.datasets
+   paddleaudio.features
+   paddleaudio.functional
+   paddleaudio.io
+   paddleaudio.metric
+   paddleaudio.sox_effects
diff --git a/audio/docs/source/source/paddleaudio.sox_effects.rst b/audio/docs/source/source/paddleaudio.sox_effects.rst
new file mode 100644
index 000000000..48cd27250
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.sox_effects.rst
@@ -0,0 +1,7 @@
+paddleaudio.sox\_effects package
+================================
+
+.. automodule:: paddleaudio.sox_effects
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.backends.rst b/docs/source/api/paddlespeech.audio.backends.rst
deleted file mode 100644
index e8917897e..000000000
--- a/docs/source/api/paddlespeech.audio.backends.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-paddlespeech.audio.backends package
-===================================
-
-.. automodule:: paddlespeech.audio.backends
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.audio.backends.soundfile_backend
-   paddlespeech.audio.backends.sox_backend
diff --git a/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst b/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst
deleted file mode 100644
index 5c4ef3881..000000000
--- a/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.backends.soundfile\_backend module
-=====================================================
-
-.. automodule:: paddlespeech.audio.backends.soundfile_backend
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.backends.sox_backend.rst b/docs/source/api/paddlespeech.audio.backends.sox_backend.rst
deleted file mode 100644
index a99c49de8..000000000
--- a/docs/source/api/paddlespeech.audio.backends.sox_backend.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.backends.sox\_backend module
-===============================================
-
-.. automodule:: paddlespeech.audio.backends.sox_backend
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.compliance.rst b/docs/source/api/paddlespeech.audio.compliance.rst
deleted file mode 100644
index 515d25e99..000000000
--- a/docs/source/api/paddlespeech.audio.compliance.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-paddlespeech.audio.compliance package
-=====================================
-
-.. automodule:: paddlespeech.audio.compliance
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.audio.compliance.kaldi
-   paddlespeech.audio.compliance.librosa
diff --git a/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst b/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst
deleted file mode 100644
index 3015ba9e4..000000000
--- a/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.datasets.rirs\_noises module
-===============================================
-
-.. automodule:: paddlespeech.audio.datasets.rirs_noises
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.datasets.rst b/docs/source/api/paddlespeech.audio.datasets.rst
deleted file mode 100644
index bfc313a70..000000000
--- a/docs/source/api/paddlespeech.audio.datasets.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-paddlespeech.audio.datasets package
-===================================
-
-.. automodule:: paddlespeech.audio.datasets
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.audio.datasets.dataset
-   paddlespeech.audio.datasets.esc50
-   paddlespeech.audio.datasets.gtzan
-   paddlespeech.audio.datasets.hey_snips
-   paddlespeech.audio.datasets.rirs_noises
-   paddlespeech.audio.datasets.tess
-   paddlespeech.audio.datasets.urban_sound
-   paddlespeech.audio.datasets.voxceleb
diff --git a/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst b/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst
deleted file mode 100644
index 4efa060a8..000000000
--- a/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.datasets.urban\_sound module
-===============================================
-
-.. automodule:: paddlespeech.audio.datasets.urban_sound
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.functional.functional.rst b/docs/source/api/paddlespeech.audio.functional.functional.rst
deleted file mode 100644
index 80cc5a5a4..000000000
--- a/docs/source/api/paddlespeech.audio.functional.functional.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.functional.functional module
-===============================================
-
-.. automodule:: paddlespeech.audio.functional.functional
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.functional.rst b/docs/source/api/paddlespeech.audio.functional.rst
deleted file mode 100644
index 4e979dd9a..000000000
--- a/docs/source/api/paddlespeech.audio.functional.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-paddlespeech.audio.functional package
-=====================================
-
-.. automodule:: paddlespeech.audio.functional
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
toctree:: - :maxdepth: 4 - - paddlespeech.audio.functional.functional - paddlespeech.audio.functional.window diff --git a/docs/source/api/paddlespeech.audio.kaldi.kaldi.rst b/docs/source/api/paddlespeech.audio.kaldi.kaldi.rst new file mode 100644 index 000000000..1c41ac84d --- /dev/null +++ b/docs/source/api/paddlespeech.audio.kaldi.kaldi.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.kaldi.kaldi module +===================================== + +.. automodule:: paddlespeech.audio.kaldi.kaldi + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.kaldi.rst b/docs/source/api/paddlespeech.audio.kaldi.rst new file mode 100644 index 000000000..15d26a74c --- /dev/null +++ b/docs/source/api/paddlespeech.audio.kaldi.rst @@ -0,0 +1,15 @@ +paddlespeech.audio.kaldi package +================================ + +.. automodule:: paddlespeech.audio.kaldi + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.kaldi.kaldi diff --git a/docs/source/api/paddlespeech.audio.metric.rst b/docs/source/api/paddlespeech.audio.metric.rst deleted file mode 100644 index a6d411dd6..000000000 --- a/docs/source/api/paddlespeech.audio.metric.rst +++ /dev/null @@ -1,15 +0,0 @@ -paddlespeech.audio.metric package -================================= - -.. automodule:: paddlespeech.audio.metric - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -.. toctree:: - :maxdepth: 4 - - paddlespeech.audio.metric.eer diff --git a/docs/source/api/paddlespeech.audio.rst b/docs/source/api/paddlespeech.audio.rst index 5a3867f96..368ffda94 100644 --- a/docs/source/api/paddlespeech.audio.rst +++ b/docs/source/api/paddlespeech.audio.rst @@ -12,12 +12,13 @@ Subpackages .. toctree:: :maxdepth: 4 - paddlespeech.audio.backends - paddlespeech.audio.compliance - paddlespeech.audio.datasets paddlespeech.audio.features paddlespeech.audio.functional paddlespeech.audio.io + paddlespeech.audio.kaldi paddlespeech.audio.metric paddlespeech.audio.sox_effects + paddlespeech.audio.streamdata + paddlespeech.audio.text + paddlespeech.audio.transform paddlespeech.audio.utils diff --git a/docs/source/api/paddlespeech.audio.sox_effects.rst b/docs/source/api/paddlespeech.audio.sox_effects.rst index 75f991a16..186b9738f 100644 --- a/docs/source/api/paddlespeech.audio.sox_effects.rst +++ b/docs/source/api/paddlespeech.audio.sox_effects.rst @@ -5,3 +5,11 @@ paddlespeech.audio.sox\_effects package :members: :undoc-members: :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.sox_effects.sox_effects diff --git a/docs/source/api/paddlespeech.audio.sox_effects.sox_effects.rst b/docs/source/api/paddlespeech.audio.sox_effects.sox_effects.rst new file mode 100644 index 000000000..8232b4391 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.sox_effects.sox_effects.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.sox\_effects.sox\_effects module +=================================================== + +.. automodule:: paddlespeech.audio.sox_effects.sox_effects + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst new file mode 100644 index 000000000..1e45c1373 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.autodecode module +=============================================== + +.. 
automodule:: paddlespeech.audio.streamdata.autodecode + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.datasets.dataset.rst b/docs/source/api/paddlespeech.audio.streamdata.cache.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.datasets.dataset.rst rename to docs/source/api/paddlespeech.audio.streamdata.cache.rst index 41243fb73..393055e54 100644 --- a/docs/source/api/paddlespeech.audio.datasets.dataset.rst +++ b/docs/source/api/paddlespeech.audio.streamdata.cache.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.datasets.dataset module +paddlespeech.audio.streamdata.cache module ========================================== -.. automodule:: paddlespeech.audio.datasets.dataset +.. automodule:: paddlespeech.audio.streamdata.cache :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.datasets.voxceleb.rst b/docs/source/api/paddlespeech.audio.streamdata.compat.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.datasets.voxceleb.rst rename to docs/source/api/paddlespeech.audio.streamdata.compat.rst index 179053dcd..760695b20 100644 --- a/docs/source/api/paddlespeech.audio.datasets.voxceleb.rst +++ b/docs/source/api/paddlespeech.audio.streamdata.compat.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.datasets.voxceleb module +paddlespeech.audio.streamdata.compat module =========================================== -.. automodule:: paddlespeech.audio.datasets.voxceleb +.. automodule:: paddlespeech.audio.streamdata.compat :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst new file mode 100644 index 000000000..74628e963 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.extradatasets module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.extradatasets + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.filters.rst b/docs/source/api/paddlespeech.audio.streamdata.filters.rst new file mode 100644 index 000000000..d26104279 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.filters.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.filters module +============================================ + +.. automodule:: paddlespeech.audio.streamdata.filters + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.compliance.kaldi.rst b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.compliance.kaldi.rst rename to docs/source/api/paddlespeech.audio.streamdata.gopen.rst index f1459cf1a..1cccb7763 100644 --- a/docs/source/api/paddlespeech.audio.compliance.kaldi.rst +++ b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.compliance.kaldi module +paddlespeech.audio.streamdata.gopen module ========================================== -.. automodule:: paddlespeech.audio.compliance.kaldi +.. 
automodule:: paddlespeech.audio.streamdata.gopen :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.handlers.rst b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst new file mode 100644 index 000000000..7a4b3ce8e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.handlers module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.handlers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.mix.rst b/docs/source/api/paddlespeech.audio.streamdata.mix.rst new file mode 100644 index 000000000..908b35dd1 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.mix.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.mix module +======================================== + +.. automodule:: paddlespeech.audio.streamdata.mix + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst new file mode 100644 index 000000000..203343004 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.paddle\_utils module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.paddle_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst new file mode 100644 index 000000000..ae05fbecc --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.pipeline module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.rst b/docs/source/api/paddlespeech.audio.streamdata.rst new file mode 100644 index 000000000..a1f4560a3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.rst @@ -0,0 +1,28 @@ +paddlespeech.audio.streamdata package +===================================== + +.. automodule:: paddlespeech.audio.streamdata + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.streamdata.autodecode + paddlespeech.audio.streamdata.cache + paddlespeech.audio.streamdata.compat + paddlespeech.audio.streamdata.extradatasets + paddlespeech.audio.streamdata.filters + paddlespeech.audio.streamdata.gopen + paddlespeech.audio.streamdata.handlers + paddlespeech.audio.streamdata.mix + paddlespeech.audio.streamdata.paddle_utils + paddlespeech.audio.streamdata.pipeline + paddlespeech.audio.streamdata.shardlists + paddlespeech.audio.streamdata.tariterators + paddlespeech.audio.streamdata.utils + paddlespeech.audio.streamdata.writer diff --git a/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst new file mode 100644 index 000000000..ec1fe8236 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.shardlists module +=============================================== + +.. 
automodule:: paddlespeech.audio.streamdata.shardlists + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst new file mode 100644 index 000000000..b003b2d42 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.tariterators module +================================================= + +.. automodule:: paddlespeech.audio.streamdata.tariterators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.utils.rst b/docs/source/api/paddlespeech.audio.streamdata.utils.rst new file mode 100644 index 000000000..f248b1131 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.utils module +========================================== + +.. automodule:: paddlespeech.audio.streamdata.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.functional.window.rst b/docs/source/api/paddlespeech.audio.streamdata.writer.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.functional.window.rst rename to docs/source/api/paddlespeech.audio.streamdata.writer.rst index 347762751..7437241f3 100644 --- a/docs/source/api/paddlespeech.audio.functional.window.rst +++ b/docs/source/api/paddlespeech.audio.streamdata.writer.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.functional.window module +paddlespeech.audio.streamdata.writer module =========================================== -.. automodule:: paddlespeech.audio.functional.window +.. automodule:: paddlespeech.audio.streamdata.writer :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.rst b/docs/source/api/paddlespeech.audio.text.rst new file mode 100644 index 000000000..a2018050a --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.rst @@ -0,0 +1,16 @@ +paddlespeech.audio.text package +=============================== + +.. automodule:: paddlespeech.audio.text + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.text.text_featurizer + paddlespeech.audio.text.utility diff --git a/docs/source/api/paddlespeech.audio.text.text_featurizer.rst b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst new file mode 100644 index 000000000..1a8262d08 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.text\_featurizer module +=============================================== + +.. automodule:: paddlespeech.audio.text.text_featurizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.utility.rst b/docs/source/api/paddlespeech.audio.text.utility.rst new file mode 100644 index 000000000..90fcb25f6 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.utility.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.utility module +====================================== + +.. 
automodule:: paddlespeech.audio.text.utility + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.add_deltas.rst b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst new file mode 100644 index 000000000..b4b596d6e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.add\_deltas module +=============================================== + +.. automodule:: paddlespeech.audio.transform.add_deltas + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.channel_selector.rst b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst new file mode 100644 index 000000000..4828b5904 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.channel\_selector module +===================================================== + +.. automodule:: paddlespeech.audio.transform.channel_selector + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.cmvn.rst b/docs/source/api/paddlespeech.audio.transform.cmvn.rst new file mode 100644 index 000000000..44655a1e4 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.cmvn.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.cmvn module +======================================== + +.. automodule:: paddlespeech.audio.transform.cmvn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.functional.rst b/docs/source/api/paddlespeech.audio.transform.functional.rst new file mode 100644 index 000000000..7877d2495 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.functional.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.functional module +============================================== + +.. automodule:: paddlespeech.audio.transform.functional + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.perturb.rst b/docs/source/api/paddlespeech.audio.transform.perturb.rst new file mode 100644 index 000000000..e3615a5d1 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.perturb.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.perturb module +=========================================== + +.. automodule:: paddlespeech.audio.transform.perturb + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.rst b/docs/source/api/paddlespeech.audio.transform.rst new file mode 100644 index 000000000..47a7303b3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.rst @@ -0,0 +1,24 @@ +paddlespeech.audio.transform package +==================================== + +.. automodule:: paddlespeech.audio.transform + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + paddlespeech.audio.transform.add_deltas + paddlespeech.audio.transform.channel_selector + paddlespeech.audio.transform.cmvn + paddlespeech.audio.transform.functional + paddlespeech.audio.transform.perturb + paddlespeech.audio.transform.spec_augment + paddlespeech.audio.transform.spectrogram + paddlespeech.audio.transform.transform_interface + paddlespeech.audio.transform.transformation + paddlespeech.audio.transform.wpe diff --git a/docs/source/api/paddlespeech.audio.transform.spec_augment.rst b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst new file mode 100644 index 000000000..f11a32241 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spec\_augment module +================================================= + +.. automodule:: paddlespeech.audio.transform.spec_augment + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.spectrogram.rst b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst new file mode 100644 index 000000000..6be0c32ee --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spectrogram module +=============================================== + +.. automodule:: paddlespeech.audio.transform.spectrogram + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transform_interface.rst b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst new file mode 100644 index 000000000..ec8b20857 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transform\_interface module +======================================================== + +.. automodule:: paddlespeech.audio.transform.transform_interface + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transformation.rst b/docs/source/api/paddlespeech.audio.transform.transformation.rst new file mode 100644 index 000000000..94629b9af --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transformation.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transformation module +================================================== + +.. automodule:: paddlespeech.audio.transform.transformation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.wpe.rst b/docs/source/api/paddlespeech.audio.transform.wpe.rst new file mode 100644 index 000000000..85c758114 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.wpe.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.wpe module +======================================= + +.. automodule:: paddlespeech.audio.transform.wpe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.datasets.hey_snips.rst b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.datasets.hey_snips.rst rename to docs/source/api/paddlespeech.audio.utils.check_kwargs.rst index ce08b7003..a18f27e65 100644 --- a/docs/source/api/paddlespeech.audio.datasets.hey_snips.rst +++ b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.datasets.hey\_snips module +paddlespeech.audio.utils.check\_kwargs module ============================================= -.. 
automodule:: paddlespeech.audio.datasets.hey_snips +.. automodule:: paddlespeech.audio.utils.check_kwargs :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst new file mode 100644 index 000000000..5d060ee15 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.dynamic\_import module +=============================================== + +.. automodule:: paddlespeech.audio.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.rst b/docs/source/api/paddlespeech.audio.utils.rst index db15927da..0f1150ff3 100644 --- a/docs/source/api/paddlespeech.audio.utils.rst +++ b/docs/source/api/paddlespeech.audio.utils.rst @@ -12,8 +12,12 @@ Submodules .. toctree:: :maxdepth: 4 + paddlespeech.audio.utils.check_kwargs paddlespeech.audio.utils.download + paddlespeech.audio.utils.dynamic_import paddlespeech.audio.utils.error paddlespeech.audio.utils.log paddlespeech.audio.utils.numeric + paddlespeech.audio.utils.sox_utils + paddlespeech.audio.utils.tensor_utils paddlespeech.audio.utils.time diff --git a/docs/source/api/paddlespeech.audio.utils.sox_utils.rst b/docs/source/api/paddlespeech.audio.utils.sox_utils.rst new file mode 100644 index 000000000..6fd60b7c3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.sox_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.sox\_utils module +========================================== + +.. automodule:: paddlespeech.audio.utils.sox_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst new file mode 100644 index 000000000..93a1f70eb --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.tensor\_utils module +============================================= + +.. automodule:: paddlespeech.audio.utils.tensor_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.cls.exps.panns.rst b/docs/source/api/paddlespeech.cls.exps.panns.rst index 6147b245e..da4cfe897 100644 --- a/docs/source/api/paddlespeech.cls.exps.panns.rst +++ b/docs/source/api/paddlespeech.cls.exps.panns.rst @@ -21,5 +21,9 @@ Submodules :maxdepth: 4 paddlespeech.cls.exps.panns.export_model + paddlespeech.cls.exps.panns.panns paddlespeech.cls.exps.panns.predict paddlespeech.cls.exps.panns.train + paddlespeech.cls.exps.panns.u2 + paddlespeech.cls.exps.panns.u2_st + paddlespeech.cls.exps.panns.util diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst new file mode 100644 index 000000000..b533e8c42 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.collate module +========================================= + +.. 
automodule:: paddlespeech.kws.exps.mdtc.collate + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst new file mode 100644 index 000000000..45e094555 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.compute\_det module +============================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.compute_det + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst new file mode 100644 index 000000000..46a149b0b --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.plot\_det\_curve module +================================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.rst new file mode 100644 index 000000000..f6cad64e3 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.rst @@ -0,0 +1,19 @@ +paddlespeech.kws.exps.mdtc package +================================== + +.. automodule:: paddlespeech.kws.exps.mdtc + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc.collate + paddlespeech.kws.exps.mdtc.compute_det + paddlespeech.kws.exps.mdtc.plot_det_curve + paddlespeech.kws.exps.mdtc.score + paddlespeech.kws.exps.mdtc.train diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst new file mode 100644 index 000000000..aa956b4cb --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.score module +======================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.score + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst new file mode 100644 index 000000000..5e4ca401a --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.train module +======================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.rst b/docs/source/api/paddlespeech.kws.exps.rst new file mode 100644 index 000000000..bf10d2c9f --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.rst @@ -0,0 +1,15 @@ +paddlespeech.kws.exps package +============================= + +.. automodule:: paddlespeech.kws.exps + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc diff --git a/docs/source/api/paddlespeech.kws.rst b/docs/source/api/paddlespeech.kws.rst index c2829a42e..d21d094c7 100644 --- a/docs/source/api/paddlespeech.kws.rst +++ b/docs/source/api/paddlespeech.kws.rst @@ -12,4 +12,5 @@ Subpackages .. 
toctree:: :maxdepth: 4 + paddlespeech.kws.exps paddlespeech.kws.models diff --git a/docs/source/api/paddlespeech.resource.model_alias.rst b/docs/source/api/paddlespeech.resource.model_alias.rst new file mode 100644 index 000000000..b78e643ac --- /dev/null +++ b/docs/source/api/paddlespeech.resource.model_alias.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.model\_alias module +========================================= + +.. automodule:: paddlespeech.resource.model_alias + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.pretrained_models.rst b/docs/source/api/paddlespeech.resource.pretrained_models.rst new file mode 100644 index 000000000..a02061693 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.pretrained_models.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.pretrained\_models module +=============================================== + +.. automodule:: paddlespeech.resource.pretrained_models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.resource.rst b/docs/source/api/paddlespeech.resource.resource.rst new file mode 100644 index 000000000..8b51eda3c --- /dev/null +++ b/docs/source/api/paddlespeech.resource.resource.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.resource module +===================================== + +.. automodule:: paddlespeech.resource.resource + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.rst b/docs/source/api/paddlespeech.resource.rst new file mode 100644 index 000000000..61fdd5317 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.rst @@ -0,0 +1,17 @@ +paddlespeech.resource package +============================= + +.. automodule:: paddlespeech.resource + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.resource.model_alias + paddlespeech.resource.pretrained_models + paddlespeech.resource.resource diff --git a/docs/source/api/paddlespeech.rst b/docs/source/api/paddlespeech.rst index e7a01bf76..70b93ca08 100644 --- a/docs/source/api/paddlespeech.rst +++ b/docs/source/api/paddlespeech.rst @@ -16,8 +16,18 @@ Subpackages paddlespeech.cli paddlespeech.cls paddlespeech.kws + paddlespeech.resource paddlespeech.s2t paddlespeech.server paddlespeech.t2s paddlespeech.text + paddlespeech.utils paddlespeech.vector + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + paddlespeech.version diff --git a/docs/source/api/paddlespeech.s2t.rst b/docs/source/api/paddlespeech.s2t.rst index 4be22cb87..be9ef52f5 100644 --- a/docs/source/api/paddlespeech.s2t.rst +++ b/docs/source/api/paddlespeech.s2t.rst @@ -19,5 +19,4 @@ Subpackages paddlespeech.s2t.models paddlespeech.s2t.modules paddlespeech.s2t.training - paddlespeech.s2t.transform paddlespeech.s2t.utils diff --git a/docs/source/api/paddlespeech.server.utils.rst b/docs/source/api/paddlespeech.server.utils.rst index 9d1166392..b4051aee3 100644 --- a/docs/source/api/paddlespeech.server.utils.rst +++ b/docs/source/api/paddlespeech.server.utils.rst @@ -18,7 +18,6 @@ Submodules paddlespeech.server.utils.config paddlespeech.server.utils.errors paddlespeech.server.utils.exception - paddlespeech.server.utils.log paddlespeech.server.utils.onnx_infer paddlespeech.server.utils.paddle_predictor paddlespeech.server.utils.util diff --git a/docs/source/api/paddlespeech.t2s.datasets.rst b/docs/source/api/paddlespeech.t2s.datasets.rst index b40eb2bf1..dfbdb0b47 100644 --- a/docs/source/api/paddlespeech.t2s.datasets.rst +++ b/docs/source/api/paddlespeech.t2s.datasets.rst @@ -19,4 +19,5 @@ Submodules paddlespeech.t2s.datasets.get_feats paddlespeech.t2s.datasets.ljspeech paddlespeech.t2s.datasets.preprocess_utils + paddlespeech.t2s.datasets.sampler paddlespeech.t2s.datasets.vocoder_batch_fn diff --git a/docs/source/api/paddlespeech.t2s.datasets.sampler.rst b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst new file mode 100644 index 000000000..ed29c28d7 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.datasets.sampler module +======================================== + +.. automodule:: paddlespeech.t2s.datasets.sampler + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst new file mode 100644 index 000000000..a5e07aace --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.align module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.align + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst new file mode 100644 index 000000000..3771311cb --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.normalize module +================================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst new file mode 100644 index 000000000..8d4c24ffe --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.preprocess module +================================================== + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst new file mode 100644 index 000000000..a61158420 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst @@ -0,0 +1,21 @@ +paddlespeech.t2s.exps.ernie\_sat package +======================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.ernie_sat.align + paddlespeech.t2s.exps.ernie_sat.normalize + paddlespeech.t2s.exps.ernie_sat.preprocess + paddlespeech.t2s.exps.ernie_sat.synthesize + paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + paddlespeech.t2s.exps.ernie_sat.train + paddlespeech.t2s.exps.ernie_sat.utils diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst new file mode 100644 index 000000000..ecda2a513 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize module +================================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst new file mode 100644 index 000000000..00fc44952 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module +======================================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst new file mode 100644 index 000000000..ba9a33344 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.train module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst new file mode 100644 index 000000000..a2dd26c38 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.utils module +============================================= + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst index 3c98aa882..fad1fd87f 100644 --- a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst @@ -16,3 +16,4 @@ Submodules paddlespeech.t2s.exps.fastspeech2.normalize paddlespeech.t2s.exps.fastspeech2.preprocess paddlespeech.t2s.exps.fastspeech2.train + paddlespeech.t2s.exps.fastspeech2.vc2_infer diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst new file mode 100644 index 000000000..70a9d6e15 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.fastspeech2.vc2\_infer module +=================================================== + +.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.rst b/docs/source/api/paddlespeech.t2s.exps.rst index a688435eb..bee18a972 100644 --- a/docs/source/api/paddlespeech.t2s.exps.rst +++ b/docs/source/api/paddlespeech.t2s.exps.rst @@ -12,11 +12,13 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.exps.ernie_sat paddlespeech.t2s.exps.fastspeech2 paddlespeech.t2s.exps.gan_vocoder paddlespeech.t2s.exps.speedyspeech paddlespeech.t2s.exps.tacotron2 paddlespeech.t2s.exps.transformer_tts + paddlespeech.t2s.exps.vits paddlespeech.t2s.exps.waveflow paddlespeech.t2s.exps.wavernn @@ -31,6 +33,7 @@ Submodules paddlespeech.t2s.exps.ort_predict paddlespeech.t2s.exps.ort_predict_e2e paddlespeech.t2s.exps.ort_predict_streaming + paddlespeech.t2s.exps.stream_play_tts paddlespeech.t2s.exps.syn_utils paddlespeech.t2s.exps.synthesize paddlespeech.t2s.exps.synthesize_e2e diff --git a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst new file mode 100644 index 000000000..cb22dde0c --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.stream\_play\_tts module +============================================== + +.. automodule:: paddlespeech.t2s.exps.stream_play_tts + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst new file mode 100644 index 000000000..c5606f998 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.normalize module +=========================================== + +.. automodule:: paddlespeech.t2s.exps.vits.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst new file mode 100644 index 000000000..50633c621 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.preprocess module +============================================ + +.. 
automodule:: paddlespeech.t2s.exps.vits.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.rst b/docs/source/api/paddlespeech.t2s.exps.vits.rst new file mode 100644 index 000000000..51a9418d5 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.rst @@ -0,0 +1,20 @@ +paddlespeech.t2s.exps.vits package +================================== + +.. automodule:: paddlespeech.t2s.exps.vits + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.vits.normalize + paddlespeech.t2s.exps.vits.preprocess + paddlespeech.t2s.exps.vits.synthesize + paddlespeech.t2s.exps.vits.synthesize_e2e + paddlespeech.t2s.exps.vits.train + paddlespeech.t2s.exps.vits.voice_cloning diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst new file mode 100644 index 000000000..4b22d069a --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize module +============================================ + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst new file mode 100644 index 000000000..053ddfc83 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize\_e2e module +================================================= + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.train.rst b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst new file mode 100644 index 000000000..31bd3a48f --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.train module +======================================= + +.. automodule:: paddlespeech.t2s.exps.vits.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst new file mode 100644 index 000000000..d9be0f310 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.voice\_cloning module +================================================ + +.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst new file mode 100644 index 000000000..1635ec284 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.dataset module +============================================= + +.. 
automodule:: paddlespeech.t2s.frontend.g2pw.dataset + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst new file mode 100644 index 000000000..b7d549070 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.onnx\_api module +=============================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst new file mode 100644 index 000000000..10a118b76 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst @@ -0,0 +1,17 @@ +paddlespeech.t2s.frontend.g2pw package +====================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.frontend.g2pw.dataset + paddlespeech.t2s.frontend.g2pw.onnx_api + paddlespeech.t2s.frontend.g2pw.utils diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst new file mode 100644 index 000000000..ce9428037 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.utils module +=========================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst new file mode 100644 index 000000000..4505dddba --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.mix\_frontend module +============================================== + +.. automodule:: paddlespeech.t2s.frontend.mix_frontend + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.rst index 8fbf1e6eb..b61068616 100644 --- a/docs/source/api/paddlespeech.t2s.frontend.rst +++ b/docs/source/api/paddlespeech.t2s.frontend.rst @@ -12,6 +12,7 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.frontend.g2pw paddlespeech.t2s.frontend.normalizer paddlespeech.t2s.frontend.zh_normalization @@ -23,6 +24,7 @@ Submodules paddlespeech.t2s.frontend.arpabet paddlespeech.t2s.frontend.generate_lexicon + paddlespeech.t2s.frontend.mix_frontend paddlespeech.t2s.frontend.phonectic paddlespeech.t2s.frontend.punctuation paddlespeech.t2s.frontend.tone_sandhi diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst new file mode 100644 index 000000000..fce5a83cc --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat module +==================================================== + +.. 
automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst new file mode 100644 index 000000000..8a697d6cf --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module +============================================================= + +.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst index 680a85dea..aff7489c7 100644 --- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst @@ -12,4 +12,5 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.t2s.models.ernie_sat.mlm + paddlespeech.t2s.models.ernie_sat.ernie_sat + paddlespeech.t2s.models.ernie_sat.ernie_sat_updater diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst new file mode 100644 index 000000000..7aaba7952 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.core module +========================================================= + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst new file mode 100644 index 000000000..25c819a7e --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst @@ -0,0 +1,16 @@ +paddlespeech.t2s.models.vits.monotonic\_align package +===================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.models.vits.monotonic_align.core + paddlespeech.t2s.models.vits.monotonic_align.setup diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst new file mode 100644 index 000000000..a93c3b8bf --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.setup module +========================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.dynamic_import.rst b/docs/source/api/paddlespeech.utils.dynamic_import.rst new file mode 100644 index 000000000..daa4e6e78 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.dynamic\_import module +========================================= + +.. 
automodule:: paddlespeech.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.env.rst b/docs/source/api/paddlespeech.utils.env.rst new file mode 100644 index 000000000..e51278f82 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.env.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.env module +============================= + +.. automodule:: paddlespeech.utils.env + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.rst b/docs/source/api/paddlespeech.utils.rst new file mode 100644 index 000000000..3d47626bb --- /dev/null +++ b/docs/source/api/paddlespeech.utils.rst @@ -0,0 +1,16 @@ +paddlespeech.utils package +========================== + +.. automodule:: paddlespeech.utils + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.utils.dynamic_import + paddlespeech.utils.env diff --git a/docs/source/api/paddlespeech.version.rst b/docs/source/api/paddlespeech.version.rst new file mode 100644 index 000000000..707c5f886 --- /dev/null +++ b/docs/source/api/paddlespeech.version.rst @@ -0,0 +1,7 @@ +paddlespeech.version module +=========================== + +.. automodule:: paddlespeech.version + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/cls/custom_dataset.md b/docs/source/cls/custom_dataset.md index e39dcf12d..0b9b050b0 100644 --- a/docs/source/cls/custom_dataset.md +++ b/docs/source/cls/custom_dataset.md @@ -14,7 +14,7 @@ Assuming you have some wave files that stored in your own directory. You should Here is an example to build your custom dataset in `custom_dataset.py`: ```python -from paddlespeech.audio.datasets.dataset import AudioClassificationDataset +from paddleaudio.datasets.dataset import AudioClassificationDataset class CustomDataset(AudioClassificationDataset): meta_file = '/PATH/TO/META_FILE.txt' @@ -48,7 +48,7 @@ class CustomDataset(AudioClassificationDataset): Then you can build dataset and data loader from `CustomDataset`: ```python import paddle -from paddlespeech.audio.features import LogMelSpectrogram +from paddleaudio.features import LogMelSpectrogram from custom_dataset import CustomDataset diff --git a/examples/esc50/cls0/conf/panns.yaml b/examples/esc50/cls0/conf/panns.yaml index 1f0323f0d..3a9d42aa5 100644 --- a/examples/esc50/cls0/conf/panns.yaml +++ b/examples/esc50/cls0/conf/panns.yaml @@ -1,5 +1,5 @@ data: - dataset: 'paddlespeech.audio.datasets:ESC50' + dataset: 'paddleaudio.datasets:ESC50' num_classes: 50 train: mode: 'train' diff --git a/examples/hey_snips/kws0/conf/mdtc.yaml b/examples/hey_snips/kws0/conf/mdtc.yaml index 54d059472..857d36d46 100644 --- a/examples/hey_snips/kws0/conf/mdtc.yaml +++ b/examples/hey_snips/kws0/conf/mdtc.yaml @@ -2,7 +2,7 @@ ########################################### # Data # ########################################### -dataset: 'paddlespeech.audio.datasets:HeySnips' +dataset: 'paddleaudio.datasets:HeySnips' data_dir: '../tests/hey_snips_research_6k_en_train_eval_clean_ter' ############################################ diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py index e5a5dff7b..03d054004 100644 --- a/examples/voxceleb/sv0/local/data_prepare.py +++ b/examples/voxceleb/sv0/local/data_prepare.py @@ -16,7 +16,7 @@ import argparse import paddle from yacs.config import CfgNode -from paddlespeech.audio.datasets.voxceleb import VoxCeleb +from paddleaudio.datasets.voxceleb import 
VoxCeleb from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.training.seeding import seed_everything diff --git a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py index 233977bae..9aa8a2ebe 100644 --- a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py +++ b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py @@ -23,7 +23,7 @@ from typing import List import tqdm from yacs.config import CfgNode -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio +from paddleaudio.backends import soundfile_load as load_audio from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.utils.vector_utils import get_chunks diff --git a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py index 49c234a43..c39dc66df 100644 --- a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py +++ b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py @@ -24,7 +24,7 @@ import random import tqdm from yacs.config import CfgNode -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio +from paddleaudio.backends import soundfile_load as load_audio from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.utils.vector_utils import get_chunks diff --git a/paddlespeech/audio/backends/no_backend.py b/paddlespeech/audio/backends/no_backend.py deleted file mode 100644 index 157536f46..000000000 --- a/paddlespeech/audio/backends/no_backend.py +++ /dev/null @@ -1,32 +0,0 @@ -from pathlib import Path -from typing import Callable -from typing import Optional -from typing import Tuple -from typing import Union - -from paddle import Tensor - -#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py - - -def load( - filepath: Union[str, Path], - out: Optional[Tensor]=None, - normalization: Union[bool, float, Callable]=True, - channels_first: bool=True, - num_frames: int=0, - offset: int=0, - filetype: Optional[str]=None, ) -> Tuple[Tensor, int]: - raise RuntimeError("No audio I/O backend is available.") - - -def save(filepath: str, - src: Tensor, - sample_rate: int, - precision: int=16, - channels_first: bool=True) -> None: - raise RuntimeError("No audio I/O backend is available.") - - -def info(filepath: str) -> None: - raise RuntimeError("No audio I/O backend is available.") diff --git a/paddlespeech/audio/backends/soundfile_backend.py b/paddlespeech/audio/backends/soundfile_backend.py deleted file mode 100644 index 57e06e521..000000000 --- a/paddlespeech/audio/backends/soundfile_backend.py +++ /dev/null @@ -1,662 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import os
-import warnings
-from typing import Optional
-from typing import Tuple
-
-import numpy as np
-import paddle
-import resampy
-import soundfile
-from scipy.io import wavfile
-
-from ..utils import depth_convert
-from ..utils import ParameterError
-from .common import AudioMetaData
-
-__all__ = [
-    'resample',
-    'to_mono',
-    'normalize',
-    'save',
-    'soundfile_save',
-    'load',
-    'soundfile_load',
-    'info',
-]
-NORMALIZE_TYPES = ['linear', 'gaussian']
-MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
-RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
-EPS = 1e-8
-
-
-def resample(y: np.ndarray,
-             src_sr: int,
-             target_sr: int,
-             mode: str='kaiser_fast') -> np.ndarray:
-    """Audio resampling.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        src_sr (int): Source sample rate.
-        target_sr (int): Target sample rate.
-        mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
-
-    Returns:
-        np.ndarray: `y` resampled to `target_sr`.
-    """
-
-    if mode == 'kaiser_best':
-        warnings.warn(
-            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. '
-            'This function is pretty slow; we recommend the mode '
-            'kaiser_fast for large-scale audio training.')
-
-    if not isinstance(y, np.ndarray):
-        raise ParameterError(
-            f'Only numpy np.ndarray is supported, but received y of type {type(y)}')
-
-    if mode not in RESAMPLE_MODES:
-        raise ParameterError(f'resample mode must be in {RESAMPLE_MODES}')
-
-    return resampy.resample(y, src_sr, target_sr, filter=mode)
-
-
-def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
-    """Convert stereo audio to mono.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'.
-
-    Returns:
-        np.ndarray: `y` with mono channel.
-    """
-
-    if merge_type not in MERGE_TYPES:
-        raise ParameterError(
-            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
-        )
-    if y.ndim > 2:
-        raise ParameterError(
-            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
-    if y.ndim == 1:  # nothing to merge
-        return y
-
-    if merge_type == 'ch0':
-        return y[0]
-    if merge_type == 'ch1':
-        return y[1]
-    if merge_type == 'random':
-        return y[np.random.randint(0, 2)]
-
-    # need to do averaging according to dtype
-
-    if y.dtype == 'float32':
-        y_out = (y[0] + y[1]) * 0.5
-    elif y.dtype == 'int16':
-        y_out = y.astype('int32')
-        y_out = (y_out[0] + y_out[1]) // 2
-        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
-                        np.iinfo(y.dtype).max).astype(y.dtype)
-    elif y.dtype == 'int8':
-        y_out = y.astype('int16')
-        y_out = (y_out[0] + y_out[1]) // 2
-        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
-                        np.iinfo(y.dtype).max).astype(y.dtype)
-    else:
-        raise ParameterError(f'Unsupported dtype: {y.dtype}')
-    return y_out
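For orientation, the two helpers above compose as follows. This is a minimal sketch rather than part of the patch, and it assumes the equivalent `resample` and `to_mono` functions remain importable from `paddleaudio.backends.soundfile_backend`, the replacement module documented earlier in this patch:

```python
# Minimal sketch (assumed import path; not part of this patch).
import numpy as np

from paddleaudio.backends.soundfile_backend import resample, to_mono

# Stand-in stereo waveform: 1 second of int16 noise at 44.1 kHz, shape (2, T).
y = np.random.randint(-32768, 32767, size=(2, 44100), dtype='int16')

y_mono = to_mono(y, merge_type='average')  # -> shape (44100,), still int16
# resampy operates on floats, so convert before resampling to 16 kHz.
y_16k = resample(y_mono.astype('float32'), src_sr=44100, target_sr=16000,
                 mode='kaiser_fast')
print(y_16k.shape)  # roughly (16000,)
```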
- """ - with soundfile.SoundFile(file) as sf_desc: - sr_native = sf_desc.samplerate - if offset: - sf_desc.seek(int(offset * sr_native)) - if duration is not None: - frame_duration = int(duration * sr_native) - else: - frame_duration = -1 - y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T - - return y, sf_desc.samplerate - - -def normalize(y: np.ndarray, norm_type: str='linear', - mul_factor: float=1.0) -> np.ndarray: - """Normalize an input audio with additional multiplier. - - Args: - y (np.ndarray): Input waveform array in 1D or 2D. - norm_type (str, optional): Type of normalization. Defaults to 'linear'. - mul_factor (float, optional): Scaling factor. Defaults to 1.0. - - Returns: - np.ndarray: `y` after normalization. - """ - - if norm_type == 'linear': - amax = np.max(np.abs(y)) - factor = 1.0 / (amax + EPS) - y = y * factor * mul_factor - elif norm_type == 'gaussian': - amean = np.mean(y) - astd = np.std(y) - astd = max(astd, EPS) - y = mul_factor * (y - amean) / astd - else: - raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}') - - return y - - -def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None: - """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16. - - Args: - y (np.ndarray): Input waveform array in 1D or 2D. - sr (int): Sample rate. - file (os.PathLike): Path of auido file to save. - """ - if not file.endswith('.wav'): - raise ParameterError( - f'only .wav file supported, but dst file name is: {file}') - - if sr <= 0: - raise ParameterError( - f'Sample rate should be larger than 0, recieved sr = {sr}') - - if y.dtype not in ['int16', 'int8']: - warnings.warn( - f'input data type is {y.dtype}, will convert data to int16 format before saving' - ) - y_out = depth_convert(y, 'int16') - else: - y_out = y - - wavfile.write(file, sr, y_out) - -def soundfile_load( - file: os.PathLike, - sr: Optional[int]=None, - mono: bool=True, - merge_type: str='average', # ch0,ch1,random,average - normal: bool=True, - norm_type: str='linear', - norm_mul_factor: float=1.0, - offset: float=0.0, - duration: Optional[int]=None, - dtype: str='float32', - resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]: - """Load audio file from disk. This function loads audio from disk using using audio beackend. - - Args: - file (os.PathLike): Path of auido file to load. - sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None. - mono (bool, optional): Return waveform with mono channel. Defaults to True. - merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'. - normal (bool, optional): Waveform normalization. Defaults to True. - norm_type (str, optional): Type of normalization. Defaults to 'linear'. - norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0. - offset (float, optional): Offset to the start of waveform. Defaults to 0.0. - duration (Optional[int], optional): Duration of waveform to read. Defaults to None. - dtype (str, optional): Data type of waveform. Defaults to 'float32'. - resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. - - Returns: - Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. 
- """ - - y, r = soundfile_load_(file, offset=offset, dtype=dtype, duration=duration) - - if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)): - raise ParameterError(f'audio file {file} looks empty') - - if mono: - y = to_mono(y, merge_type) - - if sr is not None and sr != r: - y = resample(y, r, sr, mode=resample_mode) - r = sr - - if normal: - y = normalize(y, norm_type, norm_mul_factor) - elif dtype in ['int8', 'int16']: - # still need to do normalization, before depth convertion - y = normalize(y, 'linear', 1.0) - - y = depth_convert(y, dtype) - return y, r - -#the code below is form: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py - -def _get_subtype_for_wav(dtype: paddle.dtype, encoding: str, bits_per_sample: int): - if not encoding: - if not bits_per_sample: - subtype = { - paddle.uint8: "PCM_U8", - paddle.int16: "PCM_16", - paddle.int32: "PCM_32", - paddle.float32: "FLOAT", - paddle.float64: "DOUBLE", - }.get(dtype) - if not subtype: - raise ValueError(f"Unsupported dtype for wav: {dtype}") - return subtype - if bits_per_sample == 8: - return "PCM_U8" - return f"PCM_{bits_per_sample}" - if encoding == "PCM_S": - if not bits_per_sample: - return "PCM_32" - if bits_per_sample == 8: - raise ValueError("wav does not support 8-bit signed PCM encoding.") - return f"PCM_{bits_per_sample}" - if encoding == "PCM_U": - if bits_per_sample in (None, 8): - return "PCM_U8" - raise ValueError("wav only supports 8-bit unsigned PCM encoding.") - if encoding == "PCM_F": - if bits_per_sample in (None, 32): - return "FLOAT" - if bits_per_sample == 64: - return "DOUBLE" - raise ValueError("wav only supports 32/64-bit float PCM encoding.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "ULAW" - raise ValueError("wav only supports 8-bit mu-law encoding.") - if encoding == "ALAW": - if bits_per_sample in (None, 8): - return "ALAW" - raise ValueError("wav only supports 8-bit a-law encoding.") - raise ValueError(f"wav does not support {encoding}.") - - -def _get_subtype_for_sphere(encoding: str, bits_per_sample: int): - if encoding in (None, "PCM_S"): - return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32" - if encoding in ("PCM_U", "PCM_F"): - raise ValueError(f"sph does not support {encoding} encoding.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "ULAW" - raise ValueError("sph only supports 8-bit for mu-law encoding.") - if encoding == "ALAW": - return "ALAW" - raise ValueError(f"sph does not support {encoding}.") - - -def _get_subtype(dtype: paddle.dtype, format: str, encoding: str, bits_per_sample: int): - if format == "wav": - return _get_subtype_for_wav(dtype, encoding, bits_per_sample) - if format == "flac": - if encoding: - raise ValueError("flac does not support encoding.") - if not bits_per_sample: - return "PCM_16" - if bits_per_sample > 24: - raise ValueError("flac does not support bits_per_sample > 24.") - return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}" - if format in ("ogg", "vorbis"): - if encoding or bits_per_sample: - raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.") - return "VORBIS" - if format == "sph": - return _get_subtype_for_sphere(encoding, bits_per_sample) - if format in ("nis", "nist"): - return "PCM_16" - raise ValueError(f"Unsupported format: {format}") - -def save( - filepath: str, - src: paddle.Tensor, - sample_rate: int, - channels_first: bool = True, - compression: Optional[float] = None, - format: Optional[str] = 
None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, -): - """Save audio data to file. - - Note: - The formats this function can handle depend on the soundfile installation. - This function is tested on the following formats; - - * WAV - - * 32-bit floating-point - * 32-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer - - * FLAC - * OGG/VORBIS - * SPHERE - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - - Args: - filepath (str or pathlib.Path): Path to audio file. - src (paddle.Tensor): Audio data to save. must be 2D tensor. - sample_rate (int): sampling rate - channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, - otherwise `[time, channel]`. - compression (float of None, optional): Not used. - It is here only for interface compatibility reson with "sox_io" backend. - format (str or None, optional): Override the audio format. - When ``filepath`` argument is path-like object, audio format is - inferred from file extension. If the file extension is missing or - different, you can specify the correct format with this argument. - - When ``filepath`` argument is file-like object, - this argument is required. - - Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``, - ``"flac"`` and ``"sph"``. - encoding (str or None, optional): Changes the encoding for supported formats. - This argument is effective only for supported formats, sush as - ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are; - - - ``"PCM_S"`` (signed integer Linear PCM) - - ``"PCM_U"`` (unsigned integer Linear PCM) - - ``"PCM_F"`` (floating point PCM) - - ``"ULAW"`` (mu-law) - - ``"ALAW"`` (a-law) - - bits_per_sample (int or None, optional): Changes the bit depth for the - supported formats. - When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``, - you can change the bit depth. - Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. - - Supported formats/encodings/bit depth/compression are: - - ``"wav"`` - - 32-bit floating-point PCM - - 32-bit signed integer PCM - - 24-bit signed integer PCM - - 16-bit signed integer PCM - - 8-bit unsigned integer PCM - - 8-bit mu-law - - 8-bit a-law - - Note: - Default encoding/bit depth is determined by the dtype of - the input Tensor. - - ``"flac"`` - - 8-bit - - 16-bit (default) - - 24-bit - - ``"ogg"``, ``"vorbis"`` - - Doesn't accept changing configuration. - - ``"sph"`` - - 8-bit signed integer PCM - - 16-bit signed integer PCM - - 24-bit signed integer PCM - - 32-bit signed integer PCM (default) - - 8-bit mu-law - - 8-bit a-law - - 16-bit a-law - - 24-bit a-law - - 32-bit a-law - - """ - if src.ndim != 2: - raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.") - if compression is not None: - warnings.warn( - '`save` function of "soundfile" backend does not support "compression" parameter. ' - "The argument is silently ignored." - ) - if hasattr(filepath, "write"): - if format is None: - raise RuntimeError("`format` is required when saving to file object.") - ext = format.lower() - else: - ext = str(filepath).split(".")[-1].lower() - - if bits_per_sample not in (None, 8, 16, 24, 32, 64): - raise ValueError("Invalid bits_per_sample.") - if bits_per_sample == 24: - warnings.warn( - "Saving audio with 24 bits per sample might warp samples near -1. " - "Using 16 bits per sample might be able to avoid this." 
- ) - subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample) - - # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format, - # so we extend the extensions manually here - if ext in ["nis", "nist", "sph"] and format is None: - format = "NIST" - - if channels_first: - src = src.t() - - soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format) - -_SUBTYPE2DTYPE = { - "PCM_S8": "int8", - "PCM_U8": "uint8", - "PCM_16": "int16", - "PCM_32": "int32", - "FLOAT": "float32", - "DOUBLE": "float64", -} - -def load( - filepath: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, -) -> Tuple[paddle.Tensor, int]: - """Load audio data from file. - - Note: - The formats this function can handle depend on the soundfile installation. - This function is tested on the following formats; - - * WAV - - * 32-bit floating-point - * 32-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer - - * FLAC - * OGG/VORBIS - * SPHERE - - By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with - ``float32`` dtype and the shape of `[channel, time]`. - The samples are normalized to fit in the range of ``[-1.0, 1.0]``. - - When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit - signed integer and 8-bit unsigned integer (24-bit signed integer is not supported), - by providing ``normalize=False``, this function can return integer Tensor, where the samples - are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor - for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. - - ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as - ``flac`` and ``mp3``. - For these formats, this function always returns ``float32`` Tensor with values normalized to - ``[-1.0, 1.0]``. - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend. - - Args: - filepath (path-like object or file-like object): - Source of audio data. - frame_offset (int, optional): - Number of frames to skip before start reading data. - num_frames (int, optional): - Maximum number of frames to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - This function may return the less number of frames if there is not enough - frames in the given file. - normalize (bool, optional): - When ``True``, this function always return ``float32``, and sample values are - normalized to ``[-1.0, 1.0]``. - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Not used. PySoundFile does not accept format hint. - - Returns: - (paddle.Tensor, int): Resulting Tensor and sample rate. - If the input file has integer wav format and normalization is off, then it has - integer type, else ``float32`` type. If ``channels_first=True``, it has - `[channel, time]` else `[time, channel]`. 
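A short usage sketch for the save() API above, assuming paddle and soundfile are installed; file names are placeholders:

    import paddle

    # Hypothetical two-channel signal in the default [channel, time] layout.
    wav = paddle.uniform([2, 16000], min=-1.0, max=1.0)

    save("out.wav", wav, sample_rate=16000)  # float32 dtype -> FLOAT subtype
    save("out16.wav", wav, 16000, encoding="PCM_S", bits_per_sample=16)  # 16-bit PCM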
- """ - with soundfile.SoundFile(filepath, "r") as file_: - if file_.format != "WAV" or normalize: - dtype = "float32" - elif file_.subtype not in _SUBTYPE2DTYPE: - raise ValueError(f"Unsupported subtype: {file_.subtype}") - else: - dtype = _SUBTYPE2DTYPE[file_.subtype] - - frames = file_._prepare_read(frame_offset, None, num_frames) - waveform = file_.read(frames, dtype, always_2d=True) - sample_rate = file_.samplerate - - waveform = paddle.to_tensor(waveform) - if channels_first: - waveform = paddle.transpose(waveform, perm=[1,0]) - return waveform, sample_rate - - -# Mapping from soundfile subtype to number of bits per sample. -# This is mostly heuristical and the value is set to 0 when it is irrelevant -# (lossy formats) or when it can't be inferred. -# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard: -# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony, -# the default seems to be 8 bits but it can be compressed further to 4 bits. -# The dict is inspired from -# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94 -_SUBTYPE_TO_BITS_PER_SAMPLE = { - "PCM_S8": 8, # Signed 8 bit data - "PCM_16": 16, # Signed 16 bit data - "PCM_24": 24, # Signed 24 bit data - "PCM_32": 32, # Signed 32 bit data - "PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only) - "FLOAT": 32, # 32 bit float data - "DOUBLE": 64, # 64 bit float data - "ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types - "ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types - "IMA_ADPCM": 0, # IMA ADPCM. - "MS_ADPCM": 0, # Microsoft ADPCM. - "GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate) - "VOX_ADPCM": 0, # OKI / Dialogix ADPCM - "G721_32": 0, # 32kbs G721 ADPCM encoding. - "G723_24": 0, # 24kbs G723 ADPCM encoding. - "G723_40": 0, # 40kbs G723 ADPCM encoding. - "DWVW_12": 12, # 12 bit Delta Width Variable Word encoding. - "DWVW_16": 16, # 16 bit Delta Width Variable Word encoding. - "DWVW_24": 24, # 24 bit Delta Width Variable Word encoding. - "DWVW_N": 0, # N bit Delta Width Variable Word encoding. - "DPCM_8": 8, # 8 bit differential PCM (XI only) - "DPCM_16": 16, # 16 bit differential PCM (XI only) - "VORBIS": 0, # Xiph Vorbis encoding. (lossy) - "ALAC_16": 16, # Apple Lossless Audio Codec (16 bit). - "ALAC_20": 20, # Apple Lossless Audio Codec (20 bit). - "ALAC_24": 24, # Apple Lossless Audio Codec (24 bit). - "ALAC_32": 32, # Apple Lossless Audio Codec (32 bit). -} - -def _get_bit_depth(subtype): - if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE: - warnings.warn( - f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample " - "attribute will be set to 0. If you are seeing this warning, please " - "report by opening an issue on github (after checking for existing/closed ones). " - "You may otherwise ignore this warning." 
- ) - return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0) - -_SUBTYPE_TO_ENCODING = { - "PCM_S8": "PCM_S", - "PCM_16": "PCM_S", - "PCM_24": "PCM_S", - "PCM_32": "PCM_S", - "PCM_U8": "PCM_U", - "FLOAT": "PCM_F", - "DOUBLE": "PCM_F", - "ULAW": "ULAW", - "ALAW": "ALAW", - "VORBIS": "VORBIS", -} - -def _get_encoding(format: str, subtype: str): - if format == "FLAC": - return "FLAC" - return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN") - -def info(filepath: str, format: Optional[str] = None) -> AudioMetaData: - """Get signal information of an audio file. - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - - Args: - filepath (path-like object or file-like object): - Source of audio data. - format (str or None, optional): - Not used. PySoundFile does not accept format hint. - - Returns: - AudioMetaData: meta data of the given audio. - - """ - sinfo = soundfile.info(filepath) - return AudioMetaData( - sinfo.samplerate, - sinfo.frames, - sinfo.channels, - bits_per_sample=_get_bit_depth(sinfo.subtype), - encoding=_get_encoding(sinfo.format, sinfo.subtype), - ) diff --git a/paddlespeech/audio/compliance/__init__.py b/paddlespeech/audio/compliance/__init__.py deleted file mode 100644 index c08f9ab11..000000000 --- a/paddlespeech/audio/compliance/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from . import kaldi -from . import librosa diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py deleted file mode 100644 index 538be0196..000000000 --- a/paddlespeech/audio/compliance/kaldi.py +++ /dev/null @@ -1,638 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
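And a matching sketch for load() and info() above; the AudioMetaData attribute names (sample_rate, num_frames, num_channels, bits_per_sample, encoding) are assumed from the .common module, which this patch does not show:

    wav_f32, sr = load("out.wav")                     # float32, [channel, time]
    wav_i16, sr = load("out16.wav", normalize=False)  # int16 for 16-bit PCM WAV
    tail, sr = load("out.wav", frame_offset=8000, num_frames=4000)

    meta = info("out16.wav")
    print(meta.sample_rate, meta.num_frames, meta.num_channels,
          meta.bits_per_sample, meta.encoding)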
-# Modified from torchaudio(https://github.com/pytorch/audio) -import math -from typing import Tuple - -import paddle -from paddle import Tensor - -from ..functional import create_dct -from ..functional.window import get_window - -__all__ = [ - 'spectrogram', - 'fbank', - 'mfcc', -] - -# window types -HANNING = 'hann' -HAMMING = 'hamming' -POVEY = 'povey' -RECTANGULAR = 'rect' -BLACKMAN = 'blackman' - - -def _get_epsilon(dtype): - return paddle.to_tensor(1e-07, dtype=dtype) - - -def _next_power_of_2(x: int) -> int: - return 1 if x == 0 else 2**(x - 1).bit_length() - - -def _get_strided(waveform: Tensor, - window_size: int, - window_shift: int, - snip_edges: bool) -> Tensor: - assert waveform.dim() == 1 - num_samples = waveform.shape[0] - - if snip_edges: - if num_samples < window_size: - return paddle.empty((0, 0), dtype=waveform.dtype) - else: - m = 1 + (num_samples - window_size) // window_shift - else: - reversed_waveform = paddle.flip(waveform, [0]) - m = (num_samples + (window_shift // 2)) // window_shift - pad = window_size // 2 - window_shift // 2 - pad_right = reversed_waveform - if pad > 0: - pad_left = reversed_waveform[-pad:] - waveform = paddle.concat((pad_left, waveform, pad_right), axis=0) - else: - waveform = paddle.concat((waveform[-pad:], pad_right), axis=0) - - return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T - - -def _feature_window_function( - window_type: str, - window_size: int, - blackman_coeff: float, - dtype: int, ) -> Tensor: - if window_type == HANNING: - return get_window('hann', window_size, fftbins=False, dtype=dtype) - elif window_type == HAMMING: - return get_window('hamming', window_size, fftbins=False, dtype=dtype) - elif window_type == POVEY: - return get_window( - 'hann', window_size, fftbins=False, dtype=dtype).pow(0.85) - elif window_type == RECTANGULAR: - return paddle.ones([window_size], dtype=dtype) - elif window_type == BLACKMAN: - a = 2 * math.pi / (window_size - 1) - window_function = paddle.arange(window_size, dtype=dtype) - return (blackman_coeff - 0.5 * paddle.cos(a * window_function) + - (0.5 - blackman_coeff) * paddle.cos(2 * a * window_function) - ).astype(dtype) - else: - raise Exception('Invalid window type ' + window_type) - - -def _get_log_energy(strided_input: Tensor, epsilon: Tensor, - energy_floor: float) -> Tensor: - log_energy = paddle.maximum(strided_input.pow(2).sum(1), epsilon).log() - if energy_floor == 0.0: - return log_energy - return paddle.maximum( - log_energy, - paddle.to_tensor(math.log(energy_floor), dtype=strided_input.dtype)) - - -def _get_waveform_and_window_properties( - waveform: Tensor, - channel: int, - sr: int, - frame_shift: float, - frame_length: float, - round_to_power_of_two: bool, - preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]: - channel = max(channel, 0) - assert channel < waveform.shape[0], ( - 'Invalid channel {} for size {}'.format(channel, waveform.shape[0])) - waveform = waveform[channel, :] # size (n) - window_shift = int( - sr * frame_shift * - 0.001) # pass frame_shift and frame_length in milliseconds - window_size = int(sr * frame_length * 0.001) - padded_window_size = _next_power_of_2( - window_size) if round_to_power_of_two else window_size - - assert 2 <= window_size <= len(waveform), ( - 'choose a window size {} that is [2, {}]'.format(window_size, - len(waveform))) - assert 0 < window_shift, '`window_shift` must be greater than 0' - assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' 
\ - ' use `round_to_power_of_two` or change `frame_length`' - assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]' - assert sr > 0, '`sr` must be greater than zero' - return waveform, window_shift, window_size, padded_window_size - - -def _get_window(waveform: Tensor, - padded_window_size: int, - window_size: int, - window_shift: int, - window_type: str, - blackman_coeff: float, - snip_edges: bool, - raw_energy: bool, - energy_floor: float, - dither: float, - remove_dc_offset: bool, - preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]: - dtype = waveform.dtype - epsilon = _get_epsilon(dtype) - - # (m, window_size) - strided_input = _get_strided(waveform, window_size, window_shift, - snip_edges) - - if dither != 0.0: - x = paddle.maximum(epsilon, - paddle.rand(strided_input.shape, dtype=dtype)) - rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x) - strided_input = strided_input + rand_gauss * dither - - if remove_dc_offset: - row_means = paddle.mean(strided_input, axis=1).unsqueeze(1) # (m, 1) - strided_input = strided_input - row_means - - if raw_energy: - signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) # (m) - - if preemphasis_coefficient != 0.0: - offset_strided_input = paddle.nn.functional.pad( - strided_input.unsqueeze(0), (1, 0), - data_format='NCL', - mode='replicate').squeeze(0) # (m, window_size + 1) - strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, : - -1] - - window_function = _feature_window_function( - window_type, window_size, blackman_coeff, - dtype).unsqueeze(0) # (1, window_size) - strided_input = strided_input * window_function # (m, window_size) - - # (m, padded_window_size) - if padded_window_size != window_size: - padding_right = padded_window_size - window_size - strided_input = paddle.nn.functional.pad( - strided_input.unsqueeze(0), (0, padding_right), - data_format='NCL', - mode='constant', - value=0).squeeze(0) - - if not raw_energy: - signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) # size (m) - - return strided_input, signal_log_energy - - -def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor: - if subtract_mean: - col_means = paddle.mean(tensor, axis=0).unsqueeze(0) - tensor = tensor - col_means - return tensor - - -def spectrogram(waveform: Tensor, - blackman_coeff: float=0.42, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sr: int=16000, - snip_edges: bool=True, - subtract_mean: bool=False, - window_type: str=POVEY) -> Tensor: - """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. - - Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. - blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. - channel (int, optional): Select the channel of waveform. Defaults to -1. - dither (float, optional): Dithering constant . Defaults to 0.0. - energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. - frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. - frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. - preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. 
- raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. - remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. - round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input - to FFT. Defaults to True. - sr (int, optional): Sample rate of input waveform. Defaults to 16000. - snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it - is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. - subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. - - Returns: - Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames - depends on frame_length and frame_shift. - """ - dtype = waveform.dtype - epsilon = _get_epsilon(dtype) - - waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( - waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, - preemphasis_coefficient) - - strided_input, signal_log_energy = _get_window( - waveform, padded_window_size, window_size, window_shift, window_type, - blackman_coeff, snip_edges, raw_energy, energy_floor, dither, - remove_dc_offset, preemphasis_coefficient) - - # (m, padded_window_size // 2 + 1, 2) - fft = paddle.fft.rfft(strided_input) - - power_spectrum = paddle.maximum( - fft.abs().pow(2.), epsilon).log() # (m, padded_window_size // 2 + 1) - power_spectrum[:, 0] = signal_log_energy - - power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean) - return power_spectrum - - -def _inverse_mel_scale_scalar(mel_freq: float) -> float: - return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0) - - -def _inverse_mel_scale(mel_freq: Tensor) -> Tensor: - return 700.0 * ((mel_freq / 1127.0).exp() - 1.0) - - -def _mel_scale_scalar(freq: float) -> float: - return 1127.0 * math.log(1.0 + freq / 700.0) - - -def _mel_scale(freq: Tensor) -> Tensor: - return 1127.0 * (1.0 + freq / 700.0).log() - - -def _vtln_warp_freq(vtln_low_cutoff: float, - vtln_high_cutoff: float, - low_freq: float, - high_freq: float, - vtln_warp_factor: float, - freq: Tensor) -> Tensor: - assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq' - assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]' - l = vtln_low_cutoff * max(1.0, vtln_warp_factor) - h = vtln_high_cutoff * min(1.0, vtln_warp_factor) - scale = 1.0 / vtln_warp_factor - Fl = scale * l - Fh = scale * h - assert l > low_freq and h < high_freq - scale_left = (Fl - low_freq) / (l - low_freq) - scale_right = (high_freq - Fh) / (high_freq - h) - res = paddle.empty_like(freq) - - outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \ - | paddle.greater_than(freq, paddle.to_tensor(high_freq)) - before_l = paddle.less_than(freq, paddle.to_tensor(l)) - before_h = paddle.less_than(freq, paddle.to_tensor(h)) - after_h = paddle.greater_equal(freq, paddle.to_tensor(h)) - - res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq) - res[before_h] = scale * freq[before_h] - res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq) - res[outside_low_high_freq] = freq[outside_low_high_freq] - - return res - - -def _vtln_warp_mel_freq(vtln_low_cutoff: float, - 
vtln_high_cutoff: float, - low_freq, - high_freq: float, - vtln_warp_factor: float, - mel_freq: Tensor) -> Tensor: - return _mel_scale( - _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, - vtln_warp_factor, _inverse_mel_scale(mel_freq))) - - -def _get_mel_banks(num_bins: int, - window_length_padded: int, - sample_freq: float, - low_freq: float, - high_freq: float, - vtln_low: float, - vtln_high: float, - vtln_warp_factor: float) -> Tuple[Tensor, Tensor]: - assert num_bins > 3, 'Must have at least 3 mel bins' - assert window_length_padded % 2 == 0 - num_fft_bins = window_length_padded / 2 - nyquist = 0.5 * sample_freq - - if high_freq <= 0.0: - high_freq += nyquist - - assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \ - ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist)) - - fft_bin_width = sample_freq / window_length_padded - mel_low_freq = _mel_scale_scalar(low_freq) - mel_high_freq = _mel_scale_scalar(high_freq) - - mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) - - if vtln_high < 0.0: - vtln_high += nyquist - - assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and - (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \ - ('Bad values in options: vtln-low {} and vtln-high {}, versus ' - 'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq)) - - bin = paddle.arange(num_bins).unsqueeze(1) - left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) - center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1) - right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1) - - if vtln_warp_factor != 1.0: - left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, - vtln_warp_factor, left_mel) - center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, - high_freq, vtln_warp_factor, - center_mel) - right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, - high_freq, vtln_warp_factor, right_mel) - - center_freqs = _inverse_mel_scale(center_mel) # (num_bins) - # (1, num_fft_bins) - mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0) - - # (num_bins, num_fft_bins) - up_slope = (mel - left_mel) / (center_mel - left_mel) - down_slope = (right_mel - mel) / (right_mel - center_mel) - - if vtln_warp_factor == 1.0: - bins = paddle.maximum( - paddle.zeros([1]), paddle.minimum(up_slope, down_slope)) - else: - bins = paddle.zeros_like(up_slope) - up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than( - mel, center_mel) - down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than( - mel, right_mel) - bins[up_idx] = up_slope[up_idx] - bins[down_idx] = down_slope[down_idx] - - return bins, center_freqs - - -def fbank(waveform: Tensor, - blackman_coeff: float=0.42, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - high_freq: float=0.0, - htk_compat: bool=False, - low_freq: float=20.0, - n_mels: int=23, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sr: int=16000, - snip_edges: bool=True, - subtract_mean: bool=False, - use_energy: bool=False, - use_log_fbank: bool=True, - use_power: bool=True, - vtln_high: float=-500.0, - vtln_low: float=100.0, - vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: - """Compute and return filter banks from a waveform. 
The output is identical to Kaldi's. - - Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. - blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. - channel (int, optional): Select the channel of waveform. Defaults to -1. - dither (float, optional): Dithering constant . Defaults to 0.0. - energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. - frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. - frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. - high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. - htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. - low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. - n_mels (int, optional): Number of output mel bins. Defaults to 23. - preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. - raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. - remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. - round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input - to FFT. Defaults to True. - sr (int, optional): Sample rate of input waveform. Defaults to 16000. - snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it - is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. - subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. - use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True. - use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True. - vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. - vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. - vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. - - Returns: - Tensor: A filter banks tensor with shape `(m, n_mels)`. - """ - dtype = waveform.dtype - - waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( - waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, - preemphasis_coefficient) - - strided_input, signal_log_energy = _get_window( - waveform, padded_window_size, window_size, window_shift, window_type, - blackman_coeff, snip_edges, raw_energy, energy_floor, dither, - remove_dc_offset, preemphasis_coefficient) - - # (m, padded_window_size // 2 + 1) - spectrum = paddle.fft.rfft(strided_input).abs() - if use_power: - spectrum = spectrum.pow(2.) 
- - # (n_mels, padded_window_size // 2) - mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq, - high_freq, vtln_low, vtln_high, vtln_warp) - mel_energies = mel_energies.astype(dtype) - - # (n_mels, padded_window_size // 2 + 1) - mel_energies = paddle.nn.functional.pad( - mel_energies.unsqueeze(0), (0, 1), - data_format='NCL', - mode='constant', - value=0).squeeze(0) - - # (m, n_mels) - mel_energies = paddle.mm(spectrum, mel_energies.T) - if use_log_fbank: - mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log() - - if use_energy: - signal_log_energy = signal_log_energy.unsqueeze(1) - if htk_compat: - mel_energies = paddle.concat( - (mel_energies, signal_log_energy), axis=1) - else: - mel_energies = paddle.concat( - (signal_log_energy, mel_energies), axis=1) - - # (m, n_mels + 1) - mel_energies = _subtract_column_mean(mel_energies, subtract_mean) - return mel_energies - - -def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor: - dct_matrix = create_dct(n_mels, n_mels, 'ortho') - dct_matrix[:, 0] = math.sqrt(1 / float(n_mels)) - dct_matrix = dct_matrix[:, :n_mfcc] # (n_mels, n_mfcc) - return dct_matrix - - -def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor: - i = paddle.arange(n_mfcc) - return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i / - cepstral_lifter) - - -def mfcc(waveform: Tensor, - blackman_coeff: float=0.42, - cepstral_lifter: float=22.0, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - high_freq: float=0.0, - htk_compat: bool=False, - low_freq: float=20.0, - n_mfcc: int=13, - n_mels: int=23, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sr: int=16000, - snip_edges: bool=True, - subtract_mean: bool=False, - use_energy: bool=False, - vtln_high: float=-500.0, - vtln_low: float=100.0, - vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: - """Compute and return mel frequency cepstral coefficients from a waveform. The output is - identical to Kaldi's. - - Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. - blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. - cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. - channel (int, optional): Select the channel of waveform. Defaults to -1. - dither (float, optional): Dithering constant . Defaults to 0.0. - energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. - frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. - frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. - high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. - htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. - low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. - n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13. - n_mels (int, optional): Number of output mel bins. Defaults to 23. - preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. - raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. - remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. 
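For orientation, a hedged sketch of the kaldi-compatible features defined in this file; the mfcc wrapper deleted just below composes fbank with a DCT:

    import paddle

    # Synthetic 1-second mono waveform, shape (C, T).
    wav = paddle.uniform([1, 16000], min=-1.0, max=1.0)

    # (num_frames, n_fft//2 + 1) log-power spectrogram, energy in column 0.
    spec = spectrogram(wav, sr=16000)
    # (num_frames, 23) log mel filterbank features.
    feats = fbank(wav, sr=16000, n_mels=23)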
- round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input - to FFT. Defaults to True. - sr (int, optional): Sample rate of input waveform. Defaults to 16000. - snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it - is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. - subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. - vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. - vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. - vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. - - Returns: - Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`. - """ - assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( - n_mfcc, n_mels) - - dtype = waveform.dtype - - # (m, n_mels + use_energy) - feature = fbank( - waveform=waveform, - blackman_coeff=blackman_coeff, - channel=channel, - dither=dither, - energy_floor=energy_floor, - frame_length=frame_length, - frame_shift=frame_shift, - high_freq=high_freq, - htk_compat=htk_compat, - low_freq=low_freq, - n_mels=n_mels, - preemphasis_coefficient=preemphasis_coefficient, - raw_energy=raw_energy, - remove_dc_offset=remove_dc_offset, - round_to_power_of_two=round_to_power_of_two, - sr=sr, - snip_edges=snip_edges, - subtract_mean=False, - use_energy=use_energy, - use_log_fbank=True, - use_power=True, - vtln_high=vtln_high, - vtln_low=vtln_low, - vtln_warp=vtln_warp, - window_type=window_type) - - if use_energy: - # (m) - signal_log_energy = feature[:, n_mels if htk_compat else 0] - mel_offset = int(not htk_compat) - feature = feature[:, mel_offset:(n_mels + mel_offset)] - - # (n_mels, n_mfcc) - dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype) - - # (m, n_mfcc) - feature = feature.matmul(dct_matrix) - - if cepstral_lifter != 0.0: - # (1, n_mfcc) - lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0) - feature *= lifter_coeffs.astype(dtype=dtype) - - if use_energy: - feature[:, 0] = signal_log_energy - - if htk_compat: - energy = feature[:, 0].unsqueeze(1) # (m, 1) - feature = feature[:, 1:] # (m, n_mfcc - 1) - if not use_energy: - energy *= math.sqrt(2) - - feature = paddle.concat((feature, energy), axis=1) - - feature = _subtract_column_mean(feature, subtract_mean) - return feature diff --git a/paddlespeech/audio/compliance/librosa.py b/paddlespeech/audio/compliance/librosa.py deleted file mode 100644 index 17ad51b41..000000000 --- a/paddlespeech/audio/compliance/librosa.py +++ /dev/null @@ -1,788 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from librosa(https://github.com/librosa/librosa) -import warnings -from typing import List -from typing import Optional -from typing import Union - -import numpy as np -import scipy -from numpy.lib.stride_tricks import as_strided -from scipy import signal - -from ..utils import depth_convert -from ..utils import ParameterError - -__all__ = [ - # dsp - 'stft', - 'mfcc', - 'hz_to_mel', - 'mel_to_hz', - 'mel_frequencies', - 'power_to_db', - 'compute_fbank_matrix', - 'melspectrogram', - 'spectrogram', - 'mu_encode', - 'mu_decode', - # augmentation - 'depth_augment', - 'spect_augment', - 'random_crop1d', - 'random_crop2d', - 'adaptive_spect_augment', -] - - -def _pad_center(data: np.ndarray, size: int, axis: int=-1, - **kwargs) -> np.ndarray: - """Pad an array to a target length along a target axis. - - This differs from `np.pad` by centering the data prior to padding, - analogous to `str.center` - """ - - kwargs.setdefault("mode", "constant") - n = data.shape[axis] - lpad = int((size - n) // 2) - lengths = [(0, 0)] * data.ndim - lengths[axis] = (lpad, int(size - n - lpad)) - - if lpad < 0: - raise ParameterError(("Target size ({size:d}) must be " - "at least input size ({n:d})")) - - return np.pad(data, lengths, **kwargs) - - -def _split_frames(x: np.ndarray, - frame_length: int, - hop_length: int, - axis: int=-1) -> np.ndarray: - """Slice a data array into (overlapping) frames. - - This function is aligned with librosa.frame - """ - - if not isinstance(x, np.ndarray): - raise ParameterError( - f"Input must be of type numpy.ndarray, given type(x)={type(x)}") - - if x.shape[axis] < frame_length: - raise ParameterError(f"Input is too short (n={x.shape[axis]:d})" - f" for frame_length={frame_length:d}") - - if hop_length < 1: - raise ParameterError(f"Invalid hop_length: {hop_length:d}") - - if axis == -1 and not x.flags["F_CONTIGUOUS"]: - warnings.warn(f"librosa.util.frame called with axis={axis} " - "on a non-contiguous input. This will result in a copy.") - x = np.asfortranarray(x) - elif axis == 0 and not x.flags["C_CONTIGUOUS"]: - warnings.warn(f"librosa.util.frame called with axis={axis} " - "on a non-contiguous input. This will result in a copy.") - x = np.ascontiguousarray(x) - - n_frames = 1 + (x.shape[axis] - frame_length) // hop_length - strides = np.asarray(x.strides) - - new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize - - if axis == -1: - shape = list(x.shape)[:-1] + [frame_length, n_frames] - strides = list(strides) + [hop_length * new_stride] - - elif axis == 0: - shape = [n_frames, frame_length] + list(x.shape)[1:] - strides = [hop_length * new_stride] + list(strides) - - else: - raise ParameterError(f"Frame axis={axis} must be either 0 or -1") - - return as_strided(x, shape=shape, strides=strides) - - -def _check_audio(y, mono=True) -> bool: - """Determine whether a variable contains valid audio data. 
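Illustrative only: how the _split_frames helper above strides a signal into overlapping frames (values assume the default axis=-1, where each column is one frame):

    import numpy as np

    sig = np.arange(10, dtype=np.float32)
    frames = _split_frames(sig, frame_length=4, hop_length=2)
    # frames.shape == (4, 4); frames[:, 1] == [2., 3., 4., 5.]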
- - The audio y must be a np.ndarray, ether 1-channel or two channel - """ - if not isinstance(y, np.ndarray): - raise ParameterError("Audio data must be of type numpy.ndarray") - if y.ndim > 2: - raise ParameterError( - f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}") - - if mono and y.ndim == 2: - raise ParameterError( - f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}") - - if (mono and len(y) == 0) or (not mono and y.shape[1] < 0): - raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}") - - if not np.issubdtype(y.dtype, np.floating): - raise ParameterError("Audio data must be floating-point") - - if not np.isfinite(y).all(): - raise ParameterError("Audio buffer is not finite everywhere") - - return True - - -def hz_to_mel(frequencies: Union[float, List[float], np.ndarray], - htk: bool=False) -> np.ndarray: - """Convert Hz to Mels. - - Args: - frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - np.ndarray: Frequency in mels. - """ - freq = np.asanyarray(frequencies) - - if htk: - return 2595.0 * np.log10(1.0 + freq / 700.0) - - # Fill in the linear part - f_min = 0.0 - f_sp = 200.0 / 3 - - mels = (freq - f_min) / f_sp - - # Fill in the log-scale part - - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - if freq.ndim: - # If we have array data, vectorize - log_t = freq >= min_log_hz - mels[log_t] = min_log_mel + \ - np.log(freq[log_t] / min_log_hz) / logstep - elif freq >= min_log_hz: - # If we have scalar data, heck directly - mels = min_log_mel + np.log(freq / min_log_hz) / logstep - - return mels - - -def mel_to_hz(mels: Union[float, List[float], np.ndarray], - htk: int=False) -> np.ndarray: - """Convert mel bin numbers to frequencies. - - Args: - mels (Union[float, List[float], np.ndarray]): Frequency in mels. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - np.ndarray: Frequencies in Hz. - """ - mel_array = np.asanyarray(mels) - - if htk: - return 700.0 * (10.0**(mel_array / 2595.0) - 1.0) - - # Fill in the linear scale - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mel_array - - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - if mel_array.ndim: - # If we have vector data, vectorize - log_t = mel_array >= min_log_mel - freqs[log_t] = min_log_hz * \ - np.exp(logstep * (mel_array[log_t] - min_log_mel)) - elif mel_array >= min_log_mel: - # If we have scalar data, check directly - freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel)) - - return freqs - - -def mel_frequencies(n_mels: int=128, - fmin: float=0.0, - fmax: float=11025.0, - htk: bool=False) -> np.ndarray: - """Compute mel frequencies. - - Args: - n_mels (int, optional): Number of mel bins. Defaults to 128. - fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. - fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`. 
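A quick check of the two mel conversions above; the default (htk=False) is the Slaney scale, linear below 1 kHz and logarithmic above:

    import numpy as np

    freqs = np.array([440.0, 1000.0, 4000.0])
    mels = hz_to_mel(freqs)          # Slaney scale by default
    back = mel_to_hz(mels)
    assert np.allclose(back, freqs)  # the two maps are inverses

    hz_to_mel(440.0, htk=True)       # 2595 * log10(1 + 440/700) ~= 549.6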
- """ - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = hz_to_mel(fmin, htk=htk) - max_mel = hz_to_mel(fmax, htk=htk) - - mels = np.linspace(min_mel, max_mel, n_mels) - - return mel_to_hz(mels, htk=htk) - - -def fft_frequencies(sr: int, n_fft: int) -> np.ndarray: - """Compute fourier frequencies. - - Args: - sr (int): Sample rate. - n_fft (int): FFT size. - - Returns: - np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. - """ - return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) - - -def compute_fbank_matrix(sr: int, - n_fft: int, - n_mels: int=128, - fmin: float=0.0, - fmax: Optional[float]=None, - htk: bool=False, - norm: str="slaney", - dtype: type=np.float32) -> np.ndarray: - """Compute fbank matrix. - - Args: - sr (int): Sample rate. - n_fft (int): FFT size. - n_mels (int, optional): Number of mel bins. Defaults to 128. - fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. - fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. - htk (bool, optional): Use htk scaling. Defaults to False. - norm (str, optional): Type of normalization. Defaults to "slaney". - dtype (type, optional): Data type. Defaults to np.float32. - - - Returns: - np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. - """ - if norm != "slaney": - raise ParameterError('norm must set to slaney') - - if fmax is None: - fmax = float(sr) / 2 - - # Initialize the weights - n_mels = int(n_mels) - weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft) - - # 'Center freqs' of mel bands - uniformly spaced between limits - mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk) - - fdiff = np.diff(mel_f) - ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = np.maximum(0, np.minimum(lower, upper)) - - if norm == "slaney": - # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) - weights *= enorm[:, np.newaxis] - - # Only check weights if f_mel[0] is positive - if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)): - # This means we have an empty channel somewhere - warnings.warn("Empty filters detected in mel frequency basis. " - "Some channels will produce empty responses. " - "Try increasing your sampling rate (and fmax) or " - "reducing n_mels.") - - return weights - - -def stft(x: np.ndarray, - n_fft: int=2048, - hop_length: Optional[int]=None, - win_length: Optional[int]=None, - window: str="hann", - center: bool=True, - dtype: type=np.complex64, - pad_mode: str="reflect") -> np.ndarray: - """Short-time Fourier transform (STFT). - - Args: - x (np.ndarray): Input waveform in one dimension. - n_fft (int, optional): FFT size. Defaults to 2048. - hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None. - win_length (Optional[int], optional): The size of window. Defaults to None. - window (str, optional): A string of window specification. Defaults to "hann". - center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. - dtype (type, optional): Data type of STFT results. Defaults to np.complex64. 
- pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". - - Returns: - np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`. - """ - _check_audio(x) - - # By default, use the entire frame - if win_length is None: - win_length = n_fft - - # Set the default hop, if it's not already specified - if hop_length is None: - hop_length = int(win_length // 4) - - fft_window = signal.get_window(window, win_length, fftbins=True) - - # Pad the window out to n_fft size - fft_window = _pad_center(fft_window, n_fft) - - # Reshape so that the window can be broadcast - fft_window = fft_window.reshape((-1, 1)) - - # Pad the time series so that frames are centered - if center: - if n_fft > x.shape[-1]: - warnings.warn( - f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" - ) - x = np.pad(x, int(n_fft // 2), mode=pad_mode) - - elif n_fft > x.shape[-1]: - raise ParameterError( - f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" - ) - - # Window the time series. - x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length) - # Pre-allocate the STFT matrix - stft_matrix = np.empty( - (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F") - fft = np.fft # use numpy fft as default - # Constrain STFT block sizes to 256 KB - MAX_MEM_BLOCK = 2**8 * 2**10 - # how many columns can we fit within MAX_MEM_BLOCK? - n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize) - n_columns = max(n_columns, 1) - - for bl_s in range(0, stft_matrix.shape[1], n_columns): - bl_t = min(bl_s + n_columns, stft_matrix.shape[1]) - stft_matrix[:, bl_s:bl_t] = fft.rfft( - fft_window * x_frames[:, bl_s:bl_t], axis=0) - - return stft_matrix - - -def power_to_db(spect: np.ndarray, - ref: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=80.0) -> np.ndarray: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. - - Args: - spect (np.ndarray): STFT power spectrogram of an input waveform. - ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. - amin (float, optional): Minimum threshold. Defaults to 1e-10. - top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0. - - Returns: - np.ndarray: Power spectrogram in db scale. - """ - spect = np.asarray(spect) - - if amin <= 0: - raise ParameterError("amin must be strictly positive") - - if np.issubdtype(spect.dtype, np.complexfloating): - warnings.warn( - "power_to_db was called on complex input so phase " - "information will be discarded. 
To suppress this warning, " - "call power_to_db(np.abs(D)**2) instead.") - magnitude = np.abs(spect) - else: - magnitude = spect - - if callable(ref): - # User supplied a function to calculate reference power - ref_value = ref(magnitude) - else: - ref_value = np.abs(ref) - - log_spec = 10.0 * np.log10(np.maximum(amin, magnitude)) - log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value)) - - if top_db is not None: - if top_db < 0: - raise ParameterError("top_db must be non-negative") - log_spec = np.maximum(log_spec, log_spec.max() - top_db) - - return log_spec - - -def mfcc(x: np.ndarray, - sr: int=16000, - spect: Optional[np.ndarray]=None, - n_mfcc: int=20, - dct_type: int=2, - norm: str="ortho", - lifter: int=0, - **kwargs) -> np.ndarray: - """Mel-frequency cepstral coefficients (MFCCs) - - Args: - x (np.ndarray): Input waveform in one dimension. - sr (int, optional): Sample rate. Defaults to 16000. - spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None. - n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20. - dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2. - norm (str, optional): Type of normalization. Defaults to "ortho". - lifter (int, optional): Cepstral filtering. Defaults to 0. - - Returns: - np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`. - """ - if spect is None: - spect = melspectrogram(x, sr=sr, **kwargs) - - M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc] - - if lifter > 0: - factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) / - lifter) - return M * factor[:, np.newaxis] - elif lifter == 0: - return M - else: - raise ParameterError( - f"MFCC lifter={lifter} must be a non-negative number") - - -def melspectrogram(x: np.ndarray, - sr: int=16000, - window_size: int=512, - hop_length: int=320, - n_mels: int=64, - fmin: float=50.0, - fmax: Optional[float]=None, - window: str='hann', - center: bool=True, - pad_mode: str='reflect', - power: float=2.0, - to_db: bool=True, - ref: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None) -> np.ndarray: - """Compute mel-spectrogram. - - Args: - x (np.ndarray): Input waveform in one dimension. - sr (int, optional): Sample rate. Defaults to 16000. - window_size (int, optional): Size of FFT and window length. Defaults to 512. - hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. - n_mels (int, optional): Number of mel bins. Defaults to 64. - fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0. - fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. - window (str, optional): A string of window specification. Defaults to "hann". - center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. - pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". - power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. - to_db (bool, optional): Enable db scale. Defaults to True. - ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. - amin (float, optional): Minimum threshold. Defaults to 1e-10. - top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. 
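A hedged sketch of the stft/power_to_db pair above on a synthetic tone; the melspectrogram defined next wires these together with compute_fbank_matrix:

    import numpy as np

    sr = 16000
    t = np.arange(sr, dtype=np.float32) / sr
    tone = np.sin(2 * np.pi * 1000 * t)

    s = stft(tone, n_fft=512, hop_length=160)    # complex64, (257, num_frames)
    db = power_to_db(np.abs(s)**2, top_db=80.0)  # log-power spectrogram in dB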
-
-    Returns:
-        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
-    """
-    _check_audio(x, mono=True)
-    if len(x) <= 0:
-        raise ParameterError('The input waveform is empty')
-
-    if fmax is None:
-        fmax = sr // 2
-    if fmin < 0 or fmin >= fmax:
-        raise ParameterError('fmin and fmax must satisfy 0 <= fmin < fmax')
-
-    s = stft(
-        x,
-        n_fft=window_size,
-        hop_length=hop_length,
-        win_length=window_size,
-        window=window,
-        center=center,
-        pad_mode=pad_mode)
-
-    spect = np.abs(s)**power
-    fb_matrix = compute_fbank_matrix(
-        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
-    mel_spect = np.matmul(fb_matrix, spect)
-    if to_db:
-        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
-    else:
-        return mel_spect
-
-
-def spectrogram(x: np.ndarray,
-                sr: int=16000,
-                window_size: int=512,
-                hop_length: int=320,
-                window: str='hann',
-                center: bool=True,
-                pad_mode: str='reflect',
-                power: float=2.0) -> np.ndarray:
-    """Compute spectrogram.
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        sr (int, optional): Sample rate. Defaults to 16000.
-        window_size (int, optional): Size of FFT and window length. Defaults to 512.
-        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
-        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `x[t * hop_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-
-    Returns:
-        np.ndarray: The STFT spectrogram in power scale with shape `(n_fft//2 + 1, num_frames)`.
-    """
-
-    s = stft(
-        x,
-        n_fft=window_size,
-        hop_length=hop_length,
-        win_length=window_size,
-        window=window,
-        center=center,
-        pad_mode=pad_mode)
-
-    return np.abs(s)**power
-
-
-def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
-    """Mu-law encoding. Encode waveform based on mu-law companding. When `quantized` is True, the result will be converted to integers in range `[0,mu-1]`. Otherwise, the resulting waveform is in range `[-1,1]`.
-
-    Args:
-        x (np.ndarray): The input waveform to encode.
-        mu (int, optional): The encoding parameter. Defaults to 255.
-        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.
-
-    Returns:
-        np.ndarray: The mu-law encoded waveform.
-    """
-    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
-    if quantized:
-        y = np.floor((y + 1) / 2 * mu + 0.5)  # convert to [0, mu-1]
-    return y
-
-
-def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
-    """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu-1]` when `quantized` is True and `[-1,1]` otherwise.
-
-    Args:
-        y (np.ndarray): The encoded waveform.
-        mu (int, optional): The encoding parameter. Defaults to 255.
-        quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.
-
-    Returns:
-        np.ndarray: The mu-law decoded waveform.
-    """
-    if mu < 1:
-        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
-
-    mu = mu - 1
-    if quantized:  # undo the quantization
-        y = y * 2 / mu - 1
-    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
-    return x
-
-
-def _randint(high: int) -> int:
-    """Generate one random integer in range [0, high).
-
-    This is a helper function for random data augmentation.
-    """
-    return int(np.random.randint(0, high=high))
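A minimal usage sketch for the feature and mu-law helpers above. It assumes these functions are importable as `paddlespeech.audio.compliance.librosa`; the synthetic sine wave is only a stand-in for real audio.

    import numpy as np

    from paddlespeech.audio.compliance.librosa import (melspectrogram, mfcc,
                                                       mu_decode, mu_encode)

    sr = 16000
    t = np.linspace(0, 1.0, sr, endpoint=False, dtype=np.float32)
    x = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # one second of a 440 Hz tone

    mel = melspectrogram(x, sr=sr, n_mels=64, to_db=True)  # (64, num_frames)
    coef = mfcc(x, sr=sr, n_mfcc=20)                       # (20, num_frames)

    # Mu-law round trip: encode to integer codes, then decode back.
    codes = mu_encode(x, mu=255, quantized=True)
    x_hat = mu_decode(codes, mu=255, quantized=True)
    print(mel.shape, coef.shape, float(np.abs(x - x_hat).max()))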
-
-
-def depth_augment(y: np.ndarray,
-                  choices: List=['int8', 'int16'],
-                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
-    """Audio bit-depth augmentation. Convert the waveform to a randomly chosen bit depth and back, to simulate the distortion introduced by quantization.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        choices (List, optional): Candidate data types for the depth conversion. Defaults to ['int8', 'int16'].
-        probs (List[float], optional): Sampling probabilities of the candidate data types. Defaults to [0.5, 0.5].
-
-    Returns:
-        np.ndarray: The augmented waveform.
-    """
-    assert len(probs) == len(
-        choices
-    ), 'number of choices {} must be equal to size of probs {}'.format(
-        len(choices), len(probs))
-    depth = np.random.choice(choices, p=probs)
-    src_depth = y.dtype
-    y1 = depth_convert(y, depth)
-    y2 = depth_convert(y1, src_depth)
-
-    return y2
-
-
-def adaptive_spect_augment(spect: np.ndarray,
-                           tempo_axis: int=0,
-                           level: float=0.1) -> np.ndarray:
-    """Do adaptive spectrogram augmentation. The strength of the augmentation is governed by the parameter `level`, ranging from 0 to 1, where 0 means no augmentation.
-
-    Args:
-        spect (np.ndarray): Input spectrogram.
-        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
-        level (float, optional): The level factor of masking. Defaults to 0.1.
-
-    Returns:
-        np.ndarray: The augmented spectrogram.
-    """
-    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
-    if tempo_axis == 0:
-        nt, nf = spect.shape
-    else:
-        nf, nt = spect.shape
-
-    time_mask_width = int(nt * level * 0.5)
-    freq_mask_width = int(nf * level * 0.5)
-
-    num_time_mask = int(10 * level)
-    num_freq_mask = int(10 * level)
-
-    if tempo_axis == 0:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[start:start + time_mask_width, :] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[:, start:start + freq_mask_width] = 0
-    else:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[:, start:start + time_mask_width] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[start:start + freq_mask_width, :] = 0
-
-    return spect
-
-
-def spect_augment(spect: np.ndarray,
-                  tempo_axis: int=0,
-                  max_time_mask: int=3,
-                  max_freq_mask: int=3,
-                  max_time_mask_width: int=30,
-                  max_freq_mask_width: int=20) -> np.ndarray:
-    """Do spectrogram augmentation along both the time and frequency axes.
-
-    Args:
-        spect (np.ndarray): Input spectrogram.
-        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
-        max_time_mask (int, optional): Maximum number of time masks. Defaults to 3.
-        max_freq_mask (int, optional): Maximum number of frequency masks. Defaults to 3.
-        max_time_mask_width (int, optional): Maximum width of each time mask. Defaults to 30.
-        max_freq_mask_width (int, optional): Maximum width of each frequency mask. Defaults to 20.
-
-    Returns:
-        np.ndarray: The augmented spectrogram.
-    """
-    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
-    if tempo_axis == 0:
-        nt, nf = spect.shape
-    else:
-        nf, nt = spect.shape
-
-    num_time_mask = _randint(max_time_mask)
-    num_freq_mask = _randint(max_freq_mask)
-
-    time_mask_width = _randint(max_time_mask_width)
-    freq_mask_width = _randint(max_freq_mask_width)
-
-    if tempo_axis == 0:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[start:start + time_mask_width, :] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[:, start:start + freq_mask_width] = 0
-    else:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[:, start:start + time_mask_width] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[start:start + freq_mask_width, :] = 0
-
-    return spect
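A small sketch of the masking helper above (same import-path assumption; the random matrix stands in for a real spectrogram, and `spect_augment` modifies its input in place, so a copy is passed):

    import numpy as np

    from paddlespeech.audio.compliance.librosa import spect_augment

    spect = np.random.rand(100, 64).astype(np.float32)  # (time, freq), tempo_axis=0
    masked = spect_augment(spect.copy(), tempo_axis=0,
                           max_time_mask=2, max_freq_mask=2,
                           max_time_mask_width=10, max_freq_mask_width=8)
    print(int((masked == 0).sum()), "bins zeroed")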
-
-
-def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
-    """Random cropping of an input waveform.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D.
-        crop_len (int): Length of waveform to crop.
-
-    Returns:
-        np.ndarray: The cropped waveform.
-    """
-    if y.ndim != 1:
-        raise ParameterError('only accepts a 1d tensor or numpy array')
-    n = len(y)
-    idx = _randint(n - crop_len)
-    return y[idx:idx + crop_len]
-
-
-def random_crop2d(s: np.ndarray, crop_len: int,
-                  tempo_axis: int=0) -> np.ndarray:
-    """Random cropping on a spectrogram.
-
-    Args:
-        s (np.ndarray): Input spectrogram in 2D.
-        crop_len (int): Length of spectrogram to crop.
-        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
-
-    Returns:
-        np.ndarray: The cropped spectrogram.
-    """
-    if tempo_axis >= s.ndim:
-        raise ParameterError('axis out of range')
-
-    n = s.shape[tempo_axis]
-    idx = _randint(high=n - crop_len)
-    sli = [slice(None) for _ in range(s.ndim)]
-    sli[tempo_axis] = slice(idx, idx + crop_len)
-    out = s[tuple(sli)]
-    return out
diff --git a/paddlespeech/audio/datasets/__init__.py b/paddlespeech/audio/datasets/__init__.py
deleted file mode 100644
index f95fad305..000000000
--- a/paddlespeech/audio/datasets/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .esc50 import ESC50
-from .gtzan import GTZAN
-from .hey_snips import HeySnips
-from .rirs_noises import OpenRIRNoise
-from .tess import TESS
-from .urban_sound import UrbanSound8K
-from .voxceleb import VoxCeleb
diff --git a/paddlespeech/audio/datasets/dataset.py b/paddlespeech/audio/datasets/dataset.py
deleted file mode 100644
index 81e6bdf5e..000000000
--- a/paddlespeech/audio/datasets/dataset.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import List

-import numpy as np
-import paddle
-import paddlespeech.audio  # needed by the paddlespeech.audio.load() calls below
-
-from ..compliance.kaldi import fbank as kaldi_fbank
-from ..compliance.kaldi import mfcc as kaldi_mfcc
-from ..compliance.librosa import melspectrogram
-from ..compliance.librosa import mfcc
-
-feat_funcs = {
-    'raw': None,
-    'melspectrogram': melspectrogram,
-    'mfcc': mfcc,
-    'kaldi_fbank': kaldi_fbank,
-    'kaldi_mfcc': kaldi_mfcc,
-}
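# The table above lets subclasses select a feature extractor by name; 'raw'
# maps to None, which means the waveform is returned untouched. A hypothetical
# lookup (assuming a 1-D numpy waveform) looks like this:
#
#     feat_func = feat_funcs['melspectrogram']
#     feat = feat_func(waveform, sr=16000) if feat_func else waveform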
-
-
-class AudioClassificationDataset(paddle.io.Dataset):
-    """
-    Base class of audio classification dataset.
-    """
-
-    def __init__(self,
-                 files: List[str],
-                 labels: List[int],
-                 feat_type: str='raw',
-                 sample_rate: int=None,
-                 **kwargs):
-        """
-        Args:
-            files (:obj:`List[str]`): A list of absolute paths of audio files.
-            labels (:obj:`List[int]`): Labels of audio files.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                The feature type to extract from an audio file.
-        """
-        super(AudioClassificationDataset, self).__init__()
-
-        if feat_type not in feat_funcs.keys():
-            raise RuntimeError(
-                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
-            )
-
-        self.files = files
-        self.labels = labels
-
-        self.feat_type = feat_type
-        self.sample_rate = sample_rate
-        self.feat_config = kwargs  # Pass keyword arguments to customize feature config
-
-    def _get_data(self, input_file: str):
-        raise NotImplementedError
-
-    def _convert_to_record(self, idx):
-        file, label = self.files[idx], self.labels[idx]
-
-        if self.sample_rate is None:
-            waveform, sample_rate = paddlespeech.audio.load(file)
-        else:
-            waveform, sample_rate = paddlespeech.audio.load(
-                file, sr=self.sample_rate)
-
-        feat_func = feat_funcs[self.feat_type]
-
-        record = {}
-        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
-            waveform = paddle.to_tensor(waveform).unsqueeze(0)  # (C, T)
-            record['feat'] = feat_func(
-                waveform=waveform, sr=self.sample_rate, **self.feat_config)
-        else:
-            record['feat'] = feat_func(
-                waveform, sample_rate,
-                **self.feat_config) if feat_func else waveform
-        record['label'] = label
-        return record
-
-    def __getitem__(self, idx):
-        record = self._convert_to_record(idx)
-        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
-            return self.keys[idx], record['feat'], record['label']
-        else:
-            return np.array(record['feat']).transpose(), np.array(
-                record['label'], dtype=np.int64)
-
-    def __len__(self):
-        return len(self.files)
diff --git a/paddlespeech/audio/datasets/esc50.py b/paddlespeech/audio/datasets/esc50.py
deleted file mode 100644
index f5c7050f3..000000000
--- a/paddlespeech/audio/datasets/esc50.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import os
-from typing import List
-from typing import Tuple
-
-from ..utils import DATA_HOME
-from ..utils.download import download_and_decompress
-from .dataset import AudioClassificationDataset
-
-__all__ = ['ESC50']
-
-
-class ESC50(AudioClassificationDataset):
-    """
-    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
-    suitable for benchmarking methods of environmental sound classification. The dataset
The dataset - consists of 5-second-long recordings organized into 50 semantical classes (with - 40 examples per class) - - Reference: - ESC: Dataset for Environmental Sound Classification - http://dx.doi.org/10.1145/2733373.2806390 - """ - - archieves = [ - { - 'url': - 'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip', - 'md5': '7771e4b9d86d0945acce719c7a59305a', - }, - ] - label_list = [ - # Animals - 'Dog', - 'Rooster', - 'Pig', - 'Cow', - 'Frog', - 'Cat', - 'Hen', - 'Insects (flying)', - 'Sheep', - 'Crow', - # Natural soundscapes & water sounds - 'Rain', - 'Sea waves', - 'Crackling fire', - 'Crickets', - 'Chirping birds', - 'Water drops', - 'Wind', - 'Pouring water', - 'Toilet flush', - 'Thunderstorm', - # Human, non-speech sounds - 'Crying baby', - 'Sneezing', - 'Clapping', - 'Breathing', - 'Coughing', - 'Footsteps', - 'Laughing', - 'Brushing teeth', - 'Snoring', - 'Drinking, sipping', - # Interior/domestic sounds - 'Door knock', - 'Mouse click', - 'Keyboard typing', - 'Door, wood creaks', - 'Can opening', - 'Washing machine', - 'Vacuum cleaner', - 'Clock alarm', - 'Clock tick', - 'Glass breaking', - # Exterior/urban noises - 'Helicopter', - 'Chainsaw', - 'Siren', - 'Car horn', - 'Engine', - 'Train', - 'Church bells', - 'Airplane', - 'Fireworks', - 'Hand saw', - ] - meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv') - meta_info = collections.namedtuple( - 'META_INFO', - ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take')) - audio_path = os.path.join('ESC-50-master', 'audio') - - def __init__(self, - mode: str='train', - split: int=1, - feat_type: str='raw', - **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - split (:obj:`int`, `optional`, defaults to 1): - It specify the fold of dev dataset. - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - files, labels = self._get_data(mode, split) - super(ESC50, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self) -> List[collections.namedtuple]: - ret = [] - with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: - for line in rf.readlines()[1:]: - ret.append(self.meta_info(*line.strip().split(','))) - return ret - - def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, DATA_HOME) - - meta_info = self._get_meta_info() - - files = [] - labels = [] - for sample in meta_info: - filename, fold, target, _, _, _, _ = sample - if mode == 'train' and int(fold) != split: - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - if mode != 'train' and int(fold) == split: - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - return files, labels diff --git a/paddlespeech/audio/datasets/gtzan.py b/paddlespeech/audio/datasets/gtzan.py deleted file mode 100644 index 1f6835a5a..000000000 --- a/paddlespeech/audio/datasets/gtzan.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import os
-import random
-from typing import List
-from typing import Tuple
-
-from ..utils import DATA_HOME
-from ..utils.download import download_and_decompress
-from .dataset import AudioClassificationDataset
-
-__all__ = ['GTZAN']
-
-
-class GTZAN(AudioClassificationDataset):
-    """
-    The GTZAN dataset consists of 1000 audio tracks, each 30 seconds long. It contains 10 genres,
-    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
-    in machine listening research for music genre recognition (MGR).
-
-    Reference:
-        Musical genre classification of audio signals
-        https://ieeexplore.ieee.org/document/1021072/
-    """
-
-    archieves = [
-        {
-            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
-            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
-        },
-    ]
-    label_list = [
-        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
-        'pop', 'reggae', 'rock'
-    ]
-    meta = os.path.join('genres', 'input.mf')
-    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
-    audio_path = 'genres'
-
-    def __init__(self,
-                 mode='train',
-                 seed=0,
-                 n_folds=5,
-                 split=1,
-                 feat_type='raw',
-                 **kwargs):
-        """
-        Args:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            seed (:obj:`int`, `optional`, defaults to 0):
-                Set the random seed to shuffle samples.
-            n_folds (:obj:`int`, `optional`, defaults to 5):
-                Split the dataset into n folds, with 1 fold for the dev dataset and n-1 for the train dataset.
-            split (:obj:`int`, `optional`, defaults to 1):
-                It specifies the fold used as the dev dataset.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                The feature type to extract from an audio file.
- """ - assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' - files, labels = self._get_data(mode, seed, n_folds, split) - super(GTZAN, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self) -> List[collections.namedtuple]: - ret = [] - with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: - for line in rf.readlines(): - ret.append(self.meta_info(*line.strip().split('\t'))) - return ret - - def _get_data(self, mode, seed, n_folds, - split) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, DATA_HOME) - - meta_info = self._get_meta_info() - random.seed(seed) # shuffle samples to split data - random.shuffle( - meta_info - ) # make sure using the same seed to create train and dev dataset - - files = [] - labels = [] - n_samples_per_fold = len(meta_info) // n_folds - for idx, sample in enumerate(meta_info): - file_path, label = sample - filename = os.path.basename(file_path) - target = self.label_list.index(label) - fold = idx // n_samples_per_fold + 1 - - if mode == 'train' and int(fold) != split: - files.append( - os.path.join(DATA_HOME, self.audio_path, label, filename)) - labels.append(target) - - if mode != 'train' and int(fold) == split: - files.append( - os.path.join(DATA_HOME, self.audio_path, label, filename)) - labels.append(target) - - return files, labels diff --git a/paddlespeech/audio/datasets/hey_snips.py b/paddlespeech/audio/datasets/hey_snips.py deleted file mode 100644 index 7a67b843b..000000000 --- a/paddlespeech/audio/datasets/hey_snips.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import collections -import json -import os -from typing import List -from typing import Tuple - -from .dataset import AudioClassificationDataset - -__all__ = ['HeySnips'] - - -class HeySnips(AudioClassificationDataset): - meta_info = collections.namedtuple('META_INFO', - ('key', 'label', 'duration', 'wav')) - - def __init__(self, - data_dir: os.PathLike, - mode: str='train', - feat_type: str='kaldi_fbank', - sample_rate: int=16000, - **kwargs): - self.data_dir = data_dir - files, labels = self._get_data(mode) - super(HeySnips, self).__init__( - files=files, - labels=labels, - feat_type=feat_type, - sample_rate=sample_rate, - **kwargs) - - def _get_meta_info(self, mode) -> List[collections.namedtuple]: - ret = [] - with open(os.path.join(self.data_dir, '{}.json'.format(mode)), - 'r') as f: - data = json.load(f) - for item in data: - sample = collections.OrderedDict() - if item['duration'] > 0: - sample['key'] = item['id'] - sample['label'] = 0 if item['is_hotword'] == 1 else -1 - sample['duration'] = item['duration'] - sample['wav'] = os.path.join(self.data_dir, - item['audio_file_path']) - ret.append(self.meta_info(*sample.values())) - return ret - - def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: - meta_info = self._get_meta_info(mode) - - files = [] - labels = [] - self.keys = [] - self.durations = [] - for sample in meta_info: - key, target, duration, wav = sample - files.append(wav) - labels.append(int(target)) - self.keys.append(key) - self.durations.append(float(duration)) - - return files, labels diff --git a/paddlespeech/audio/datasets/rirs_noises.py b/paddlespeech/audio/datasets/rirs_noises.py deleted file mode 100644 index 61bbf72a2..000000000 --- a/paddlespeech/audio/datasets/rirs_noises.py +++ /dev/null @@ -1,200 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
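A hypothetical usage sketch for the HeySnips reader above; 'path/to/hey_snips' stands in for a real data_dir containing the train/dev/test json files read by _get_meta_info, and labels follow the convention in _get_meta_info (0 for the hotword, -1 for negatives):

    train_ds = HeySnips(data_dir='path/to/hey_snips', mode='train',
                        feat_type='kaldi_fbank', sample_rate=16000)
    key, feat, label = train_ds[0]  # kaldi feat types also return the utterance key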
-import collections -import csv -import os -import random -from typing import List - -from paddle.io import Dataset -from tqdm import tqdm - -from ..utils import DATA_HOME -from ..utils.download import download_and_decompress -from .dataset import feat_funcs - -__all__ = ['OpenRIRNoise'] - - -class OpenRIRNoise(Dataset): - archieves = [ - { - 'url': 'http://www.openslr.org/resources/28/rirs_noises.zip', - 'md5': 'e6f48e257286e05de56413b4779d8ffb', - }, - ] - - sample_rate = 16000 - meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav')) - base_path = os.path.join(DATA_HOME, 'open_rir_noise') - wav_path = os.path.join(base_path, 'RIRS_NOISES') - csv_path = os.path.join(base_path, 'csv') - subsets = ['rir', 'noise'] - - def __init__(self, - subset: str='rir', - feat_type: str='raw', - target_dir=None, - random_chunk: bool=True, - chunk_duration: float=3.0, - seed: int=0, - **kwargs): - - assert subset in self.subsets, \ - 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) - - self.subset = subset - self.feat_type = feat_type - self.feat_config = kwargs - self.random_chunk = random_chunk - self.chunk_duration = chunk_duration - - OpenRIRNoise.csv_path = os.path.join( - target_dir, "open_rir_noise", - "csv") if target_dir else self.csv_path - self._data = self._get_data() - super(OpenRIRNoise, self).__init__() - - # Set up a seed to reproduce training or predicting result. - # random.seed(seed) - - def _get_data(self): - # Download audio files. - print(f"rirs noises base path: {self.base_path}") - if not os.path.isdir(self.base_path): - download_and_decompress( - self.archieves, self.base_path, decompress=True) - else: - print( - f"{self.base_path} already exists, we will not download and decompress again" - ) - - # Data preparation. - print(f"prepare the csv to {self.csv_path}") - if not os.path.isdir(self.csv_path): - os.makedirs(self.csv_path) - self.prepare_data() - - data = [] - with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: - for line in rf.readlines()[1:]: - audio_id, duration, wav = line.strip().split(',') - data.append(self.meta_info(audio_id, float(duration), wav)) - - random.shuffle(data) - return data - - def _convert_to_record(self, idx: int): - sample = self._data[idx] - - record = {} - # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = getattr(sample, field) - - waveform, sr = paddlespeech.audio.load(record['wav']) - - assert self.feat_type in feat_funcs.keys(), \ - f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sr=sr, **self.feat_config) if feat_func else waveform - - record.update({'feat': feat}) - return record - - @staticmethod - def _get_chunks(seg_dur, audio_id, audio_duration): - num_chunks = int(audio_duration / seg_dur) # all in milliseconds - - chunk_lst = [ - audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) - for i in range(num_chunks) - ] - return chunk_lst - - def _get_audio_info(self, wav_file: str, - split_chunks: bool) -> List[List[str]]: - waveform, sr = paddlespeech.audio.load(wav_file) - audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0] - audio_duration = waveform.shape[0] / sr - - ret = [] - if split_chunks and audio_duration > self.chunk_duration: # Split into pieces of self.chunk_duration seconds. 
- uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, - audio_duration) - - for idx, chunk in enumerate(uniq_chunks_list): - s, e = chunk.split("_")[-2:] # Timestamps of start and end - start_sample = int(float(s) * sr) - end_sample = int(float(e) * sr) - new_wav_file = os.path.join(self.base_path, - audio_id + f'_chunk_{idx+1:02}.wav') - paddlespeech.audio.save(waveform[start_sample:end_sample], sr, - new_wav_file) - # id, duration, new_wav - ret.append([chunk, self.chunk_duration, new_wav_file]) - else: # Keep whole audio. - ret.append([audio_id, audio_duration, wav_file]) - return ret - - def generate_csv(self, - wav_files: List[str], - output_file: str, - split_chunks: bool=True): - print(f'Generating csv: {output_file}') - header = ["id", "duration", "wav"] - - infos = list( - tqdm( - map(self._get_audio_info, wav_files, [split_chunks] * len( - wav_files)), - total=len(wav_files))) - - csv_lines = [] - for info in infos: - csv_lines.extend(info) - - with open(output_file, mode="w") as csv_f: - csv_writer = csv.writer( - csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) - csv_writer.writerow(header) - for line in csv_lines: - csv_writer.writerow(line) - - def prepare_data(self): - rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises", - "rir_list") - rir_files = [] - with open(rir_list, 'r') as f: - for line in f.readlines(): - rir_file = line.strip().split(' ')[-1] - rir_files.append(os.path.join(self.base_path, rir_file)) - - noise_list = os.path.join(self.wav_path, "pointsource_noises", - "noise_list") - noise_files = [] - with open(noise_list, 'r') as f: - for line in f.readlines(): - noise_file = line.strip().split(' ')[-1] - noise_files.append(os.path.join(self.base_path, noise_file)) - - self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv')) - self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv')) - - def __getitem__(self, idx): - return self._convert_to_record(idx) - - def __len__(self): - return len(self._data) diff --git a/paddlespeech/audio/datasets/tess.py b/paddlespeech/audio/datasets/tess.py deleted file mode 100644 index 1469fa5e2..000000000 --- a/paddlespeech/audio/datasets/tess.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -import random -from typing import List -from typing import Tuple - -from ..utils import DATA_HOME -from ..utils.download import download_and_decompress -from .dataset import AudioClassificationDataset - -__all__ = ['TESS'] - - -class TESS(AudioClassificationDataset): - """ - TESS is a set of 200 target words were spoken in the carrier phrase - "Say the word _____' by two actresses (aged 26 and 64 years) and - recordings were made of the set portraying each of seven emotions(anger, - disgust, fear, happiness, pleasant surprise, sadness, and neutral). - There are 2800 stimuli in total. 
- - Reference: - Toronto emotional speech set (TESS) - https://doi.org/10.5683/SP2/E8H2MF - """ - - archieves = [ - { - 'url': - 'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip', - 'md5': - '1465311b24d1de704c4c63e4ccc470c7', - }, - ] - label_list = [ - 'angry', - 'disgust', - 'fear', - 'happy', - 'neutral', - 'ps', # pleasant surprise - 'sad', - ] - meta_info = collections.namedtuple('META_INFO', - ('speaker', 'word', 'emotion')) - audio_path = 'TESS_Toronto_emotional_speech_set' - - def __init__(self, - mode='train', - seed=0, - n_folds=5, - split=1, - feat_type='raw', - **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - seed (:obj:`int`, `optional`, defaults to 0): - Set the random seed to shuffle samples. - n_folds (:obj:`int`, `optional`, defaults to 5): - Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset. - split (:obj:`int`, `optional`, defaults to 1): - It specify the fold of dev dataset. - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' - files, labels = self._get_data(mode, seed, n_folds, split) - super(TESS, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self, files) -> List[collections.namedtuple]: - ret = [] - for file in files: - basename_without_extend = os.path.basename(file)[:-4] - ret.append(self.meta_info(*basename_without_extend.split('_'))) - return ret - - def _get_data(self, mode, seed, n_folds, - split) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)): - download_and_decompress(self.archieves, DATA_HOME) - - wav_files = [] - for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)): - for file in files: - if file.endswith('.wav'): - wav_files.append(os.path.join(root, file)) - - random.seed(seed) # shuffle samples to split data - random.shuffle( - wav_files - ) # make sure using the same seed to create train and dev dataset - meta_info = self._get_meta_info(wav_files) - - files = [] - labels = [] - n_samples_per_fold = len(meta_info) // n_folds - for idx, sample in enumerate(meta_info): - _, _, emotion = sample - target = self.label_list.index(emotion) - fold = idx // n_samples_per_fold + 1 - - if mode == 'train' and int(fold) != split: - files.append(wav_files[idx]) - labels.append(target) - - if mode != 'train' and int(fold) == split: - files.append(wav_files[idx]) - labels.append(target) - - return files, labels diff --git a/paddlespeech/audio/datasets/urban_sound.py b/paddlespeech/audio/datasets/urban_sound.py deleted file mode 100644 index 0389cd5f9..000000000 --- a/paddlespeech/audio/datasets/urban_sound.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -from typing import List -from typing import Tuple - -from ..utils import DATA_HOME -from ..utils.download import download_and_decompress -from .dataset import AudioClassificationDataset - -__all__ = ['UrbanSound8K'] - - -class UrbanSound8K(AudioClassificationDataset): - """ - UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban - sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark, - drilling, enginge_idling, gun_shot, jackhammer, siren, and street_music. The - classes are drawn from the urban sound taxonomy. - - Reference: - A Dataset and Taxonomy for Urban Sound Research - https://dl.acm.org/doi/10.1145/2647868.2655045 - """ - - archieves = [ - { - 'url': - 'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz', - 'md5': '9aa69802bbf37fb986f71ec1483a196e', - }, - ] - label_list = [ - "air_conditioner", "car_horn", "children_playing", "dog_bark", - "drilling", "engine_idling", "gun_shot", "jackhammer", "siren", - "street_music" - ] - meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv') - meta_info = collections.namedtuple( - 'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold', - 'class_id', 'label')) - audio_path = os.path.join('UrbanSound8K', 'audio') - - def __init__(self, - mode: str='train', - split: int=1, - feat_type: str='raw', - **kwargs): - files, labels = self._get_data(mode, split) - super(UrbanSound8K, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - split (:obj:`int`, `optional`, defaults to 1): - It specify the fold of dev dataset. - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - - def _get_meta_info(self): - ret = [] - with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: - for line in rf.readlines()[1:]: - ret.append(self.meta_info(*line.strip().split(','))) - return ret - - def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, DATA_HOME) - - meta_info = self._get_meta_info() - - files = [] - labels = [] - for sample in meta_info: - filename, _, _, _, _, fold, target, _ = sample - if mode == 'train' and int(fold) != split: - files.append( - os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', - filename)) - labels.append(int(target)) - - if mode != 'train' and int(fold) == split: - files.append( - os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', - filename)) - labels.append(int(target)) - - return files, labels diff --git a/paddlespeech/audio/datasets/voxceleb.py b/paddlespeech/audio/datasets/voxceleb.py deleted file mode 100644 index e1a8aa38b..000000000 --- a/paddlespeech/audio/datasets/voxceleb.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import csv -import glob -import os -import random -from multiprocessing import cpu_count -from typing import List - -from paddle.io import Dataset -from pathos.multiprocessing import Pool -from tqdm import tqdm - -from ..utils import DATA_HOME -from ..utils import decompress -from ..utils.download import download_and_decompress -from .dataset import feat_funcs - -__all__ = ['VoxCeleb'] - - -class VoxCeleb(Dataset): - source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/' - archieves_audio_dev = [ - { - 'url': source_url + 'vox1_dev_wav_partaa', - 'md5': 'e395d020928bc15670b570a21695ed96', - }, - { - 'url': source_url + 'vox1_dev_wav_partab', - 'md5': 'bbfaaccefab65d82b21903e81a8a8020', - }, - { - 'url': source_url + 'vox1_dev_wav_partac', - 'md5': '017d579a2a96a077f40042ec33e51512', - }, - { - 'url': source_url + 'vox1_dev_wav_partad', - 'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19', - }, - ] - archieves_audio_test = [ - { - 'url': source_url + 'vox1_test_wav.zip', - 'md5': '185fdc63c3c739954633d50379a3d102', - }, - ] - archieves_meta = [ - { - 'url': - 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', - 'md5': - 'b73110731c9223c1461fe49cb48dddfc', - }, - ] - - num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 - sample_rate = 16000 - meta_info = collections.namedtuple( - 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) - base_path = os.path.join(DATA_HOME, 'vox1') - wav_path = os.path.join(base_path, 'wav') - meta_path = os.path.join(base_path, 'meta') - veri_test_file = os.path.join(meta_path, 'veri_test2.txt') - csv_path = os.path.join(base_path, 'csv') - subsets = ['train', 'dev', 'enroll', 'test'] - - def __init__( - self, - subset: str='train', - feat_type: str='raw', - random_chunk: bool=True, - chunk_duration: float=3.0, # seconds - split_ratio: float=0.9, # train split ratio - seed: int=0, - target_dir: str=None, - vox2_base_path=None, - **kwargs): - """VoxCeleb data prepare and get the specific dataset audio info - - Args: - subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'. - feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'. - random_chunk (bool, optional): random select a duration from audio. Defaults to True. - chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0. - target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None. - vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None. 
- """ - assert subset in self.subsets, \ - 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) - - self.subset = subset - self.spk_id2label = {} - self.feat_type = feat_type - self.feat_config = kwargs - self.random_chunk = random_chunk - self.chunk_duration = chunk_duration - self.split_ratio = split_ratio - self.target_dir = target_dir if target_dir else VoxCeleb.base_path - self.vox2_base_path = vox2_base_path - - # if we set the target dir, we will change the vox data info data from base path to target dir - VoxCeleb.csv_path = os.path.join( - target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path - VoxCeleb.meta_path = os.path.join( - target_dir, "voxceleb", - 'meta') if target_dir else VoxCeleb.meta_path - VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path, - 'veri_test2.txt') - # self._data = self._get_data()[:1000] # KP: Small dataset test. - self._data = self._get_data() - super(VoxCeleb, self).__init__() - - # Set up a seed to reproduce training or predicting result. - # random.seed(seed) - - def _get_data(self): - # Download audio files. - # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir - # so, we check the vox1/wav dir status - print(f"wav base path: {self.wav_path}") - if not os.path.isdir(self.wav_path): - print("start to download the voxceleb1 dataset") - download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip - self.archieves_audio_dev, - self.base_path, - decompress=False) - download_and_decompress( # download the vox1_test_wav.zip and unzip - self.archieves_audio_test, - self.base_path, - decompress=True) - - # Download all parts and concatenate the files into one zip file. - dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') - print(f'Concatenating all parts to: {dev_zipfile}') - os.system( - f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' - ) - - # Extract all audio files of dev and test set. - decompress(dev_zipfile, self.base_path) - - # Download meta files. - if not os.path.isdir(self.meta_path): - print("prepare the meta data") - download_and_decompress( - self.archieves_meta, self.meta_path, decompress=False) - - # Data preparation. 
- if not os.path.isdir(self.csv_path): - os.makedirs(self.csv_path) - self.prepare_data() - - data = [] - print( - f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}" - ) - with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: - for line in rf.readlines()[1:]: - audio_id, duration, wav, start, stop, spk_id = line.strip( - ).split(',') - data.append( - self.meta_info(audio_id, - float(duration), wav, - int(start), int(stop), spk_id)) - - with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f: - for line in f.readlines(): - spk_id, label = line.strip().split(' ') - self.spk_id2label[spk_id] = int(label) - - return data - - def _convert_to_record(self, idx: int): - sample = self._data[idx] - - record = {} - # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = getattr(sample, field) - - waveform, sr = paddlespeech.audio.load(record['wav']) - - # random select a chunk audio samples from the audio - if self.random_chunk: - num_wav_samples = waveform.shape[0] - num_chunk_samples = int(self.chunk_duration * sr) - start = random.randint(0, num_wav_samples - num_chunk_samples - 1) - stop = start + num_chunk_samples - else: - start = record['start'] - stop = record['stop'] - - waveform = waveform[start:stop] - - assert self.feat_type in feat_funcs.keys(), \ - f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sr=sr, **self.feat_config) if feat_func else waveform - - record.update({'feat': feat}) - if self.subset in ['train', - 'dev']: # Labels are available in train and dev. - record.update({'label': self.spk_id2label[record['spk_id']]}) - - return record - - @staticmethod - def _get_chunks(seg_dur, audio_id, audio_duration): - num_chunks = int(audio_duration / seg_dur) # all in milliseconds - - chunk_lst = [ - audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) - for i in range(num_chunks) - ] - return chunk_lst - - def _get_audio_info(self, wav_file: str, - split_chunks: bool) -> List[List[str]]: - waveform, sr = paddlespeech.audio.load(wav_file) - spk_id, sess_id, utt_id = wav_file.split("/")[-3:] - audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) - audio_duration = waveform.shape[0] / sr - - ret = [] - if split_chunks: # Split into pieces of self.chunk_duration seconds. - uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, - audio_duration) - - for chunk in uniq_chunks_list: - s, e = chunk.split("_")[-2:] # Timestamps of start and end - start_sample = int(float(s) * sr) - end_sample = int(float(e) * sr) - # id, duration, wav, start, stop, spk_id - ret.append([ - chunk, audio_duration, wav_file, start_sample, end_sample, - spk_id - ]) - else: # Keep whole audio. 
-            ret.append([
-                audio_id, audio_duration, wav_file, 0, waveform.shape[0],
-                spk_id
-            ])
-        return ret
-
-    def generate_csv(self,
-                     wav_files: List[str],
-                     output_file: str,
-                     split_chunks: bool=True):
-        print(f'Generating csv: {output_file}')
-        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
-        # Note: this may raise a C++ exception, but the program still
-        # executes correctly, so the exception can be ignored.
-        with Pool(cpu_count()) as p:
-            infos = list(
-                tqdm(
-                    p.imap(lambda x: self._get_audio_info(x, split_chunks),
-                           wav_files),
-                    total=len(wav_files)))
-
-        csv_lines = []
-        for info in infos:
-            csv_lines.extend(info)
-
-        with open(output_file, mode="w") as csv_f:
-            csv_writer = csv.writer(
-                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
-            csv_writer.writerow(header)
-            for line in csv_lines:
-                csv_writer.writerow(line)
-
-    def prepare_data(self):
-        # Audio of speakers in veri_test_file should not be included in the training set.
-        print("start to prepare the data csv file")
-        enroll_files = set()
-        test_files = set()
-        # get the enroll and test audio file paths
-        with open(self.veri_test_file, 'r') as f:
-            for line in f.readlines():
-                _, enrol_file, test_file = line.strip().split(' ')
-                enroll_files.add(os.path.join(self.wav_path, enrol_file))
-                test_files.add(os.path.join(self.wav_path, test_file))
-        enroll_files = sorted(enroll_files)
-        test_files = sorted(test_files)
-
-        # get the enroll and test speakers
-        test_spks = set()
-        for file in (enroll_files + test_files):
-            spk = file.split('/wav/')[1].split('/')[0]
-            test_spks.add(spk)
-
-        # get all the train and dev audio file paths
-        audio_files = []
-        speakers = set()
-        print("Getting file list...")
-        for path in [self.wav_path, self.vox2_base_path]:
-            # if the vox2 directory is not set or does not exist,
-            # skip it and use vox1 only
-            if not path or not os.path.exists(path):
-                print(f"{path} is an invalid path, please check again, "
-                      "and we will ignore the vox2 base path")
-                continue
-            for file in glob.glob(
-                    os.path.join(path, "**", "*.wav"), recursive=True):
-                spk = file.split('/wav/')[1].split('/')[0]
-                if spk in test_spks:
-                    continue
-                speakers.add(spk)
-                audio_files.append(file)
-
-        print(
-            f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
-        )
-        # encode the train and dev speaker labels into spk_id2label.txt
-        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
-            for label, spk_id in enumerate(
-                    sorted(speakers)):  # 1211 vox1, 5994 vox2, 7205 vox1+2
-                f.write(f'{spk_id} {label}\n')
-
-        audio_files = sorted(audio_files)
-        random.shuffle(audio_files)
-        split_idx = int(self.split_ratio * len(audio_files))
-        # split_ratio to train
-        train_files, dev_files = audio_files[:split_idx], audio_files[
-            split_idx:]
-
-        self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
-        self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))
-
-        self.generate_csv(
-            enroll_files,
-            os.path.join(self.csv_path, 'enroll.csv'),
-            split_chunks=False)
-        self.generate_csv(
-            test_files,
-            os.path.join(self.csv_path, 'test.csv'),
-            split_chunks=False)
-
-    def __getitem__(self, idx):
-        return self._convert_to_record(idx)
-
-    def __len__(self):
-        return len(self._data)
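A hypothetical end-to-end sketch for the VoxCeleb reader above (the target_dir is illustrative; the first run downloads and prepares the data, which can take a long time):

    train_ds = VoxCeleb(subset='train', feat_type='melspectrogram',
                        target_dir='/data/vox')
    record = train_ds[0]  # dict with 'feat', 'label', 'wav', 'start', 'stop', ...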
diff --git a/paddlespeech/audio/features/__init__.py b/paddlespeech/audio/features/__init__.py
deleted file mode 100644
index 00781397f..000000000
--- a/paddlespeech/audio/features/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .layers import LogMelSpectrogram
-from .layers import MelSpectrogram
-from .layers import MFCC
-from .layers import Spectrogram
diff --git a/paddlespeech/audio/features/layers.py b/paddlespeech/audio/features/layers.py
deleted file mode 100644
index 292363e64..000000000
--- a/paddlespeech/audio/features/layers.py
+++ /dev/null
@@ -1,328 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from functools import partial
-from typing import Optional
-from typing import Union
-
-import paddle
-import paddle.nn as nn
-from paddle import Tensor
-
-from ..functional import compute_fbank_matrix
-from ..functional import create_dct
-from ..functional import power_to_db
-from ..functional.window import get_window
-
-__all__ = [
-    'Spectrogram',
-    'MelSpectrogram',
-    'LogMelSpectrogram',
-    'MFCC',
-]
-
-
-class Spectrogram(nn.Layer):
-    """Compute spectrogram of given signals, typically audio waveforms.
-    The spectrogram is defined as the complex norm of the short-time Fourier transform.
-
-    Args:
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `x[t * hop_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 dtype: str='float32') -> None:
-        super(Spectrogram, self).__init__()
-
-        assert power > 0, 'Power of spectrogram must be > 0.'
-        self.power = power
-
-        if win_length is None:
-            win_length = n_fft
-
-        self.fft_window = get_window(
-            window, win_length, fftbins=True, dtype=dtype)
-        self._stft = partial(
-            paddle.signal.stft,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=self.fft_window,
-            center=center,
-            pad_mode=pad_mode)
-        self.register_buffer('fft_window', self.fft_window)
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`.
-
-        Returns:
-            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
-        """
-        stft = self._stft(x)
-        spectrogram = paddle.pow(paddle.abs(stft), self.power)
-        return spectrogram
-
-
-class MelSpectrogram(nn.Layer):
-    """Compute the mel spectrogram of given signals, typically audio waveforms. It is computed by multiplying the spectrogram by a mel filter-bank matrix.
-
-    Args:
-        sr (int, optional): Sample rate. Defaults to 22050.
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to the same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` so that the :math:`t`-th frame is centered at :math:`x[t \times hop\_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use the HTK formula in computing the fbank matrix. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization in computing the fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 sr: int=22050,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 n_mels: int=64,
-                 f_min: float=50.0,
-                 f_max: Optional[float]=None,
-                 htk: bool=False,
-                 norm: Union[str, float]='slaney',
-                 dtype: str='float32') -> None:
-        super(MelSpectrogram, self).__init__()
-
-        self._spectrogram = Spectrogram(
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=window,
-            power=power,
-            center=center,
-            pad_mode=pad_mode,
-            dtype=dtype)
-        self.n_mels = n_mels
-        self.f_min = f_min
-        self.f_max = f_max
-        self.htk = htk
-        self.norm = norm
-        if f_max is None:
-            f_max = sr // 2
-        self.fbank_matrix = compute_fbank_matrix(
-            sr=sr,
-            n_fft=n_fft,
-            n_mels=n_mels,
-            f_min=f_min,
-            f_max=f_max,
-            htk=htk,
-            norm=norm,
-            dtype=dtype)  # shape (n_mels, n_fft//2 + 1)
-        self.register_buffer('fbank_matrix', self.fbank_matrix)
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`.
-
-        Returns:
-            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
-        """
-        spect_feature = self._spectrogram(x)
-        mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
-        return mel_feature
-
-
-class LogMelSpectrogram(nn.Layer):
-    """Compute the log-mel spectrogram of given signals, typically audio waveforms.
-
-    Args:
-        sr (int, optional): Sample rate. Defaults to 22050.
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to the same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` so that the :math:`t`-th frame is centered at :math:`x[t \times hop\_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use the HTK formula in computing the fbank matrix. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization in computing the fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
-        ref_value (float, optional): The reference value. If smaller than 1.0, the dB level of the signal is raised accordingly; otherwise it is lowered. Defaults to 1.0.
-        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
-        top_db (Optional[float], optional): The maximum dB value of the spectrogram. Defaults to None.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 sr: int=22050,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 n_mels: int=64,
-                 f_min: float=50.0,
-                 f_max: Optional[float]=None,
-                 htk: bool=False,
-                 norm: Union[str, float]='slaney',
-                 ref_value: float=1.0,
-                 amin: float=1e-10,
-                 top_db: Optional[float]=None,
-                 dtype: str='float32') -> None:
-        super(LogMelSpectrogram, self).__init__()
-
-        self._melspectrogram = MelSpectrogram(
-            sr=sr,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=window,
-            power=power,
-            center=center,
-            pad_mode=pad_mode,
-            n_mels=n_mels,
-            f_min=f_min,
-            f_max=f_max,
-            htk=htk,
-            norm=norm,
-            dtype=dtype)
-
-        self.ref_value = ref_value
-        self.amin = amin
-        self.top_db = top_db
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`.
-
-        Returns:
-            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
-        """
-        mel_feature = self._melspectrogram(x)
-        log_mel_feature = power_to_db(
-            mel_feature,
-            ref_value=self.ref_value,
-            amin=self.amin,
-            top_db=self.top_db)
-        return log_mel_feature
-
-
-class MFCC(nn.Layer):
-    """Compute mel-frequency cepstral coefficient (MFCC) features of given waveforms.
-
-    Args:
-        sr (int, optional): Sample rate. Defaults to 22050.
-        n_mfcc (int, optional): Number of MFCC coefficients to retain. Defaults to 40.
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to the same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` so that the :math:`t`-th frame is centered at :math:`x[t \times hop\_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use the HTK formula in computing the fbank matrix. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization in computing the fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
-        ref_value (float, optional): The reference value. If smaller than 1.0, the dB level of the signal is raised accordingly; otherwise it is lowered. Defaults to 1.0.
-        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
-        top_db (Optional[float], optional): The maximum dB value of the spectrogram. Defaults to None.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
- """ - - def __init__(self, - sr: int=22050, - n_mfcc: int=40, - n_fft: int=512, - hop_length: Optional[int]=None, - win_length: Optional[int]=None, - window: str='hann', - power: float=2.0, - center: bool=True, - pad_mode: str='reflect', - n_mels: int=64, - f_min: float=50.0, - f_max: Optional[float]=None, - htk: bool=False, - norm: Union[str, float]='slaney', - ref_value: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None, - dtype: str=paddle.float32) -> None: - super(MFCC, self).__init__() - assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( - n_mfcc, n_mels) - self._log_melspectrogram = LogMelSpectrogram( - sr=sr, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=window, - power=power, - center=center, - pad_mode=pad_mode, - n_mels=n_mels, - f_min=f_min, - f_max=f_max, - htk=htk, - norm=norm, - ref_value=ref_value, - amin=amin, - top_db=top_db, - dtype=dtype) - self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype) - self.register_buffer('dct_matrix', self.dct_matrix) - - def forward(self, x: Tensor) -> Tensor: - """ - Args: - x (Tensor): Tensor of waveforms with shape `(N, T)` - - Returns: - Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`. - """ - log_mel_feature = self._log_melspectrogram(x) - mfcc = paddle.matmul( - log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose( - (0, 2, 1)) # (B, n_mels, L) - return mfcc diff --git a/paddlespeech/audio/functional/__init__.py b/paddlespeech/audio/functional/__init__.py deleted file mode 100644 index c85232df1..000000000 --- a/paddlespeech/audio/functional/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .functional import compute_fbank_matrix -from .functional import create_dct -from .functional import fft_frequencies -from .functional import hz_to_mel -from .functional import mel_frequencies -from .functional import mel_to_hz -from .functional import power_to_db diff --git a/paddlespeech/audio/functional/functional.py b/paddlespeech/audio/functional/functional.py deleted file mode 100644 index 19c63a9ae..000000000 --- a/paddlespeech/audio/functional/functional.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
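
For orientation, here is a minimal usage sketch of the feature layers removed above. The import path is an assumption based on the `paddleaudio.features` docs this patch adds; treat it as illustrative rather than authoritative.

```python
# Sketch (not part of the patch): exercising the feature layers shown above.
# The `paddleaudio.features` import path is assumed from the new docs added
# by this patch.
import paddle
from paddleaudio.features import LogMelSpectrogram, MFCC

waveform = paddle.randn([2, 22050])  # (N, T): two one-second 22.05 kHz signals

log_mel = LogMelSpectrogram(sr=22050, n_fft=512, n_mels=64)
mfcc = MFCC(sr=22050, n_mfcc=20, n_mels=64)

print(log_mel(waveform).shape)  # [2, 64, num_frames]
print(mfcc(waveform).shape)     # [2, 20, num_frames]
```
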
-# Modified from librosa(https://github.com/librosa/librosa) -import math -from typing import Optional -from typing import Union - -import paddle -from paddle import Tensor - -__all__ = [ - 'hz_to_mel', - 'mel_to_hz', - 'mel_frequencies', - 'fft_frequencies', - 'compute_fbank_matrix', - 'power_to_db', - 'create_dct', -] - - -def hz_to_mel(freq: Union[Tensor, float], - htk: bool=False) -> Union[Tensor, float]: - """Convert Hz to Mels. - - Args: - freq (Union[Tensor, float]): The input tensor with arbitrary shape. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - Union[Tensor, float]: Frequency in mels. - """ - - if htk: - if isinstance(freq, Tensor): - return 2595.0 * paddle.log10(1.0 + freq / 700.0) - else: - return 2595.0 * math.log10(1.0 + freq / 700.0) - - # Fill in the linear part - f_min = 0.0 - f_sp = 200.0 / 3 - - mels = (freq - f_min) / f_sp - - # Fill in the log-scale part - - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = math.log(6.4) / 27.0 # step size for log region - - if isinstance(freq, Tensor): - target = min_log_mel + paddle.log( - freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 - mask = (freq > min_log_hz).astype(freq.dtype) - mels = target * mask + mels * ( - 1 - mask) # will replace by masked_fill OP in future - else: - if freq >= min_log_hz: - mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep - - return mels - - -def mel_to_hz(mel: Union[float, Tensor], - htk: bool=False) -> Union[float, Tensor]: - """Convert mel bin numbers to frequencies. - - Args: - mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - Union[float, Tensor]: Frequencies in Hz. - """ - if htk: - return 700.0 * (10.0**(mel / 2595.0) - 1.0) - - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mel - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(mel, Tensor): - target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) - mask = (mel > min_log_mel).astype(mel.dtype) - freqs = target * mask + freqs * ( - 1 - mask) # will replace by masked_fill OP in future - else: - if mel >= min_log_mel: - freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) - - return freqs - - -def mel_frequencies(n_mels: int=64, - f_min: float=0.0, - f_max: float=11025.0, - htk: bool=False, - dtype: str='float32') -> Tensor: - """Compute mel frequencies. - - Args: - n_mels (int, optional): Number of mel bins. Defaults to 64. - f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. - fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. - htk (bool, optional): Use htk scaling. Defaults to False. - dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. - - Returns: - Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. - """ - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = hz_to_mel(f_min, htk=htk) - max_mel = hz_to_mel(f_max, htk=htk) - mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) - freqs = mel_to_hz(mels, htk=htk) - return freqs - - -def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor: - """Compute fourier frequencies. - - Args: - sr (int): Sample rate. 
- n_fft (int): Number of fft bins. - dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. - - Returns: - Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. - """ - return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) - - -def compute_fbank_matrix(sr: int, - n_fft: int, - n_mels: int=64, - f_min: float=0.0, - f_max: Optional[float]=None, - htk: bool=False, - norm: Union[str, float]='slaney', - dtype: str='float32') -> Tensor: - """Compute fbank matrix. - - Args: - sr (int): Sample rate. - n_fft (int): Number of fft bins. - n_mels (int, optional): Number of mel bins. Defaults to 64. - f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. - f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. - htk (bool, optional): Use htk scaling. Defaults to False. - norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'. - dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. - - Returns: - Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. - """ - - if f_max is None: - f_max = float(sr) / 2 - - # Initialize the weights - weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) - - # 'Center freqs' of mel bands - uniformly spaced between limits - mel_f = mel_frequencies( - n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) - - fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) - ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) - #ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = paddle.maximum( - paddle.zeros_like(lower), paddle.minimum(lower, upper)) - - # Slaney-style mel is scaled to be approx constant energy per channel - if norm == 'slaney': - enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) - weights *= enorm.unsqueeze(1) - elif isinstance(norm, int) or isinstance(norm, float): - weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) - - return weights - - -def power_to_db(spect: Tensor, - ref_value: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None) -> Tensor: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. - - Args: - spect (Tensor): STFT power spectrogram. - ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. - amin (float, optional): Minimum threshold. Defaults to 1e-10. - top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. - - Returns: - Tensor: Power spectrogram in db scale. 
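
To make the two helpers above concrete, a short sketch; the `paddleaudio.functional` import path is assumed from this patch's new docs.

```python
# Sketch (not part of the patch): mel filter bank + dB conversion.
import paddle
from paddleaudio.functional import compute_fbank_matrix, power_to_db  # assumed path

fbank = compute_fbank_matrix(sr=16000, n_fft=512, n_mels=64)
print(fbank.shape)  # [64, 257] == (n_mels, n_fft//2 + 1)

# With the default ref_value=1.0, the scaling reduces to 10*log10(max(amin, x)).
x = paddle.to_tensor([1.0, 0.5, 1e-12])
print(power_to_db(x))  # ~[0.0, -3.01, -100.0]; 1e-12 is clipped at amin=1e-10
```
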
- """ - if amin <= 0: - raise Exception("amin must be strictly positive") - - if ref_value <= 0: - raise Exception("ref_value must be strictly positive") - - ones = paddle.ones_like(spect) - log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect)) - log_spec -= 10.0 * math.log10(max(ref_value, amin)) - - if top_db is not None: - if top_db < 0: - raise Exception("top_db must be non-negative") - log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) - - return log_spec - - -def create_dct(n_mfcc: int, - n_mels: int, - norm: Optional[str]='ortho', - dtype: str='float32') -> Tensor: - """Create a discrete cosine transform(DCT) matrix. - - Args: - n_mfcc (int): Number of mel frequency cepstral coefficients. - n_mels (int): Number of mel filterbanks. - norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'. - dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. - - Returns: - Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. - """ - n = paddle.arange(n_mels, dtype=dtype) - k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) - dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) * - k) # size (n_mfcc, n_mels) - if norm is None: - dct *= 2.0 - else: - assert norm == "ortho" - dct[0] *= 1.0 / math.sqrt(2.0) - dct *= math.sqrt(2.0 / float(n_mels)) - return dct.T diff --git a/paddlespeech/audio/functional/window.py b/paddlespeech/audio/functional/window.py deleted file mode 100644 index c99d50462..000000000 --- a/paddlespeech/audio/functional/window.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -import math -from typing import List -from typing import Tuple -from typing import Union - -import paddle -from paddle import Tensor - -__all__ = [ - 'get_window', -] - - -def _cat(x: List[Tensor], data_type: str) -> Tensor: - l = [paddle.to_tensor(_, data_type) for _ in x] - return paddle.concat(l) - - -def _acosh(x: Union[Tensor, float]) -> Tensor: - if isinstance(x, float): - return math.log(x + math.sqrt(x**2 - 1)) - return paddle.log(x + paddle.sqrt(paddle.square(x) - 1)) - - -def _extend(M: int, sym: bool) -> bool: - """Extend window by 1 sample if needed for DFT-even symmetry. """ - if not sym: - return M + 1, True - else: - return M, False - - -def _len_guards(M: int) -> bool: - """Handle small or incorrect window lengths. """ - if int(M) != M or M < 0: - raise ValueError('Window length M must be a non-negative integer') - - return M <= 1 - - -def _truncate(w: Tensor, needed: bool) -> Tensor: - """Truncate window by 1 sample if needed for DFT-even symmetry. """ - if needed: - return w[:-1] - else: - return w - - -def _general_gaussian(M: int, p, sig, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a window with a generalized Gaussian shape. - This function is consistent with scipy.signal.windows.general_gaussian(). 
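
A quick sanity check for `create_dct` above (a sketch, under the same assumed import path): with `norm='ortho'` the retained DCT rows are orthonormal, so the Gram matrix of the returned basis is close to the identity.

```python
# Sketch (not part of the patch): the 'ortho' DCT basis is orthonormal.
import paddle
from paddleaudio.functional import create_dct  # assumed path

dct = create_dct(n_mfcc=20, n_mels=64)            # shape (n_mels, n_mfcc)
gram = paddle.matmul(dct, dct, transpose_x=True)  # dct.T @ dct -> (20, 20)
print(paddle.allclose(gram, paddle.eye(20), atol=1e-5))  # ~True
```
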
- """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 - w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p)) - - return _truncate(w, needs_trunc) - - -def _general_cosine(M: int, a: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a generic weighted sum of cosine terms window. - This function is consistent with scipy.signal.windows.general_cosine(). - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) - w = paddle.zeros((M, ), dtype=dtype) - for k in range(len(a)): - w += a[k] * paddle.cos(k * fac) - return _truncate(w, needs_trunc) - - -def _general_hamming(M: int, alpha: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a generalized Hamming window. - This function is consistent with scipy.signal.windows.general_hamming() - """ - return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) - - -def _taylor(M: int, - nbar=4, - sll=30, - norm=True, - sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a Taylor window. - The Taylor window taper function approximates the Dolph-Chebyshev window's - constant sidelobe level for a parameterized number of near-in sidelobes. - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - # Original text uses a negative sidelobe level parameter and then negates - # it in the calculation of B. To keep consistent with other methods we - # assume the sidelobe level parameter to be positive. - B = 10**(sll / 20) - A = _acosh(B) / math.pi - s2 = nbar**2 / (A**2 + (nbar - 0.5)**2) - ma = paddle.arange(1, nbar, dtype=dtype) - - Fm = paddle.empty((nbar - 1, ), dtype=dtype) - signs = paddle.empty_like(ma) - signs[::2] = 1 - signs[1::2] = -1 - m2 = ma * ma - for mi in range(len(ma)): - numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2 - )) - if mi == 0: - denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:]) - elif mi == len(ma) - 1: - denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) - else: - denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[ - mi] / m2[mi + 1:]) - - Fm[mi] = numer / denom - - def W(n): - return 1 + 2 * paddle.matmul( - Fm.unsqueeze(0), - paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M)) - - w = W(paddle.arange(0, M, dtype=dtype)) - - # normalize (Note that this is not described in the original text [1]) - if norm: - scale = 1.0 / W((M - 1) / 2) - w *= scale - w = w.squeeze() - return _truncate(w, needs_trunc) - - -def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Hamming window. - The Hamming window is a taper formed by using a raised cosine with - non-zero endpoints, optimized to minimize the nearest side lobe. - """ - return _general_hamming(M, 0.54, sym, dtype=dtype) - - -def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Hann window. - The Hann window is a taper formed by using a raised cosine or sine-squared - with ends that touch zero. - """ - return _general_hamming(M, 0.5, sym, dtype=dtype) - - -def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Tukey window. - The Tukey window is also known as a tapered cosine window. 
- """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - - if alpha <= 0: - return paddle.ones((M, ), dtype=dtype) - elif alpha >= 1.0: - return hann(M, sym=sym) - - M, needs_trunc = _extend(M, sym) - - n = paddle.arange(0, M, dtype=dtype) - width = int(alpha * (M - 1) / 2.0) - n1 = n[0:width + 1] - n2 = n[width + 1:M - width - 1] - n3 = n[M - width - 1:] - - w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1)))) - w2 = paddle.ones(n2.shape, dtype=dtype) - w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / - (M - 1)))) - w = paddle.concat([w1, w2, w3]) - - return _truncate(w, needs_trunc) - - -def _kaiser(M: int, beta: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a Kaiser window. - The Kaiser window is a taper formed by using a Bessel function. - """ - raise NotImplementedError() - - -def _gaussian(M: int, std: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a Gaussian window. - The Gaussian widows has a Gaussian shape defined by the standard deviation(std). - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 - sig2 = 2 * std * std - w = paddle.exp(-n**2 / sig2) - - return _truncate(w, needs_trunc) - - -def _exponential(M: int, - center=None, - tau=1., - sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute an exponential (or Poisson) window. """ - if sym and center is not None: - raise ValueError("If sym==True, center must be None.") - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - if center is None: - center = (M - 1) / 2 - - n = paddle.arange(0, M, dtype=dtype) - w = paddle.exp(-paddle.abs(n - center) / tau) - - return _truncate(w, needs_trunc) - - -def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a triangular window. - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype) - if M % 2 == 0: - w = (2 * n - 1.0) / M - w = paddle.concat([w, w[::-1]]) - else: - w = 2 * n / (M + 1.0) - w = paddle.concat([w, w[-2::-1]]) - - return _truncate(w, needs_trunc) - - -def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Bohman window. - The Bohman window is the autocorrelation of a cosine window. - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1]) - w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin( - math.pi * fac) - w = _cat([0, w, 0], dtype) - - return _truncate(w, needs_trunc) - - -def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Blackman window. - The Blackman window is a taper formed by using the first three terms of - a summation of cosines. It was designed to have close to the minimal - leakage possible. It is close to optimal, only slightly worse than a - Kaiser window. - """ - return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) - - -def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a window with a simple cosine shape. 
- """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5)) - - return _truncate(w, needs_trunc) - - -def get_window(window: Union[str, Tuple[str, float]], - win_length: int, - fftbins: bool=True, - dtype: str='float64') -> Tensor: - """Return a window of a given length and type. - - Args: - window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. - win_length (int): Number of samples. - fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. - dtype (str, optional): The data type of the return window. Defaults to 'float64'. - - Returns: - Tensor: The window represented as a tensor. - """ - sym = not fftbins - - args = () - if isinstance(window, tuple): - winstr = window[0] - if len(window) > 1: - args = window[1:] - elif isinstance(window, str): - if window in ['gaussian', 'exponential']: - raise ValueError("The '" + window + "' window needs one or " - "more parameters -- pass a tuple.") - else: - winstr = window - else: - raise ValueError("%s as window type is not supported." % - str(type(window))) - - try: - winfunc = eval('_' + winstr) - except KeyError as e: - raise ValueError("Unknown window type.") from e - - params = (win_length, ) + args - kwargs = {'sym': sym} - return winfunc(*params, dtype=dtype, **kwargs) diff --git a/paddlespeech/audio/io/__init__.py b/paddlespeech/audio/io/__init__.py deleted file mode 100644 index 185a92b8d..000000000 --- a/paddlespeech/audio/io/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/paddlespeech/audio/metric/__init__.py b/paddlespeech/audio/metric/__init__.py deleted file mode 100644 index 7ce6f5cff..000000000 --- a/paddlespeech/audio/metric/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
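
The `get_window` dispatcher shown earlier mirrors `scipy.signal.get_window`: plain names for parameter-free windows, a `(name, param)` tuple otherwise. A usage sketch, with the import path assumed from this patch's `paddleaudio.functional.window` docs:

```python
# Sketch (not part of the patch): requesting windows from get_window above.
from paddleaudio.functional.window import get_window  # assumed path

hann = get_window('hann', 512)                       # periodic (fftbins=True), for STFT
gauss = get_window(('gaussian', 7.0), 512)           # parameterized: std = 7.0
hamming = get_window('hamming', 512, fftbins=False)  # symmetric, for filter design
```
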
-from .eer import compute_eer -from .eer import compute_minDCF diff --git a/paddlespeech/audio/metric/eer.py b/paddlespeech/audio/metric/eer.py deleted file mode 100644 index a1166d3f9..000000000 --- a/paddlespeech/audio/metric/eer.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import List - -import numpy as np -import paddle -from sklearn.metrics import roc_curve - - -def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]: - """Compute EER and return score threshold. - - Args: - labels (np.ndarray): the trial label, shape: [N], one-dimention, N refer to the samples num - scores (np.ndarray): the trial scores, shape: [N], one-dimention, N refer to the samples num - - Returns: - List[float]: eer and the specific threshold - """ - fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores) - fnr = 1 - tpr - eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))] - eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))] - return eer, eer_threshold - - -def compute_minDCF(positive_scores, - negative_scores, - c_miss=1.0, - c_fa=1.0, - p_target=0.01): - """ - This is modified from SpeechBrain - https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509 - Computes the minDCF metric normally used to evaluate speaker verification - systems. The min_DCF is the minimum of the following C_det function computed - within the defined threshold range: - - C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 -p_target) - - where p_miss is the missing probability and p_fa is the probability of having - a false alarm. - - Args: - positive_scores (Paddle.Tensor): The scores from entries of the same class. - negative_scores (Paddle.Tensor): The scores from entries of different classes. - c_miss (float, optional): Cost assigned to a missing error (default 1.0). - c_fa (float, optional): Cost assigned to a false alarm (default 1.0). - p_target (float, optional): Prior probability of having a target (default 0.01). 
- - Returns: - List[float]: min dcf and the specific threshold - """ - # Computing candidate thresholds - if len(positive_scores.shape) > 1: - positive_scores = positive_scores.squeeze() - - if len(negative_scores.shape) > 1: - negative_scores = negative_scores.squeeze() - - thresholds = paddle.sort(paddle.concat([positive_scores, negative_scores])) - thresholds = paddle.unique(thresholds) - - # Adding intermediate thresholds - interm_thresholds = (thresholds[0:-1] + thresholds[1:]) / 2 - thresholds = paddle.sort(paddle.concat([thresholds, interm_thresholds])) - - # Computing False Rejection Rate (miss detection) - positive_scores = paddle.concat( - len(thresholds) * [positive_scores.unsqueeze(0)]) - pos_scores_threshold = positive_scores.transpose(perm=[1, 0]) <= thresholds - p_miss = (pos_scores_threshold.sum(0) - ).astype("float32") / positive_scores.shape[1] - del positive_scores - del pos_scores_threshold - - # Computing False Acceptance Rate (false alarm) - negative_scores = paddle.concat( - len(thresholds) * [negative_scores.unsqueeze(0)]) - neg_scores_threshold = negative_scores.transpose(perm=[1, 0]) > thresholds - p_fa = (neg_scores_threshold.sum(0) - ).astype("float32") / negative_scores.shape[1] - del negative_scores - del neg_scores_threshold - - c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target) - c_min = paddle.min(c_det, axis=0) - min_index = paddle.argmin(c_det, axis=0) - return float(c_min), float(thresholds[min_index]) diff --git a/paddlespeech/audio/streamdata/autodecode.py b/paddlespeech/audio/streamdata/autodecode.py index d7f7937bd..c3ff148f8 100644 --- a/paddlespeech/audio/streamdata/autodecode.py +++ b/paddlespeech/audio/streamdata/autodecode.py @@ -310,7 +310,7 @@ def paddle_audio(key, data): fname = os.path.join(dirname, f"file.{extension}") with open(fname, "wb") as stream: stream.write(data) - return paddlespeech.audio.load(fname) + return paddleaudio.backends.soundfile_load(fname) ################################################################ diff --git a/paddlespeech/audio/streamdata/tariterators.py b/paddlespeech/audio/streamdata/tariterators.py index 79b81c0ce..39dbea621 100644 --- a/paddlespeech/audio/streamdata/tariterators.py +++ b/paddlespeech/audio/streamdata/tariterators.py @@ -111,7 +111,7 @@ def tar_file_iterator(fileobj, assert pos > 0 prefix, postfix = name[:pos], name[pos + 1:] if postfix == 'wav': - waveform, sample_rate = paddlespeech.audio.load( + waveform, sample_rate = paddleaudio.backends.soundfile_load( stream.extractfile(tarinfo), normal=False) result = dict( fname=prefix, wav=waveform, sample_rate=sample_rate) @@ -163,7 +163,7 @@ def tar_file_and_group_iterator(fileobj, if postfix == 'txt': example['txt'] = file_obj.read().decode('utf8').strip() elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = paddlespeech.audio.load( + waveform, sample_rate = paddleaudio.backends.soundfile_load( file_obj, normal=False) waveform = paddle.to_tensor( np.expand_dims(np.array(waveform), 0), diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 5ace7fe0d..5e9b5acec 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -25,8 +25,8 @@ import yaml from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper -from paddlespeech.audio.soundfile_backend import soundfile_load as load -from paddlespeech.audio.features import LogMelSpectrogram +from paddleaudio.backends import soundfile_load as load +from paddleaudio.features import LogMelSpectrogram 
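
The metric helpers deleted above move to `paddleaudio.metric`, as the import rewrites later in this patch show (for example in `paddlespeech/vector/exps/ecapa_tdnn/test.py`). A usage sketch:

```python
# Sketch (not part of the patch): speaker-verification metrics from above.
# compute_minDCF is assumed to be exported alongside compute_eer.
import numpy as np
import paddle
from paddleaudio.metric import compute_eer, compute_minDCF

labels = np.array([1, 1, 1, 0, 0, 0])               # target / non-target trials
scores = np.array([0.9, 0.8, 0.35, 0.4, 0.2, 0.1])  # higher = more target-like
eer, eer_threshold = compute_eer(labels, scores)

min_dcf, dcf_threshold = compute_minDCF(
    positive_scores=paddle.to_tensor([0.9, 0.8, 0.35]),
    negative_scores=paddle.to_tensor([0.4, 0.2, 0.1]),
    p_target=0.01)
```
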
__all__ = ['CLSExecutor'] diff --git a/paddlespeech/cli/kws/infer.py b/paddlespeech/cli/kws/infer.py index bd15e80e6..17482f653 100644 --- a/paddlespeech/cli/kws/infer.py +++ b/paddlespeech/cli/kws/infer.py @@ -24,8 +24,8 @@ import yaml from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.kaldi import fbank as kaldi_fbank +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.kaldi import fbank as kaldi_fbank __all__ = ['KWSExecutor'] diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 5a66b4861..b1335f281 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -27,8 +27,8 @@ from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.modules.sid_model import SpeakerIdetification diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py index 93eee74b9..b13d037f5 100644 --- a/paddlespeech/cls/exps/panns/deploy/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -18,9 +18,9 @@ import numpy as np from paddle import inference from scipy.special import softmax -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.datasets import ESC50 -from paddlespeech.audio.features import melspectrogram +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.datasets import ESC50 +from paddleaudio.features import melspectrogram # yapf: disable parser = argparse.ArgumentParser() diff --git a/paddlespeech/cls/exps/panns/export_model.py b/paddlespeech/cls/exps/panns/export_model.py index e62d58f02..c295c6a33 100644 --- a/paddlespeech/cls/exps/panns/export_model.py +++ b/paddlespeech/cls/exps/panns/export_model.py @@ -16,7 +16,7 @@ import os import paddle -from paddlespeech.audio.datasets import ESC50 +from paddleaudio.datasets import ESC50 from paddlespeech.cls.models import cnn14 from paddlespeech.cls.models import SoundClassifier diff --git a/paddlespeech/cls/exps/panns/predict.py b/paddlespeech/cls/exps/panns/predict.py index 97759a89d..8064ab0d4 100644 --- a/paddlespeech/cls/exps/panns/predict.py +++ b/paddlespeech/cls/exps/panns/predict.py @@ -18,9 +18,9 @@ import paddle import paddle.nn.functional as F import yaml -from paddlespeech.audio.backends import load as load_audio -from paddlespeech.audio.features import LogMelSpectrogram -from paddlespeech.audio.utils import logger +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.features import LogMelSpectrogram +from paddleaudio.utils import logger from paddlespeech.cls.models import SoundClassifier from paddlespeech.utils.dynamic_import import dynamic_import diff --git a/paddlespeech/cls/exps/panns/train.py b/paddlespeech/cls/exps/panns/train.py index fba38a01c..56082bd77 100644 --- a/paddlespeech/cls/exps/panns/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,88 +11,97 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import argparse import os import paddle -import yaml +from yacs.config import CfgNode -from paddlespeech.audio.features import LogMelSpectrogram -from paddlespeech.audio.utils import logger -from paddlespeech.audio.utils import Timer -from paddlespeech.cls.models import SoundClassifier -from paddlespeech.utils.dynamic_import import dynamic_import +from paddleaudio.utils import logger +from paddleaudio.utils import Timer +from paddlespeech.kws.exps.mdtc.collate import collate_features +from paddlespeech.kws.models.loss import max_pooling_loss +from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable +if __name__ == '__main__': + parser = default_argument_parser() + args = parser.parse_args() + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) -if __name__ == "__main__": nranks = paddle.distributed.get_world_size() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() local_rank = paddle.distributed.get_rank() - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - training_conf = config['training'] - # Dataset - ds_class = dynamic_import(data_conf['dataset']) - train_ds = ds_class(**data_conf['train']) - dev_ds = ds_class(**data_conf['dev']) + ds_class = dynamic_import(config['dataset']) + train_ds = ds_class( + data_dir=config['data_dir'], + mode='train', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) + dev_ds = ds_class( + data_dir=config['data_dir'], + mode='dev', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) + train_sampler = paddle.io.DistributedBatchSampler( train_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=True, drop_last=False) train_loader = paddle.io.DataLoader( train_ds, batch_sampler=train_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, - use_buffer_reader=True, ) - - # Feature - feature_extractor = LogMelSpectrogram(**feat_conf) + use_buffer_reader=True, + collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(pretrained=True, extract_embedding=True) - model = SoundClassifier(backbone, num_class=data_conf['num_classes']) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + 
in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) model = paddle.DataParallel(model) + clip = paddle.nn.ClipGradByGlobalNorm(config['grad_clip']) optimizer = paddle.optimizer.Adam( - learning_rate=training_conf['learning_rate'], - parameters=model.parameters()) - criterion = paddle.nn.loss.CrossEntropyLoss() + learning_rate=config['learning_rate'], + weight_decay=config['weight_decay'], + parameters=model.parameters(), + grad_clip=clip) + criterion = max_pooling_loss steps_per_epoch = len(train_sampler) - timer = Timer(steps_per_epoch * training_conf['epochs']) + timer = Timer(steps_per_epoch * config['epochs']) timer.start() - for epoch in range(1, training_conf['epochs'] + 1): + for epoch in range(1, config['epochs'] + 1): model.train() avg_loss = 0 num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - waveforms, labels = batch - feats = feature_extractor( - waveforms - ) # Need a padding when lengths of waveforms differ in a batch. - feats = paddle.transpose(feats, [0, 2, 1]) # To [N, length, n_mels] - + keys, feats, labels, lengths = batch logits = model(feats) - - loss = criterion(logits, labels) + loss, corrects, acc = criterion(logits, labels, lengths) loss.backward() optimizer.step() if isinstance(optimizer._learning_rate, @@ -104,21 +113,18 @@ if __name__ == "__main__": avg_loss += loss.numpy()[0] # Calculate metrics - preds = paddle.argmax(logits, axis=1) - num_corrects += (preds == labels).numpy().sum() + num_corrects += corrects num_samples += feats.shape[0] timer.count() - if (batch_idx + 1 - ) % training_conf['log_freq'] == 0 and local_rank == 0: + if (batch_idx + 1) % config['log_freq'] == 0 and local_rank == 0: lr = optimizer.get_lr() - avg_loss /= training_conf['log_freq'] + avg_loss /= config['log_freq'] avg_acc = num_corrects / num_samples print_msg = 'Epoch={}/{}, Step={}/{}'.format( - epoch, training_conf['epochs'], batch_idx + 1, - steps_per_epoch) + epoch, config['epochs'], batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( @@ -129,41 +135,40 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 - if epoch % training_conf[ + if epoch % config[ 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: dev_sampler = paddle.io.BatchSampler( dev_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=False, drop_last=False) dev_loader = paddle.io.DataLoader( dev_ds, batch_sampler=dev_sampler, - num_workers=training_conf['num_workers'], - return_list=True, ) + num_workers=config['num_workers'], + return_list=True, + use_buffer_reader=True, + collate_fn=collate_features, ) model.eval() num_corrects = 0 num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - waveforms, labels = batch - feats = feature_extractor(waveforms) - feats = paddle.transpose(feats, [0, 2, 1]) - + keys, feats, labels, lengths = batch logits = model(feats) - - preds = paddle.argmax(logits, axis=1) - num_corrects += (preds == labels).numpy().sum() + loss, corrects, acc = criterion(logits, labels, lengths) + num_corrects += corrects num_samples += feats.shape[0] + eval_acc = num_corrects / num_samples print_msg = '[Evaluation result]' - print_msg += ' dev_acc={:.4f}'.format(num_corrects 
/ num_samples) + print_msg += ' dev_acc={:.4f}'.format(eval_acc) logger.eval(print_msg) # Save model - save_dir = os.path.join(training_conf['checkpoint_dir'], + save_dir = os.path.join(config['checkpoint_dir'], 'epoch_{}'.format(epoch)) logger.info('Saving model checkpoint to {}'.format(save_dir)) paddle.save(model.state_dict(), diff --git a/paddlespeech/cls/models/panns/panns.py b/paddlespeech/cls/models/panns/panns.py index 37deae80c..feefecbe1 100644 --- a/paddlespeech/cls/models/panns/panns.py +++ b/paddlespeech/cls/models/panns/panns.py @@ -16,7 +16,7 @@ import os import paddle.nn as nn import paddle.nn.functional as F -from paddlespeech.audio.utils.download import load_state_dict_from_url +from paddleaudio.utils.download import load_state_dict_from_url from paddlespeech.utils.env import MODEL_HOME __all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6'] diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py index 94e45d590..56082bd77 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -16,8 +16,8 @@ import os import paddle from yacs.config import CfgNode -from paddlespeech.audio.utils import logger -from paddlespeech.audio.utils import Timer +from paddleaudio.utils import logger +from paddleaudio.utils import Timer from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.loss import max_pooling_loss from paddlespeech.kws.models.mdtc import KWSModel diff --git a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py index ac5720fd5..12e8a2966 100644 --- a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py @@ -17,7 +17,7 @@ import paddle from python_speech_features import delta from python_speech_features import mfcc -import paddlespeech.audio.compliance.kaldi as kaldi +import paddleaudio.compliance.kaldi as kaldi class AudioFeaturizer(): diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 8a9849492..5c2fa3071 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -29,9 +29,9 @@ import paddle from paddle import jit from paddle import nn -from paddlespeech.audio.utils.tensor_utils import add_sos_eos -from paddlespeech.audio.utils.tensor_utils import pad_sequence -from paddlespeech.audio.utils.tensor_utils import th_accuracy +from paddleaudio.utils.tensor_utils import add_sos_eos +from paddleaudio.utils.tensor_utils import pad_sequence +from paddleaudio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index e8b61bc0d..1ba313c46 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -25,8 +25,8 @@ import paddle from paddle import jit from paddle import nn -from paddlespeech.audio.utils.tensor_utils import add_sos_eos -from paddlespeech.audio.utils.tensor_utils import th_accuracy +from paddleaudio.utils.tensor_utils import add_sos_eos +from paddleaudio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.modules.cmvn import GlobalCMVN diff --git 
a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index e617c3650..ecbdbfa5a 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -17,8 +17,8 @@ from collections import OrderedDict import numpy as np import paddle -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.cli.log import logger from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.server.engine.base_engine import BaseEngine diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py index 32546a330..ac92cf666 100644 --- a/paddlespeech/server/util.py +++ b/paddlespeech/server/util.py @@ -28,7 +28,7 @@ import requests import yaml from paddle.framework import load -import paddlespeech.audio +import paddleaudio from .entry import client_commands from .entry import server_commands from paddlespeech.cli import download @@ -289,7 +289,7 @@ def _note_one_stat(cls_name, params={}): if 'audio_file' in params: try: - _, sr = paddlespeech.audio.load(params['audio_file']) + _, sr = paddleaudio.backends.soundfile_load(params['audio_file']) except Exception: sr = -1 diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py index e9203ef99..790a4eb67 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py @@ -18,8 +18,8 @@ import time import paddle from yacs.config import CfgNode -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn diff --git a/paddlespeech/vector/exps/ecapa_tdnn/test.py b/paddlespeech/vector/exps/ecapa_tdnn/test.py index 6c87dbe7b..1b38075d6 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/test.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/test.py @@ -21,7 +21,7 @@ from paddle.io import DataLoader from tqdm import tqdm from yacs.config import CfgNode -from paddlespeech.audio.metric import compute_eer +from paddleaudio.metric import compute_eer from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.batch import batch_feature_normalize from paddlespeech.vector.io.dataset import CSVDataset diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py index 961b75e29..73da16dc7 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -22,7 +22,7 @@ from paddle.io import DataLoader from paddle.io import DistributedBatchSampler from yacs.config import CfgNode -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.io.augment import waveform_augment diff --git a/paddlespeech/vector/io/dataset.py 
b/paddlespeech/vector/io/dataset.py index c9d56b5ea..1fa8b6b99 100644 --- a/paddlespeech/vector/io/dataset.py +++ b/paddlespeech/vector/io/dataset.py @@ -16,9 +16,9 @@ from dataclasses import fields from paddle.io import Dataset -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio +from paddleaudio.backends import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() diff --git a/paddlespeech/vector/io/dataset_from_json.py b/paddlespeech/vector/io/dataset_from_json.py index 32960e456..39b92af66 100644 --- a/paddlespeech/vector/io/dataset_from_json.py +++ b/paddlespeech/vector/io/dataset_from_json.py @@ -17,9 +17,9 @@ from dataclasses import fields from paddle.io import Dataset -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram -from paddlespeech.audio.compliance.librosa import mfcc +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.librosa import melspectrogram +from paddleaudio.compliance.librosa import mfcc @dataclass diff --git a/tests/benchmark/audio/README.md b/tests/benchmark/audio/README.md deleted file mode 100644 index 9cade74e0..000000000 --- a/tests/benchmark/audio/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# 1. Prepare -First, install `pytest-benchmark` via pip. -```sh -pip install pytest-benchmark -``` - -# 2. Run -Run the specific script for profiling. -```sh -pytest melspectrogram.py -``` - -Result: -```sh -========================================================================== test session starts ========================================================================== -platform linux -- Python 3.7.7, pytest-7.0.1, pluggy-1.0.0 -benchmark: 3.4.1 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000) -plugins: typeguard-2.12.1, benchmark-3.4.1, anyio-3.5.0 -collected 4 items - -melspectrogram.py .... 
[100%] - - --------------------------------------------------------------------------------------------------- benchmark: 4 tests ------------------------------------------------------------------------------------------------- -Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ -test_melspect_gpu_torchaudio 202.0765 (1.0) 360.6230 (1.0) 218.1168 (1.0) 16.3022 (1.0) 214.2871 (1.0) 21.8451 (1.0) 40;3 4,584.7001 (1.0) 286 1 -test_melspect_gpu 657.8509 (3.26) 908.0470 (2.52) 724.2545 (3.32) 106.5771 (6.54) 669.9096 (3.13) 113.4719 (5.19) 1;0 1,380.7300 (0.30) 5 1 -test_melspect_cpu_torchaudio 1,247.6053 (6.17) 2,892.5799 (8.02) 1,443.2853 (6.62) 345.3732 (21.19) 1,262.7263 (5.89) 221.6385 (10.15) 56;53 692.8637 (0.15) 399 1 -test_melspect_cpu 20,326.2549 (100.59) 20,607.8682 (57.15) 20,473.4125 (93.86) 63.8654 (3.92) 20,467.0429 (95.51) 68.4294 (3.13) 8;1 48.8438 (0.01) 29 1 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ - -Legend: - Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile. - OPS: Operations Per Second, computed as 1 / Mean -========================================================================== 4 passed in 21.12s =========================================================================== - -``` diff --git a/tests/benchmark/audio/log_melspectrogram.py b/tests/benchmark/audio/log_melspectrogram.py deleted file mode 100644 index c85fcecfb..000000000 --- a/tests/benchmark/audio/log_melspectrogram.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
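
The deleted benchmark scripts below still call the pre-patch loader. For reference, the replacement pattern this patch applies elsewhere (see the `autodecode.py`, `tariterators.py`, and `server/util.py` hunks above):

```python
# Before this patch:
#   import paddlespeech.audio
#   waveform, sr = paddlespeech.audio.load('zh.wav')
# After this patch:
import paddleaudio

waveform, sr = paddleaudio.backends.soundfile_load('zh.wav')
```
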
-import os -import urllib.request - -import librosa -import numpy as np -import paddle -import torch -import torchaudio - -import paddlespeech.audio - -wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' -if not os.path.isfile(os.path.basename(wav_url)): - urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) - -waveform, sr = paddlespeech.audio.load( - os.path.abspath(os.path.basename(wav_url))) -waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) -waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) - -# Feature conf -mel_conf = { - 'sr': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, -} - -mel_conf_torchaudio = { - 'sample_rate': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, - 'norm': 'slaney', - 'mel_scale': 'slaney', -} - - -def enable_cpu_device(): - paddle.set_device('cpu') - - -def enable_gpu_device(): - paddle.set_device('gpu') - - -log_mel_extractor = paddlespeech.audio.features.LogMelSpectrogram( - **mel_conf, f_min=0.0, top_db=80.0, dtype=waveform_tensor.dtype) - - -def log_melspectrogram(): - return log_mel_extractor(waveform_tensor).squeeze(0) - - -def test_log_melspect_cpu(benchmark): - enable_cpu_device() - feature_audio = benchmark(log_melspectrogram) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_log_melspect_gpu(benchmark): - enable_gpu_device() - feature_audio = benchmark(log_melspectrogram) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=2) - - -mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram( - **mel_conf_torchaudio, f_min=0.0) -amplitude_to_DB = torchaudio.transforms.AmplitudeToDB('power', top_db=80.0) - - -def melspectrogram_torchaudio(): - return mel_extractor_torchaudio(waveform_tensor_torch).squeeze(0) - - -def log_melspectrogram_torchaudio(): - mel_specgram = mel_extractor_torchaudio(waveform_tensor_torch) - return amplitude_to_DB(mel_specgram).squeeze(0) - - -def test_log_melspect_cpu_torchaudio(benchmark): - global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB - - mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') - waveform_tensor_torch = waveform_tensor_torch.to('cpu') - amplitude_to_DB = amplitude_to_DB.to('cpu') - - feature_audio = benchmark(log_melspectrogram_torchaudio) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_log_melspect_gpu_torchaudio(benchmark): - global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB - - mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') - waveform_tensor_torch = waveform_tensor_torch.to('cuda') - amplitude_to_DB = amplitude_to_DB.to('cuda') - - feature_torchaudio = benchmark(log_melspectrogram_torchaudio) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) - np.testing.assert_array_almost_equal( - feature_librosa, feature_torchaudio.cpu(), decimal=2) diff --git a/tests/benchmark/audio/melspectrogram.py b/tests/benchmark/audio/melspectrogram.py 
deleted file mode 100644 index 498158941..000000000 --- a/tests/benchmark/audio/melspectrogram.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import urllib.request - -import librosa -import numpy as np -import paddle -import torch -import torchaudio - -import paddlespeech.audio - -wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' -if not os.path.isfile(os.path.basename(wav_url)): - urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) - -waveform, sr = paddlespeech.audio.load( - os.path.abspath(os.path.basename(wav_url))) -waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) -waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) - -# Feature conf -mel_conf = { - 'sr': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, -} - -mel_conf_torchaudio = { - 'sample_rate': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, - 'norm': 'slaney', - 'mel_scale': 'slaney', -} - - -def enable_cpu_device(): - paddle.set_device('cpu') - - -def enable_gpu_device(): - paddle.set_device('gpu') - - -mel_extractor = paddlespeech.audio.features.MelSpectrogram( - **mel_conf, f_min=0.0, dtype=waveform_tensor.dtype) - - -def melspectrogram(): - return mel_extractor(waveform_tensor).squeeze(0) - - -def test_melspect_cpu(benchmark): - enable_cpu_device() - feature_audio = benchmark(melspectrogram) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_melspect_gpu(benchmark): - enable_gpu_device() - feature_audio = benchmark(melspectrogram) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram( - **mel_conf_torchaudio, f_min=0.0) - - -def melspectrogram_torchaudio(): - return mel_extractor_torchaudio(waveform_tensor_torch).squeeze(0) - - -def test_melspect_cpu_torchaudio(benchmark): - global waveform_tensor_torch, mel_extractor_torchaudio - mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') - waveform_tensor_torch = waveform_tensor_torch.to('cpu') - feature_audio = benchmark(melspectrogram_torchaudio) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_melspect_gpu_torchaudio(benchmark): - global waveform_tensor_torch, mel_extractor_torchaudio - mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') - waveform_tensor_torch = waveform_tensor_torch.to('cuda') - feature_torchaudio = benchmark(melspectrogram_torchaudio) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_torchaudio.cpu(), decimal=3) diff --git a/tests/benchmark/audio/mfcc.py 
b/tests/benchmark/audio/mfcc.py deleted file mode 100644 index 4e286de90..000000000 --- a/tests/benchmark/audio/mfcc.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import urllib.request - -import librosa -import numpy as np -import paddle -import torch -import torchaudio - -import paddlespeech.audio - -wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' -if not os.path.isfile(os.path.basename(wav_url)): - urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) - -waveform, sr = paddlespeech.audio.load( - os.path.abspath(os.path.basename(wav_url))) -waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) -waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) - -# Feature conf -mel_conf = { - 'sr': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, -} -mfcc_conf = { - 'n_mfcc': 20, - 'top_db': 80.0, -} -mfcc_conf.update(mel_conf) - -mel_conf_torchaudio = { - 'sample_rate': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, - 'norm': 'slaney', - 'mel_scale': 'slaney', -} -mfcc_conf_torchaudio = { - 'sample_rate': sr, - 'n_mfcc': 20, -} - - -def enable_cpu_device(): - paddle.set_device('cpu') - - -def enable_gpu_device(): - paddle.set_device('gpu') - - -mfcc_extractor = paddlespeech.audio.features.MFCC( - **mfcc_conf, f_min=0.0, dtype=waveform_tensor.dtype) - - -def mfcc(): - return mfcc_extractor(waveform_tensor).squeeze(0) - - -def test_mfcc_cpu(benchmark): - enable_cpu_device() - feature_audio = benchmark(mfcc) - feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_mfcc_gpu(benchmark): - enable_gpu_device() - feature_audio = benchmark(mfcc) - feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -del mel_conf_torchaudio['sample_rate'] -mfcc_extractor_torchaudio = torchaudio.transforms.MFCC( - **mfcc_conf_torchaudio, melkwargs=mel_conf_torchaudio) - - -def mfcc_torchaudio(): - return mfcc_extractor_torchaudio(waveform_tensor_torch).squeeze(0) - - -def test_mfcc_cpu_torchaudio(benchmark): - global waveform_tensor_torch, mfcc_extractor_torchaudio - - mel_extractor_torchaudio = mfcc_extractor_torchaudio.to('cpu') - waveform_tensor_torch = waveform_tensor_torch.to('cpu') - - feature_audio = benchmark(mfcc_torchaudio) - feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_mfcc_gpu_torchaudio(benchmark): - global waveform_tensor_torch, mfcc_extractor_torchaudio - - mel_extractor_torchaudio = mfcc_extractor_torchaudio.to('cuda') - waveform_tensor_torch = waveform_tensor_torch.to('cuda') - - feature_torchaudio = benchmark(mfcc_torchaudio) - feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - 
feature_librosa, feature_torchaudio.cpu(), decimal=3) diff --git a/tests/unit/audio/backends/soundfile/__init__.py b/tests/unit/audio/backends/soundfile/__init__.py deleted file mode 100644 index 97043fd7b..000000000 --- a/tests/unit/audio/backends/soundfile/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/unit/audio/backends/soundfile/common.py b/tests/unit/audio/backends/soundfile/common.py deleted file mode 100644 index 7067b4a98..000000000 --- a/tests/unit/audio/backends/soundfile/common.py +++ /dev/null @@ -1,57 +0,0 @@ -import itertools -from unittest import skipIf - -from parameterized import parameterized -from paddlespeech.audio._internal.module_utils import is_module_available - - -def name_func(func, _, params): - return f'{func.__name__}_{"_".join(str(arg) for arg in params.args)}' - - -def dtype2subtype(dtype): - return { - "float64": "DOUBLE", - "float32": "FLOAT", - "int32": "PCM_32", - "int16": "PCM_16", - "uint8": "PCM_U8", - "int8": "PCM_S8", - }[dtype] - - -def skipIfFormatNotSupported(fmt): - fmts = [] - if is_module_available("soundfile"): - import soundfile - - fmts = soundfile.available_formats() - return skipIf(fmt not in fmts, f'"{fmt}" is not supported by soundfile') - return skipIf(True, '"soundfile" not available.') - - -def parameterize(*params): - return parameterized.expand(list(itertools.product(*params)), name_func=name_func) - - -def fetch_wav_subtype(dtype, encoding, bits_per_sample): - subtype = { - (None, None): dtype2subtype(dtype), - (None, 8): "PCM_U8", - ("PCM_U", None): "PCM_U8", - ("PCM_U", 8): "PCM_U8", - ("PCM_S", None): "PCM_32", - ("PCM_S", 16): "PCM_16", - ("PCM_S", 32): "PCM_32", - ("PCM_F", None): "FLOAT", - ("PCM_F", 32): "FLOAT", - ("PCM_F", 64): "DOUBLE", - ("ULAW", None): "ULAW", - ("ULAW", 8): "ULAW", - ("ALAW", None): "ALAW", - ("ALAW", 8): "ALAW", - }.get((encoding, bits_per_sample)) - if subtype: - return subtype - raise ValueError(f"wav does not support ({encoding}, {bits_per_sample}).") - diff --git a/tests/unit/audio/backends/soundfile/info_test.py b/tests/unit/audio/backends/soundfile/info_test.py deleted file mode 100644 index c94826858..000000000 --- a/tests/unit/audio/backends/soundfile/info_test.py +++ /dev/null @@ -1,199 +0,0 @@ -#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/info_test.py - -import tarfile -import warnings -import unittest -from unittest.mock import patch - -import paddle -from paddlespeech.audio._internal import module_utils as _mod_utils -from paddlespeech.audio.backends import soundfile_backend -from tests.unit.audio.backends.common import get_bits_per_sample, get_encoding -from tests.unit.common_utils import ( - get_wav_data, - nested_params, - save_wav, - TempDirMixin, -) - -from common import parameterize, skipIfFormatNotSupported - -import soundfile - - -class TestInfo(TempDirMixin, unittest.TestCase): - @parameterize( - 
["float32", "int32"], - [8000, 16000], - [1, 2], - ) - def test_wav(self, dtype, sample_rate, num_channels): - """`soundfile_backend.info` can check wav file correctly""" - duration = 1 - path = self.get_temp_path("data.wav") - data = get_wav_data(dtype, num_channels, normalize=False, num_frames=duration * sample_rate) - save_wav(path, data, sample_rate) - info = soundfile_backend.info(path) - assert info.sample_rate == sample_rate - assert info.num_frames == sample_rate * duration - assert info.num_channels == num_channels - assert info.bits_per_sample == get_bits_per_sample("wav", dtype) - assert info.encoding == get_encoding("wav", dtype) - - @parameterize([8000, 16000], [1, 2]) - @skipIfFormatNotSupported("FLAC") - def test_flac(self, sample_rate, num_channels): - """`soundfile_backend.info` can check flac file correctly""" - duration = 1 - num_frames = sample_rate * duration - #data = torch.randn(num_frames, num_channels).numpy() - data = paddle.randn(shape=[num_frames, num_channels]).numpy() - - path = self.get_temp_path("data.flac") - soundfile.write(path, data, sample_rate) - - info = soundfile_backend.info(path) - assert info.sample_rate == sample_rate - assert info.num_frames == num_frames - assert info.num_channels == num_channels - assert info.bits_per_sample == 16 - assert info.encoding == "FLAC" - - #@parameterize([8000, 16000], [1, 2]) - #@skipIfFormatNotSupported("OGG") - #def test_ogg(self, sample_rate, num_channels): - #"""`soundfile_backend.info` can check ogg file correctly""" - #duration = 1 - #num_frames = sample_rate * duration - ##data = torch.randn(num_frames, num_channels).numpy() - #data = paddle.randn(shape=[num_frames, num_channels]).numpy() - #print(len(data)) - #path = self.get_temp_path("data.ogg") - #soundfile.write(path, data, sample_rate) - - #info = soundfile_backend.info(path) - #print(info) - #assert info.sample_rate == sample_rate - #print("info") - #print(info.num_frames) - #print("jiji") - #print(sample_rate*duration) - ##assert info.num_frames == sample_rate * duration - #assert info.num_channels == num_channels - #assert info.bits_per_sample == 0 - #assert info.encoding == "VORBIS" - - @nested_params( - [8000, 16000], - [1, 2], - [("PCM_24", 24), ("PCM_32", 32)], - ) - @skipIfFormatNotSupported("NIST") - def test_sphere(self, sample_rate, num_channels, subtype_and_bit_depth): - """`soundfile_backend.info` can check sph file correctly""" - duration = 1 - num_frames = sample_rate * duration - #data = torch.randn(num_frames, num_channels).numpy() - data = paddle.randn(shape=[num_frames, num_channels]).numpy() - path = self.get_temp_path("data.nist") - subtype, bits_per_sample = subtype_and_bit_depth - soundfile.write(path, data, sample_rate, subtype=subtype) - - info = soundfile_backend.info(path) - assert info.sample_rate == sample_rate - assert info.num_frames == sample_rate * duration - assert info.num_channels == num_channels - assert info.bits_per_sample == bits_per_sample - assert info.encoding == "PCM_S" - - def test_unknown_subtype_warning(self): - """soundfile_backend.info issues a warning when the subtype is unknown - - This will happen if a new subtype is supported in SoundFile: the _SUBTYPE_TO_BITS_PER_SAMPLE - dict should be updated. 
- """ - - def _mock_info_func(_): - class MockSoundFileInfo: - samplerate = 8000 - frames = 356 - channels = 2 - subtype = "UNSEEN_SUBTYPE" - format = "UNKNOWN" - - return MockSoundFileInfo() - - with patch("soundfile.info", _mock_info_func): - with warnings.catch_warnings(record=True) as w: - info = soundfile_backend.info("foo") - assert len(w) == 1 - assert "UNSEEN_SUBTYPE subtype is unknown to PaddleAudio" in str(w[-1].message) - assert info.bits_per_sample == 0 - - -class TestFileObject(TempDirMixin, unittest.TestCase): - def _test_fileobj(self, ext, subtype, bits_per_sample): - """Query audio via file-like object works""" - duration = 2 - sample_rate = 16000 - num_channels = 2 - num_frames = sample_rate * duration - path = self.get_temp_path(f"test.{ext}") - - #data = torch.randn(num_frames, num_channels).numpy() - data = paddle.randn(shape=[num_frames, num_channels]).numpy() - soundfile.write(path, data, sample_rate, subtype=subtype) - - with open(path, "rb") as fileobj: - info = soundfile_backend.info(fileobj) - assert info.sample_rate == sample_rate - assert info.num_frames == num_frames - assert info.num_channels == num_channels - assert info.bits_per_sample == bits_per_sample - assert info.encoding == "FLAC" if ext == "flac" else "PCM_S" - - def test_fileobj_wav(self): - """Loading audio via file-like object works""" - self._test_fileobj("wav", "PCM_16", 16) - - @skipIfFormatNotSupported("FLAC") - def test_fileobj_flac(self): - """Loading audio via file-like object works""" - self._test_fileobj("flac", "PCM_16", 16) - - def _test_tarobj(self, ext, subtype, bits_per_sample): - """Query compressed audio via file-like object works""" - duration = 2 - sample_rate = 16000 - num_channels = 2 - num_frames = sample_rate * duration - audio_file = f"test.{ext}" - audio_path = self.get_temp_path(audio_file) - archive_path = self.get_temp_path("archive.tar.gz") - - #data = torch.randn(num_frames, num_channels).numpy() - data = paddle.randn(shape=[num_frames, num_channels]).numpy() - soundfile.write(audio_path, data, sample_rate, subtype=subtype) - - with tarfile.TarFile(archive_path, "w") as tarobj: - tarobj.add(audio_path, arcname=audio_file) - with tarfile.TarFile(archive_path, "r") as tarobj: - fileobj = tarobj.extractfile(audio_file) - info = soundfile_backend.info(fileobj) - assert info.sample_rate == sample_rate - assert info.num_frames == num_frames - assert info.num_channels == num_channels - assert info.bits_per_sample == bits_per_sample - assert info.encoding == "FLAC" if ext == "flac" else "PCM_S" - - def test_tarobj_wav(self): - """Query compressed audio via file-like object works""" - self._test_tarobj("wav", "PCM_16", 16) - - @skipIfFormatNotSupported("FLAC") - def test_tarobj_flac(self): - """Query compressed audio via file-like object works""" - self._test_tarobj("flac", "PCM_16", 16) - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/soundfile/load_test.py b/tests/unit/audio/backends/soundfile/load_test.py deleted file mode 100644 index 626009382..000000000 --- a/tests/unit/audio/backends/soundfile/load_test.py +++ /dev/null @@ -1,369 +0,0 @@ -#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/load_test.py - -import os -import tarfile -import unittest -from unittest.mock import patch -import numpy as np - -from parameterized import parameterized -import paddle -from paddlespeech.audio._internal import module_utils as _mod_utils -from 
paddlespeech.audio.backends import soundfile_backend -from tests.unit.audio.backends.common import get_bits_per_sample, get_encoding -from tests.unit.common_utils import ( - get_wav_data, - load_wav, - nested_params, - normalize_wav, - save_wav, - TempDirMixin, -) - -from common import dtype2subtype, parameterize, skipIfFormatNotSupported - -import soundfile - - -def _get_mock_path( - ext: str, - dtype: str, - sample_rate: int, - num_channels: int, - num_frames: int, -): - return f"{dtype}_{sample_rate}_{num_channels}_{num_frames}.{ext}" - - -def _get_mock_params(path: str): - filename, ext = path.split(".") - parts = filename.split("_") - return { - "ext": ext, - "dtype": parts[0], - "sample_rate": int(parts[1]), - "num_channels": int(parts[2]), - "num_frames": int(parts[3]), - } - - -class SoundFileMock: - def __init__(self, path, mode): - assert mode == "r" - self.path = path - self._params = _get_mock_params(path) - self._start = None - - @property - def samplerate(self): - return self._params["sample_rate"] - - @property - def format(self): - if self._params["ext"] == "wav": - return "WAV" - if self._params["ext"] == "flac": - return "FLAC" - if self._params["ext"] == "ogg": - return "OGG" - if self._params["ext"] in ["sph", "nis", "nist"]: - return "NIST" - - @property - def subtype(self): - if self._params["ext"] == "ogg": - return "VORBIS" - return dtype2subtype(self._params["dtype"]) - - def _prepare_read(self, start, stop, frames): - assert stop is None - self._start = start - return frames - - def read(self, frames, dtype, always_2d): - assert always_2d - data = get_wav_data( - dtype, - self._params["num_channels"], - normalize=False, - num_frames=self._params["num_frames"], - channels_first=False, - ).numpy() - return data[self._start : self._start + frames] - - def __enter__(self): - return self - - def __exit__(self, *args, **kwargs): - pass - - -class MockedLoadTest(unittest.TestCase): - def assert_dtype(self, ext, dtype, sample_rate, num_channels, normalize, channels_first): - """When format is WAV or NIST, normalize=False will return the native dtype Tensor, otherwise float32""" - num_frames = 3 * sample_rate - path = _get_mock_path(ext, dtype, sample_rate, num_channels, num_frames) - expected_dtype = paddle.float32 if normalize or ext not in ["wav", "nist"] else getattr(paddle, dtype) - with patch("soundfile.SoundFile", SoundFileMock): - found, sr = soundfile_backend.load(path, normalize=normalize, channels_first=channels_first) - assert found.dtype == expected_dtype - assert sample_rate == sr - - @parameterize( - ["int32", "float32", "float64"], - [8000, 16000], - [1, 2], - [True, False], - [True, False], - ) - def test_wav(self, dtype, sample_rate, num_channels, normalize, channels_first): - """Returns native dtype when normalize=False else float32""" - self.assert_dtype("wav", dtype, sample_rate, num_channels, normalize, channels_first) - - @parameterize( - ["int32"], - [8000, 16000], - [1, 2], - [True, False], - [True, False], - ) - def test_sphere(self, dtype, sample_rate, num_channels, normalize, channels_first): - """Returns float32 always""" - self.assert_dtype("sph", dtype, sample_rate, num_channels, normalize, channels_first) - - @parameterize([8000, 16000], [1, 2], [True, False], [True, False]) - def test_ogg(self, sample_rate, num_channels, normalize, channels_first): - """Returns float32 always""" - self.assert_dtype("ogg", "int16", sample_rate, num_channels, normalize, channels_first) - - @parameterize([8000, 16000], [1, 2], [True, False], [True, False]) - 
def test_flac(self, sample_rate, num_channels, normalize, channels_first): - """`soundfile_backend.load` can load ogg format.""" - self.assert_dtype("flac", "int16", sample_rate, num_channels, normalize, channels_first) - - -class LoadTestBase(TempDirMixin, unittest.TestCase): - def assert_wav( - self, - dtype, - sample_rate, - num_channels, - normalize, - channels_first=True, - duration=1, - ): - """`soundfile_backend.load` can load wav format correctly. - - Wav data loaded with soundfile backend should match those with scipy - """ - path = self.get_temp_path("reference.wav") - num_frames = duration * sample_rate - data = get_wav_data( - dtype, - num_channels, - normalize=normalize, - num_frames=num_frames, - channels_first=channels_first, - ) - save_wav(path, data, sample_rate, channels_first=channels_first) - expected = load_wav(path, normalize=normalize, channels_first=channels_first)[0] - data, sr = soundfile_backend.load(path, normalize=normalize, channels_first=channels_first) - assert sr == sample_rate - np.testing.assert_array_almost_equal(data.numpy(), expected.numpy()) - - def assert_sphere( - self, - dtype, - sample_rate, - num_channels, - channels_first=True, - duration=1, - ): - """`soundfile_backend.load` can load SPHERE format correctly.""" - path = self.get_temp_path("reference.sph") - num_frames = duration * sample_rate - raw = get_wav_data( - dtype, - num_channels, - num_frames=num_frames, - normalize=False, - channels_first=False, - ) - soundfile.write(path, raw, sample_rate, subtype=dtype2subtype(dtype), format="NIST") - expected = normalize_wav(raw.t() if channels_first else raw) - data, sr = soundfile_backend.load(path, channels_first=channels_first) - assert sr == sample_rate - #self.assertEqual(data, expected, atol=1e-4, rtol=1e-8) - np.testing.assert_array_almost_equal(data.numpy(), expected.numpy()) - - def assert_flac( - self, - dtype, - sample_rate, - num_channels, - channels_first=True, - duration=1, - ): - """`soundfile_backend.load` can load FLAC format correctly.""" - path = self.get_temp_path("reference.flac") - num_frames = duration * sample_rate - raw = get_wav_data( - dtype, - num_channels, - num_frames=num_frames, - normalize=False, - channels_first=False, - ) - soundfile.write(path, raw, sample_rate) - expected = normalize_wav(raw.t() if channels_first else raw) - data, sr = soundfile_backend.load(path, channels_first=channels_first) - assert sr == sample_rate - #self.assertEqual(data, expected, atol=1e-4, rtol=1e-8) - np.testing.assert_array_almost_equal(data.numpy(), expected.numpy()) - - - -class TestLoad(LoadTestBase): - """Test the correctness of `soundfile_backend.load` for various formats""" - - @parameterize( - ["float32", "int32"], - [8000, 16000], - [1, 2], - [False, True], - [False, True], - ) - def test_wav(self, dtype, sample_rate, num_channels, normalize, channels_first): - """`soundfile_backend.load` can load wav format correctly.""" - self.assert_wav(dtype, sample_rate, num_channels, normalize, channels_first) - - @parameterize( - ["int32"], - [16000], - [2], - [False], - ) - def test_wav_large(self, dtype, sample_rate, num_channels, normalize): - """`soundfile_backend.load` can load large wav file correctly.""" - two_hours = 2 * 60 * 60 - self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=two_hours) - - @parameterize(["float32", "int32"], [4, 8, 16, 32], [False, True]) - def test_multiple_channels(self, dtype, num_channels, channels_first): - """`soundfile_backend.load` can load wav file with more than 2 
channels.""" - sample_rate = 8000 - normalize = False - self.assert_wav(dtype, sample_rate, num_channels, normalize, channels_first) - - #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True]) - #@skipIfFormatNotSupported("NIST") - #def test_sphere(self, dtype, sample_rate, num_channels, channels_first): - #"""`soundfile_backend.load` can load sphere format correctly.""" - #self.assert_sphere(dtype, sample_rate, num_channels, channels_first) - - #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True]) - #@skipIfFormatNotSupported("FLAC") - #def test_flac(self, dtype, sample_rate, num_channels, channels_first): - #"""`soundfile_backend.load` can load flac format correctly.""" - #self.assert_flac(dtype, sample_rate, num_channels, channels_first) - - -class TestLoadFormat(TempDirMixin, unittest.TestCase): - """Given `format` parameter, `so.load` can load files without extension""" - - original = None - path = None - - def _make_file(self, format_): - sample_rate = 8000 - path_with_ext = self.get_temp_path(f"test.{format_}") - data = get_wav_data("float32", num_channels=2).numpy().T - soundfile.write(path_with_ext, data, sample_rate) - expected = soundfile.read(path_with_ext, dtype="float32")[0].T - path = os.path.splitext(path_with_ext)[0] - os.rename(path_with_ext, path) - return path, expected - - def _test_format(self, format_): - """Providing format allows to read file without extension""" - path, expected = self._make_file(format_) - found, _ = soundfile_backend.load(path) - #self.assertEqual(found, expected) - np.testing.assert_array_almost_equal(found, expected) - - @parameterized.expand( - [ - ("WAV",), - ("wav",), - ] - ) - def test_wav(self, format_): - self._test_format(format_) - - @parameterized.expand( - [ - ("FLAC",), - ("flac",), - ] - ) - @skipIfFormatNotSupported("FLAC") - def test_flac(self, format_): - self._test_format(format_) - - -class TestFileObject(TempDirMixin, unittest.TestCase): - def _test_fileobj(self, ext): - """Loading audio via file-like object works""" - sample_rate = 16000 - path = self.get_temp_path(f"test.{ext}") - - data = get_wav_data("float32", num_channels=2).numpy().T - soundfile.write(path, data, sample_rate) - expected = soundfile.read(path, dtype="float32")[0].T - - with open(path, "rb") as fileobj: - found, sr = soundfile_backend.load(fileobj) - assert sr == sample_rate - #self.assertEqual(expected, found) - np.testing.assert_array_almost_equal(found, expected) - - def test_fileobj_wav(self): - """Loading audio via file-like object works""" - self._test_fileobj("wav") - - def test_fileobj_flac(self): - """Loading audio via file-like object works""" - self._test_fileobj("flac") - - def _test_tarfile(self, ext): - """Loading audio via file-like object works""" - sample_rate = 16000 - audio_file = f"test.{ext}" - audio_path = self.get_temp_path(audio_file) - archive_path = self.get_temp_path("archive.tar.gz") - - data = get_wav_data("float32", num_channels=2).numpy().T - soundfile.write(audio_path, data, sample_rate) - expected = soundfile.read(audio_path, dtype="float32")[0].T - - with tarfile.TarFile(archive_path, "w") as tarobj: - tarobj.add(audio_path, arcname=audio_file) - with tarfile.TarFile(archive_path, "r") as tarobj: - fileobj = tarobj.extractfile(audio_file) - found, sr = soundfile_backend.load(fileobj) - - assert sr == sample_rate - #self.assertEqual(expected, found) - np.testing.assert_array_almost_equal(found.numpy(), expected) - - - def test_tarfile_wav(self): - """Loading audio via file-like object works""" - 
self._test_tarfile("wav") - - def test_tarfile_flac(self): - """Loading audio via file-like object works""" - self._test_tarfile("flac") - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/soundfile/save_test.py b/tests/unit/audio/backends/soundfile/save_test.py deleted file mode 100644 index 9139d84cd..000000000 --- a/tests/unit/audio/backends/soundfile/save_test.py +++ /dev/null @@ -1,322 +0,0 @@ -import io -import unittest -from unittest.mock import patch - -from paddlespeech.audio._internal import module_utils as _mod_utils -from paddlespeech.audio.backends import soundfile_backend -from tests.unit.common_utils import ( - get_wav_data, - load_wav, - nested_params, - normalize_wav, - save_wav, - TempDirMixin, -) - -from common import fetch_wav_subtype, parameterize, skipIfFormatNotSupported - -import paddle -import numpy as np - -import soundfile - - -class MockedSaveTest(unittest.TestCase): - @nested_params( - ["float32", "int32"], - [8000, 16000], - [1, 2], - [False, True], - [ - (None, None), - ("PCM_U", None), - ("PCM_U", 8), - ("PCM_S", None), - ("PCM_S", 16), - ("PCM_S", 32), - ("PCM_F", None), - ("PCM_F", 32), - ("PCM_F", 64), - ("ULAW", None), - ("ULAW", 8), - ("ALAW", None), - ("ALAW", 8), - ], - ) - @patch("soundfile.write") - def test_wav(self, dtype, sample_rate, num_channels, channels_first, enc_params, mocked_write): - """soundfile_backend.save passes correct subtype to soundfile.write when WAV""" - filepath = "foo.wav" - input_tensor = get_wav_data( - dtype, - num_channels, - num_frames=3 * sample_rate, - normalize=dtype == "float32", - channels_first=channels_first, - ) - input_tensor = paddle.transpose(input_tensor, [1, 0]) - - encoding, bits_per_sample = enc_params - soundfile_backend.save( - filepath, - input_tensor, - sample_rate, - channels_first=channels_first, - encoding=encoding, - bits_per_sample=bits_per_sample, - ) - - # on +Py3.8 call_args.kwargs is more descreptive - args = mocked_write.call_args[1] - assert args["file"] == filepath - assert args["samplerate"] == sample_rate - assert args["subtype"] == fetch_wav_subtype(dtype, encoding, bits_per_sample) - assert args["format"] is None - tensor_result = paddle.transpose(input_tensor, [1, 0]) if channels_first else input_tensor - #self.assertEqual(args["data"], tensor_result.numpy()) - np.testing.assert_array_almost_equal(args["data"].numpy(), tensor_result.numpy()) - - - - @patch("soundfile.write") - def assert_non_wav( - self, - fmt, - dtype, - sample_rate, - num_channels, - channels_first, - mocked_write, - encoding=None, - bits_per_sample=None, - ): - """soundfile_backend.save passes correct subtype and format to soundfile.write when SPHERE""" - filepath = f"foo.{fmt}" - input_tensor = get_wav_data( - dtype, - num_channels, - num_frames=3 * sample_rate, - normalize=False, - channels_first=channels_first, - ) - input_tensor = paddle.transpose(input_tensor, [1, 0]) - - expected_data = paddle.transpose(input_tensor, [1, 0]) if channels_first else input_tensor - - soundfile_backend.save( - filepath, - input_tensor, - sample_rate, - channels_first, - encoding=encoding, - bits_per_sample=bits_per_sample, - ) - - # on +Py3.8 call_args.kwargs is more descreptive - args = mocked_write.call_args[1] - assert args["file"] == filepath - assert args["samplerate"] == sample_rate - if fmt in ["sph", "nist", "nis"]: - assert args["format"] == "NIST" - else: - assert args["format"] is None - np.testing.assert_array_almost_equal(args["data"].numpy(), 
expected_data.numpy()) - #self.assertEqual(args["data"], expected_data) - - @nested_params( - ["sph", "nist", "nis"], - ["int32"], - [8000, 16000], - [1, 2], - [False, True], - [ - ("PCM_S", 8), - ("PCM_S", 16), - ("PCM_S", 24), - ("PCM_S", 32), - ("ULAW", 8), - ("ALAW", 8), - ("ALAW", 16), - ("ALAW", 24), - ("ALAW", 32), - ], - ) - def test_sph(self, fmt, dtype, sample_rate, num_channels, channels_first, enc_params): - """soundfile_backend.save passes default format and subtype (None-s) to - soundfile.write when not WAV""" - encoding, bits_per_sample = enc_params - self.assert_non_wav( - fmt, dtype, sample_rate, num_channels, channels_first, encoding=encoding, bits_per_sample=bits_per_sample - ) - - @parameterize( - ["int32"], - [8000, 16000], - [1, 2], - [False, True], - [8, 16, 24], - ) - def test_flac(self, dtype, sample_rate, num_channels, channels_first, bits_per_sample): - """soundfile_backend.save passes default format and subtype (None-s) to - soundfile.write when not WAV""" - self.assert_non_wav("flac", dtype, sample_rate, num_channels, channels_first, bits_per_sample=bits_per_sample) - - @parameterize( - ["int32"], - [8000, 16000], - [1, 2], - [False, True], - ) - def test_ogg(self, dtype, sample_rate, num_channels, channels_first): - """soundfile_backend.save passes default format and subtype (None-s) to - soundfile.write when not WAV""" - self.assert_non_wav("ogg", dtype, sample_rate, num_channels, channels_first) - - -class SaveTestBase(TempDirMixin, unittest.TestCase): - def assert_wav(self, dtype, sample_rate, num_channels, num_frames): - """`soundfile_backend.save` can save wav format.""" - path = self.get_temp_path("data.wav") - expected = get_wav_data(dtype, num_channels, num_frames=num_frames, normalize=False) - soundfile_backend.save(path, expected, sample_rate) - found, sr = load_wav(path, normalize=False) - assert sample_rate == sr - #self.assertEqual(found, expected) - np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) - - def _assert_non_wav(self, fmt, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save non-wav format. - - Due to precision missmatch, and the lack of alternative way to decode the - resulting files without using soundfile, only meta data are validated. - """ - num_frames = sample_rate * 3 - path = self.get_temp_path(f"data.{fmt}") - expected = get_wav_data(dtype, num_channels, num_frames=num_frames, normalize=False) - soundfile_backend.save(path, expected, sample_rate) - sinfo = soundfile.info(path) - assert sinfo.format == fmt.upper() - #assert sinfo.frames == num_frames this go wrong - assert sinfo.channels == num_channels - assert sinfo.samplerate == sample_rate - - def assert_flac(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save flac format.""" - self._assert_non_wav("flac", dtype, sample_rate, num_channels) - - def assert_sphere(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save sph format.""" - self._assert_non_wav("nist", dtype, sample_rate, num_channels) - - def assert_ogg(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save ogg format. - - As we cannot inspect the OGG format (it's lossy), we only check the metadata. 
- """ - self._assert_non_wav("ogg", dtype, sample_rate, num_channels) - - -class TestSave(SaveTestBase): - @parameterize( - ["float32", "int32"], - [8000, 16000], - [1, 2], - ) - def test_wav(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save wav format.""" - self.assert_wav(dtype, sample_rate, num_channels, num_frames=None) - - @parameterize( - ["float32", "int32"], - [4, 8, 16, 32], - ) - def test_multiple_channels(self, dtype, num_channels): - """`soundfile_backend.save` can save wav with more than 2 channels.""" - sample_rate = 8000 - self.assert_wav(dtype, sample_rate, num_channels, num_frames=None) - - @parameterize( - ["int32"], - [8000, 16000], - [1, 2], - ) - @skipIfFormatNotSupported("NIST") - def test_sphere(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save sph format.""" - self.assert_sphere(dtype, sample_rate, num_channels) - - @parameterize( - [8000, 16000], - [1, 2], - ) - @skipIfFormatNotSupported("FLAC") - def test_flac(self, sample_rate, num_channels): - """`soundfile_backend.save` can save flac format.""" - self.assert_flac("float32", sample_rate, num_channels) - - @parameterize( - [8000, 16000], - [1, 2], - ) - @skipIfFormatNotSupported("OGG") - def test_ogg(self, sample_rate, num_channels): - """`soundfile_backend.save` can save ogg/vorbis format.""" - self.assert_ogg("float32", sample_rate, num_channels) - - -class TestSaveParams(TempDirMixin, unittest.TestCase): - """Test the correctness of optional parameters of `soundfile_backend.save`""" - - @parameterize([True, False]) - def test_channels_first(self, channels_first): - """channels_first swaps axes""" - path = self.get_temp_path("data.wav") - data = get_wav_data("int32", 2, channels_first=channels_first) - soundfile_backend.save(path, data, 8000, channels_first=channels_first) - found = load_wav(path)[0] - expected = data if channels_first else data.transpose([1, 0]) - #self.assertEqual(found, expected, atol=1e-4, rtol=1e-8) - np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) - - -class TestFileObject(TempDirMixin, unittest.TestCase): - def _test_fileobj(self, ext): - """Saving audio to file-like object works""" - sample_rate = 16000 - path = self.get_temp_path(f"test.{ext}") - - subtype = "FLOAT" if ext == "wav" else None - data = get_wav_data("float32", num_channels=2) - soundfile.write(path, data.numpy().T, sample_rate, subtype=subtype) - expected = soundfile.read(path, dtype="float32")[0] - - fileobj = io.BytesIO() - soundfile_backend.save(fileobj, data, sample_rate, format=ext) - fileobj.seek(0) - found, sr = soundfile.read(fileobj, dtype="float32") - - assert sr == sample_rate - #self.assertEqual(expected, found, atol=1e-4, rtol=1e-8) - np.testing.assert_array_almost_equal(found, expected) - - def test_fileobj_wav(self): - """Saving audio via file-like object works""" - self._test_fileobj("wav") - - @skipIfFormatNotSupported("FLAC") - def test_fileobj_flac(self): - """Saving audio via file-like object works""" - self._test_fileobj("flac") - - @skipIfFormatNotSupported("NIST") - def test_fileobj_nist(self): - """Saving audio via file-like object works""" - self._test_fileobj("NIST") - - @skipIfFormatNotSupported("OGG") - def test_fileobj_ogg(self): - """Saving audio via file-like object works""" - self._test_fileobj("OGG") - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/soundfile/test_io.py b/tests/unit/audio/backends/soundfile/test_io.py deleted file mode 100644 
index 26276751f..000000000 --- a/tests/unit/audio/backends/soundfile/test_io.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import filecmp -import os -import unittest - -import numpy as np -import soundfile as sf - -import paddlespeech.audio -from ..base import BackendTest - - -class TestIO(BackendTest): - def test_load_mono_channel(self): - sf_data, sf_sr = sf.read(self.files[0]) - pa_data, pa_sr = paddlespeech.audio.load( - self.files[0], normal=False, dtype='float64') - - self.assertEqual(sf_data.dtype, pa_data.dtype) - self.assertEqual(sf_sr, pa_sr) - np.testing.assert_array_almost_equal(sf_data, pa_data) - - def test_load_multi_channels(self): - sf_data, sf_sr = sf.read(self.files[1]) - sf_data = sf_data.T # Channel dim first - pa_data, pa_sr = paddlespeech.audio.load( - self.files[1], mono=False, normal=False, dtype='float64') - - self.assertEqual(sf_data.dtype, pa_data.dtype) - self.assertEqual(sf_sr, pa_sr) - np.testing.assert_array_almost_equal(sf_data, pa_data) - - def test_save_mono_channel(self): - waveform, sr = np.random.randint( - low=-32768, high=32768, size=(48000), dtype=np.int16), 16000 - sf_tmp_file = 'sf_tmp.wav' - pa_tmp_file = 'pa_tmp.wav' - - sf.write(sf_tmp_file, waveform, sr) - paddlespeech.audio.save(waveform, sr, pa_tmp_file) - - self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file)) - for file in [sf_tmp_file, pa_tmp_file]: - os.remove(file) - - def test_save_multi_channels(self): - waveform, sr = np.random.randint( - low=-32768, high=32768, size=(2, 48000), dtype=np.int16), 16000 - sf_tmp_file = 'sf_tmp.wav' - pa_tmp_file = 'pa_tmp.wav' - - sf.write(sf_tmp_file, waveform.T, sr) - paddlespeech.audio.save(waveform.T, sr, pa_tmp_file) - - self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file)) - for file in [sf_tmp_file, pa_tmp_file]: - os.remove(file) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/unit/audio/features/base.py b/tests/unit/audio/features/base.py index 4049b6102..614fce28c 100644 --- a/tests/unit/audio/features/base.py +++ b/tests/unit/audio/features/base.py @@ -18,7 +18,7 @@ import urllib.request import numpy as np import paddle -from paddlespeech.audio.soundfile_backend import soundfile_load as load +from paddleaudio.backends import soundfile_load as load wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' diff --git a/tests/unit/audio/features/test_istft.py b/tests/unit/audio/features/test_istft.py index f1e6e4e33..23371200b 100644 --- a/tests/unit/audio/features/test_istft.py +++ b/tests/unit/audio/features/test_istft.py @@ -17,7 +17,7 @@ import numpy as np import paddle from .base import FeatTest -from paddlespeech.audio.functional.window import get_window +from paddleaudio.functional.window import get_window from paddlespeech.s2t.transform.spectrogram import IStft from paddlespeech.s2t.transform.spectrogram import Stft diff --git a/tests/unit/audio/features/test_kaldi.py b/tests/unit/audio/features/test_kaldi.py 
deleted file mode 100644 index 2b0ece890..000000000 --- a/tests/unit/audio/features/test_kaldi.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import unittest - -import numpy as np -import paddle -import torch -import torchaudio - -import paddlespeech.audio -from .base import FeatTest - - -class TestKaldi(FeatTest): - def initParmas(self): - self.window_size = 1024 - self.dtype = 'float32' - - def test_window(self): - t_hann_window = torch.hann_window( - self.window_size, periodic=False, dtype=eval(f'torch.{self.dtype}')) - t_hamm_window = torch.hamming_window( - self.window_size, - periodic=False, - alpha=0.54, - beta=0.46, - dtype=eval(f'torch.{self.dtype}')) - t_povey_window = torch.hann_window( - self.window_size, periodic=False, - dtype=eval(f'torch.{self.dtype}')).pow(0.85) - - p_hann_window = paddlespeech.audio.functional.window.get_window( - 'hann', - self.window_size, - fftbins=False, - dtype=eval(f'paddle.{self.dtype}')) - p_hamm_window = paddlespeech.audio.functional.window.get_window( - 'hamming', - self.window_size, - fftbins=False, - dtype=eval(f'paddle.{self.dtype}')) - p_povey_window = paddlespeech.audio.functional.window.get_window( - 'hann', - self.window_size, - fftbins=False, - dtype=eval(f'paddle.{self.dtype}')).pow(0.85) - - np.testing.assert_array_almost_equal(t_hann_window, p_hann_window) - np.testing.assert_array_almost_equal(t_hamm_window, p_hamm_window) - np.testing.assert_array_almost_equal(t_povey_window, p_povey_window) - - def test_fbank(self): - ta_features = torchaudio.compliance.kaldi.fbank( - torch.from_numpy(self.waveform.astype(self.dtype))) - pa_features = paddlespeech.audio.compliance.kaldi.fbank( - paddle.to_tensor(self.waveform.astype(self.dtype))) - np.testing.assert_array_almost_equal( - ta_features, pa_features, decimal=4) - - def test_mfcc(self): - ta_features = torchaudio.compliance.kaldi.mfcc( - torch.from_numpy(self.waveform.astype(self.dtype))) - pa_features = paddlespeech.audio.compliance.kaldi.mfcc( - paddle.to_tensor(self.waveform.astype(self.dtype))) - np.testing.assert_array_almost_equal( - ta_features, pa_features, decimal=4) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/unit/audio/features/test_librosa.py b/tests/unit/audio/features/test_librosa.py deleted file mode 100644 index ffdec3e78..000000000 --- a/tests/unit/audio/features/test_librosa.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import unittest - -import librosa -import numpy as np -import paddle - -import paddlespeech.audio -from .base import FeatTest -from paddlespeech.audio.functional.window import get_window - - -class TestLibrosa(FeatTest): - def initParmas(self): - self.n_fft = 512 - self.hop_length = 128 - self.n_mels = 40 - self.n_mfcc = 20 - self.fmin = 0.0 - self.window_str = 'hann' - self.pad_mode = 'reflect' - self.top_db = 80.0 - - def test_stft(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - feature_librosa = librosa.core.stft( - y=self.waveform, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=None, - window=self.window_str, - center=True, - dtype=None, - pad_mode=self.pad_mode, ) - x = paddle.to_tensor(self.waveform).unsqueeze(0) - window = get_window(self.window_str, self.n_fft, dtype=x.dtype) - feature_paddle = paddle.signal.stft( - x=x, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=None, - window=window, - center=True, - pad_mode=self.pad_mode, - normalized=False, - onesided=True, ).squeeze(0) - - np.testing.assert_array_almost_equal( - feature_librosa, feature_paddle, decimal=5) - - def test_istft(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - # Get stft result from librosa. - stft_matrix = librosa.core.stft( - y=self.waveform, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=None, - window=self.window_str, - center=True, - pad_mode=self.pad_mode, ) - - feature_librosa = librosa.core.istft( - stft_matrix=stft_matrix, - hop_length=self.hop_length, - win_length=None, - window=self.window_str, - center=True, - dtype=None, - length=None, ) - - x = paddle.to_tensor(stft_matrix).unsqueeze(0) - window = get_window( - self.window_str, - self.n_fft, - dtype=paddle.to_tensor(self.waveform).dtype) - feature_paddle = paddle.signal.istft( - x=x, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=None, - window=window, - center=True, - normalized=False, - onesided=True, - length=None, - return_complex=False, ).squeeze(0) - - np.testing.assert_array_almost_equal( - feature_librosa, feature_paddle, decimal=5) - - def test_mel(self): - feature_librosa = librosa.filters.mel( - sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - fmin=self.fmin, - fmax=None, - htk=False, - norm='slaney', - dtype=self.waveform.dtype, ) - feature_compliance = paddlespeech.audio.compliance.librosa.compute_fbank_matrix( - sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - fmin=self.fmin, - fmax=None, - htk=False, - norm='slaney', - dtype=self.waveform.dtype, ) - x = paddle.to_tensor(self.waveform) - feature_functional = paddlespeech.audio.functional.compute_fbank_matrix( - sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - f_min=self.fmin, - f_max=None, - htk=False, - norm='slaney', - dtype=x.dtype, ) - - np.testing.assert_array_almost_equal(feature_librosa, - feature_compliance) - np.testing.assert_array_almost_equal(feature_librosa, - feature_functional) - - def test_melspect(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - # librosa: - feature_librosa = librosa.feature.melspectrogram( - y=self.waveform, - sr=self.sr, - n_fft=self.n_fft, - 
hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - - # paddlespeech.audio.compliance.librosa: - feature_compliance = paddlespeech.audio.compliance.librosa.melspectrogram( - x=self.waveform, - sr=self.sr, - window_size=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin, - to_db=False) - - # paddlespeech.audio.features.layer - x = paddle.to_tensor( - self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. - feature_extractor = paddlespeech.audio.features.MelSpectrogram( - sr=self.sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - f_min=self.fmin, - dtype=x.dtype) - feature_layer = feature_extractor(x).squeeze(0).numpy() - - np.testing.assert_array_almost_equal( - feature_librosa, feature_compliance, decimal=5) - np.testing.assert_array_almost_equal( - feature_librosa, feature_layer, decimal=5) - - def test_log_melspect(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - # librosa: - feature_librosa = librosa.feature.melspectrogram( - y=self.waveform, - sr=self.sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=None) - - # paddlespeech.audio.compliance.librosa: - feature_compliance = paddlespeech.audio.compliance.librosa.melspectrogram( - x=self.waveform, - sr=self.sr, - window_size=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - - # paddlespeech.audio.features.layer - x = paddle.to_tensor( - self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. - feature_extractor = paddlespeech.audio.features.LogMelSpectrogram( - sr=self.sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - f_min=self.fmin, - dtype=x.dtype) - feature_layer = feature_extractor(x).squeeze(0).numpy() - - np.testing.assert_array_almost_equal( - feature_librosa, feature_compliance, decimal=5) - np.testing.assert_array_almost_equal( - feature_librosa, feature_layer, decimal=4) - - def test_mfcc(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - # librosa: - feature_librosa = librosa.feature.mfcc( - y=self.waveform, - sr=self.sr, - S=None, - n_mfcc=self.n_mfcc, - dct_type=2, - norm='ortho', - lifter=0, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - - # paddlespeech.audio.compliance.librosa: - feature_compliance = paddlespeech.audio.compliance.librosa.mfcc( - x=self.waveform, - sr=self.sr, - n_mfcc=self.n_mfcc, - dct_type=2, - norm='ortho', - lifter=0, - window_size=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin, - top_db=self.top_db) - - # paddlespeech.audio.features.layer - x = paddle.to_tensor( - self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. 
- feature_extractor = paddlespeech.audio.features.MFCC( - sr=self.sr, - n_mfcc=self.n_mfcc, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - f_min=self.fmin, - top_db=self.top_db, - dtype=x.dtype) - feature_layer = feature_extractor(x).squeeze(0).numpy() - - np.testing.assert_array_almost_equal( - feature_librosa, feature_compliance, decimal=4) - np.testing.assert_array_almost_equal( - feature_librosa, feature_layer, decimal=4) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/unit/audio/features/test_log_melspectrogram.py b/tests/unit/audio/features/test_log_melspectrogram.py index 59eb73e8c..0c38de22c 100644 --- a/tests/unit/audio/features/test_log_melspectrogram.py +++ b/tests/unit/audio/features/test_log_melspectrogram.py @@ -16,7 +16,7 @@ import unittest import numpy as np import paddle -import paddlespeech.audio +import paddleaudio from .base import FeatTest from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram @@ -33,7 +33,7 @@ class TestLogMelSpectrogram(FeatTest): ps_res = ps_melspect(self.waveform.T).squeeze(1).T x = paddle.to_tensor(self.waveform) - ps_melspect = paddlespeech.audio.features.LogMelSpectrogram( + ps_melspect = paddleaudio.features.LogMelSpectrogram( self.sr, self.n_fft, self.hop_length, diff --git a/tests/unit/audio/features/test_spectrogram.py b/tests/unit/audio/features/test_spectrogram.py index 7d908a7ef..50b21403b 100644 --- a/tests/unit/audio/features/test_spectrogram.py +++ b/tests/unit/audio/features/test_spectrogram.py @@ -16,7 +16,7 @@ import unittest import numpy as np import paddle -import paddlespeech.audio +import paddleaudio from .base import FeatTest from paddlespeech.s2t.transform.spectrogram import Spectrogram @@ -31,7 +31,7 @@ class TestSpectrogram(FeatTest): ps_res = ps_spect(self.waveform.T).squeeze(1).T # Magnitude x = paddle.to_tensor(self.waveform) - pa_spect = paddlespeech.audio.features.Spectrogram( + pa_spect = paddleaudio.features.Spectrogram( self.n_fft, self.hop_length, power=1.0) pa_res = pa_spect(x).squeeze(0).numpy() diff --git a/tests/unit/audio/features/test_stft.py b/tests/unit/audio/features/test_stft.py index 03448ca80..c64b5ebe6 100644 --- a/tests/unit/audio/features/test_stft.py +++ b/tests/unit/audio/features/test_stft.py @@ -17,7 +17,7 @@ import numpy as np import paddle from .base import FeatTest -from paddlespeech.audio.functional.window import get_window +from paddleaudio.functional.window import get_window from paddlespeech.s2t.transform.spectrogram import Stft
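
The retained feature tests above now import from `paddleaudio` instead of `paddlespeech.audio`. A minimal sketch of the post-patch usage, assuming the `paddleaudio` entry points shown in the hunks (`backends.soundfile_load`, `functional.window.get_window`, `features.Spectrogram`, `features.LogMelSpectrogram`) keep the signatures of their `paddlespeech.audio` counterparts:

```python
# Sketch only: paddleaudio signatures are assumed to match the old
# paddlespeech.audio ones exercised by the tests in this patch.
import paddle
import paddleaudio
from paddleaudio.backends import soundfile_load as load
from paddleaudio.functional.window import get_window

# was: paddlespeech.audio.load(...)
waveform, sr = load('zh.wav')
x = paddle.to_tensor(waveform).unsqueeze(0)  # add batch dim, as in the tests

# was: paddlespeech.audio.features.Spectrogram / LogMelSpectrogram
spect = paddleaudio.features.Spectrogram(512, 128, power=1.0)
log_melspect = paddleaudio.features.LogMelSpectrogram(sr, 512, 128)

# was: paddlespeech.audio.functional.window.get_window
window = get_window('hann', 512, dtype=x.dtype)

spec = spect(x).squeeze(0)
log_mel = log_melspect(x).squeeze(0)
```

The `n_fft=512`, `hop_length=128` values are the ones used throughout the deleted benchmark scripts.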
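
Each deleted benchmark file follows the same pytest-benchmark shape: a module-level extractor, a zero-argument wrapper, and one test per device. A condensed sketch under the new package; `paddleaudio.features.MelSpectrogram` is an assumption here, mirroring the removed `paddlespeech.audio.features.MelSpectrogram`:

```python
# Condensed from the deleted melspectrogram.py; MelSpectrogram under
# paddleaudio.features is assumed, not shown in the hunks.
import librosa
import numpy as np
import paddle
import paddleaudio
from paddleaudio.backends import soundfile_load as load

waveform, sr = load('zh.wav')
x = paddle.to_tensor(waveform).unsqueeze(0)
mel_conf = {'sr': sr, 'n_fft': 512, 'hop_length': 128, 'n_mels': 40}
extractor = paddleaudio.features.MelSpectrogram(**mel_conf, f_min=0.0, dtype=x.dtype)


def melspectrogram():
    return extractor(x).squeeze(0)


def test_melspect_cpu(benchmark):
    paddle.set_device('cpu')
    # benchmark() calls the wrapper repeatedly and times it; the OPS column
    # in the report above is computed as 1 / Mean.
    feature = benchmark(melspectrogram)
    expected = librosa.feature.melspectrogram(waveform, **mel_conf)
    np.testing.assert_array_almost_equal(expected, feature, decimal=3)
```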
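
The removed soundfile backend tests reduce to save/info/load round-trips against the `soundfile` package. A minimal self-contained round-trip in their style, using the pre-patch `paddlespeech.audio.backends.soundfile_backend` path recorded in the deletions:

```python
# Minimal round-trip in the style of the deleted info_test.py; the module
# path is the pre-patch one recorded in the removed files.
import os
import tempfile
import unittest

import paddle
import soundfile

from paddlespeech.audio.backends import soundfile_backend


class TestInfoRoundTrip(unittest.TestCase):
    def test_flac_info(self):
        sample_rate, num_channels = 8000, 2
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'data.flac')
            # One second of random audio, written as FLAC (default PCM_16).
            data = paddle.randn(shape=[sample_rate, num_channels]).numpy()
            soundfile.write(path, data, sample_rate)

            info = soundfile_backend.info(path)
            assert info.sample_rate == sample_rate
            assert info.num_channels == num_channels
            assert info.bits_per_sample == 16
            assert info.encoding == 'FLAC'


if __name__ == '__main__':
    unittest.main()
```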