Merge branch 'PaddlePaddle:develop' into develop

pull/3803/head
gmm 1 year ago committed by GitHub
commit ef7ec90ee8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -37,7 +37,7 @@ class FeatTest(unittest.TestCase):
self.waveform, self.sr = load(os.path.abspath(os.path.basename(url))) self.waveform, self.sr = load(os.path.abspath(os.path.basename(url)))
self.waveform = self.waveform.astype( self.waveform = self.waveform.astype(
np.float32 np.float32
) # paddlespeech.s2t.transform.spectrogram only supports float32 ) # paddlespeech.audio.transform.spectrogram only supports float32
dim = len(self.waveform.shape) dim = len(self.waveform.shape)
assert dim in [1, 2] assert dim in [1, 2]

@ -18,8 +18,8 @@ import paddle
from paddleaudio.functional.window import get_window from paddleaudio.functional.window import get_window
from .base import FeatTest from .base import FeatTest
from paddlespeech.s2t.transform.spectrogram import IStft from paddlespeech.audio.transform.spectrogram import IStft
from paddlespeech.s2t.transform.spectrogram import Stft from paddlespeech.audio.transform.spectrogram import Stft
class TestIstft(FeatTest): class TestIstft(FeatTest):

@ -18,7 +18,7 @@ import paddle
import paddleaudio import paddleaudio
from .base import FeatTest from .base import FeatTest
from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram
class TestLogMelSpectrogram(FeatTest): class TestLogMelSpectrogram(FeatTest):

@ -18,7 +18,7 @@ import paddle
import paddleaudio import paddleaudio
from .base import FeatTest from .base import FeatTest
from paddlespeech.s2t.transform.spectrogram import Spectrogram from paddlespeech.audio.transform.spectrogram import Spectrogram
class TestSpectrogram(FeatTest): class TestSpectrogram(FeatTest):

@ -18,7 +18,7 @@ import paddle
from paddleaudio.functional.window import get_window from paddleaudio.functional.window import get_window
from .base import FeatTest from .base import FeatTest
from paddlespeech.s2t.transform.spectrogram import Stft from paddlespeech.audio.transform.spectrogram import Stft
class TestStft(FeatTest): class TestStft(FeatTest):

@ -18,4 +18,4 @@ This directory contains many speech applications in multiple scenarios.
* style_fs2 - multi style control for FastSpeech2 model * style_fs2 - multi style control for FastSpeech2 model
* text_to_speech - convert text into speech * text_to_speech - convert text into speech
* self supervised pretraining - speech feature extraction and speech recognition based on wav2vec2 * self supervised pretraining - speech feature extraction and speech recognition based on wav2vec2
* Wishper - speech recognize and translate based on Whisper model * Whisper - speech recognize and translate based on Whisper model

@ -236,8 +236,8 @@
"warnings.filterwarnings('ignore')\n", "warnings.filterwarnings('ignore')\n",
"\n", "\n",
"from yacs.config import CfgNode\n", "from yacs.config import CfgNode\n",
"from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi\n", "from paddlespeech.audio.transform.spectrogram import LogMelSpectrogramKaldi\n",
"from paddlespeech.s2t.transform.cmvn import GlobalCMVN\n", "from paddlespeech.audio.transform.cmvn import GlobalCMVN\n",
"from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n", "from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n",
"from paddlespeech.s2t.models.u2 import U2Model\n", "from paddlespeech.s2t.models.u2 import U2Model\n",
"\n", "\n",

@ -14,6 +14,7 @@
# Modified from espnet(https://github.com/espnet/espnet) # Modified from espnet(https://github.com/espnet/espnet)
import io import io
import os import os
import sys
import h5py import h5py
import librosa import librosa
@ -98,7 +99,7 @@ class SoundHDF5File():
def __contains__(self, item): def __contains__(self, item):
return item in self.file return item in self.file
def __len__(self, item): def __len__(self):
return len(self.file) return len(self.file)
def __enter__(self): def __enter__(self):

@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor):
# fbank # fbank
audio = preprocessing(audio, **preprocess_args) audio = preprocessing(audio, **preprocess_args)
audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) audio_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0)
audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
self._inputs["audio"] = audio self._inputs["audio"] = audio

@ -14,9 +14,12 @@
# Modified from espnet(https://github.com/espnet/espnet) # Modified from espnet(https://github.com/espnet/espnet)
from collections import OrderedDict from collections import OrderedDict
import io
import os
import kaldiio import kaldiio
import numpy as np import numpy as np
import soundfile import soundfile
import h5py
from .utility import feat_type from .utility import feat_type
from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.audio.transform.transformation import Transformation
@ -401,7 +404,7 @@ class SoundHDF5File():
def __contains__(self, item): def __contains__(self, item):
return item in self.file return item in self.file
def __len__(self, item): def __len__(self):
return len(self.file) return len(self.file)
def __enter__(self): def __enter__(self):

@ -154,7 +154,7 @@ class TTSServerExecutor(TTSExecutor):
self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf) self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf)
logger.debug("Create voc sess successfully.") logger.debug("Create voc sess successfully.")
with open(self.phones_dict, "r") as f: with open(self.phones_dict, "r", encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()] phn_id = [line.strip().split() for line in f.readlines()]
self.vocab_size = len(phn_id) self.vocab_size = len(phn_id)
logger.debug(f"vocab_size: {self.vocab_size}") logger.debug(f"vocab_size: {self.vocab_size}")

@ -237,30 +237,25 @@ class ToneSandhi():
# output seg: [['听一听', 'v']] # output seg: [['听一听', 'v']]
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = [] new_seg = []
skip_next = False
# function 1 # function 1
for i, (word, pos) in enumerate(seg): for i, (word, pos) in enumerate(seg):
if i - 1 >= 0 and word == "" and i + 1 < len(seg) and seg[i - 1][ if skip_next:
0] == seg[i + 1][0] and seg[i - 1][1] == "v": skip_next = False
if i - 1 < len(new_seg): continue
new_seg[i - if i - 1 >= 0 and word == "" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v":
1][0] = new_seg[i - 1][0] + "" + new_seg[i - 1][0] new_seg[-1] = (new_seg[-1][0] + "" + seg[i + 1][0], new_seg[-1][1])
else: skip_next = True
new_seg.append([word, pos])
new_seg.append([seg[i + 1][0], pos])
else: else:
if i - 2 >= 0 and seg[i - 1][0] == "" and seg[i - 2][ new_seg.append((word, pos))
0] == word and pos == "v":
continue
else:
new_seg.append([word, pos])
seg = new_seg seg = new_seg
new_seg = [] new_seg = []
# function 2 # function 2
for i, (word, pos) in enumerate(seg): for i, (word, pos) in enumerate(seg):
if new_seg and new_seg[-1][0] == "": if new_seg and new_seg[-1][0] == "":
new_seg[-1][0] = new_seg[-1][0] + word new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
else: else:
new_seg.append([word, pos]) new_seg.append((word, pos))
return new_seg return new_seg
# the first and the second words are all_tone_three # the first and the second words are all_tone_three

@ -53,6 +53,7 @@ base = [
"pandas", "pandas",
"paddleaudio>=1.1.0", "paddleaudio>=1.1.0",
"paddlenlp>=2.4.8", "paddlenlp>=2.4.8",
"paddlepaddle==2.5.1",
"paddleslim>=2.3.4", "paddleslim>=2.3.4",
"ppdiffusers>=0.9.0", "ppdiffusers>=0.9.0",
"paddlespeech_feat", "paddlespeech_feat",

@ -10,11 +10,12 @@ paddlespeech cls --input ./cat.wav --topk 10
paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast
# Speech SSL # Speech SSL
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
paddlespeech ssl --task asr --lang en --input ./en.wav paddlespeech ssl --task asr --lang en --input ./en.wav
paddlespeech ssl --task vector --lang en --input ./en.wav paddlespeech ssl --task vector --lang en --input ./en.wav
# Speech_recognition # Speech_recognition
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
paddlespeech asr --input ./zh.wav paddlespeech asr --input ./zh.wav
paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_aishell --input ./zh.wav
paddlespeech asr --model conformer_online_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav
@ -110,5 +111,7 @@ paddlespeech whisper --task transcribe --input ./zh.wav
# whisper recognize text and translate to English # whisper recognize text and translate to English
paddlespeech whisper --task translate --input ./zh.wav paddlespeech whisper --task translate --input ./zh.wav
# switch to the English-only model
paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
echo -e "\033[32mTest success !!!\033[0m" echo -e "\033[32mTest success !!!\033[0m"

@ -0,0 +1,29 @@
# test CLI 测试文档
该文档为 CLI 测试说明,该测试目前覆盖大部分 paddlespeech 中的 CLI 推理。该 CI 建立后用于快速验证修复是否正确。
# 测试流程
## 1. 环境安装
CI 重建时在已有通过版本 paddlepaddle-gpu==2.5.1, paddlespeech==develop 下运行。
CI 重建后在 paddlepaddle-gpu==develop, paddlespeech==develop 下运行。
### 其他相关依赖
gcc >= 4.8.5,
python >= 3.8
## 2. 功能测试
在 repo 的 tests/unit/cli 中运行:
```shell
source path.sh
bash test_cli.sh
```
## 3. 预期结果
输出 "Test success",且运行过程中无报错或 Error 即为成功。

@ -6,7 +6,7 @@ import kaldiio
import numpy import numpy
from distutils.util import strtobool from distutils.util import strtobool
from paddlespeech.s2t.transform.cmvn import CMVN from paddlespeech.audio.transform.cmvn import CMVN
from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@ -5,7 +5,7 @@ import logging
import kaldiio import kaldiio
import numpy as np import numpy as np
from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@ -4,7 +4,7 @@ import logging
from distutils.util import strtobool from distutils.util import strtobool
from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@ -3,7 +3,7 @@ import argparse
import logging import logging
import sys import sys
from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

Loading…
Cancel
Save