PaddleSpeech/demos/echo_hub/hub_infer.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import librosa
import paddle
import paddlehub as hub
import soundfile as sf

# yapf: disable
# Command-line interface of the demo. Parsing happens at import time and the
# result is stored in the module-level name `args`.
# `__doc__` is passed as `description`: the first positional parameter of
# ArgumentParser is `prog`, so passing it positionally would set the program
# name shown in the usage line instead of the help description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'],
                    help="language of the text; selects the model pair to load")
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'],
                    help="device to run on")
# `--text` and `--output_dir` have no usable default, so they are required:
# omitting them now yields a usage error instead of a failure on `None` later.
parser.add_argument("--text", type=str, nargs='+', required=True,
                    help="text to synthesize; multiple words are joined with spaces")
parser.add_argument("--output_dir", type=str, required=True,
                    help="output directory passed to the text-to-speech module")
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    # Echo demo: synthesize a wav file from the given text, resample it to
    # 16 kHz, then run speech recognition on the resampled file and print
    # the recognized text.
    paddle.set_device(args.device)

    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
    # Pick the text-to-speech / speech-to-text module pair for the language.
    if args.lang == 'zh':
        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
        s2t_model = hub.Module(name='u2_conformer_aishell')
    else:
        t2s_model = hub.Module(
            name='fastspeech2_ljspeech', output_dir=output_dir)
        s2t_model = hub.Module(name='u2_conformer_librispeech')

    # `--text` uses nargs='+', so it arrives as a list of words.
    if isinstance(args.text, list):
        args.text = ' '.join(args.text)

    wavs = t2s_model.generate([args.text], device=args.device)
    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))
    # convert sr to 16k
    # sr=None keeps the file's native sample rate; the librosa default would
    # first resample to 22050 Hz, so the audio would be resampled twice.
    x, sr = librosa.load(wavs[0], sr=None)
    # Sample rates are passed by keyword: they are keyword-only arguments in
    # recent librosa releases, and the keyword names also work in older ones.
    y = librosa.resample(x, orig_sr=sr, target_sr=16000)
    # Insert the suffix before the extension only. str.replace would rewrite
    # every '.wav' occurrence in the path (directory names included) and
    # would return the input path unchanged for a different extension.
    wav_root, wav_ext = os.path.splitext(wavs[0])
    wav_16k = wav_root + '_16k' + wav_ext
    sf.write(wav_16k, y, 16000)
    print('[S2T]Resample to 16k: {}'.format(wav_16k))
    text = s2t_model.speech_recognize(wav_16k)
    print('[S2T]Text recognized from wav file: {}'.format(text))
Add echo_hub, asr_hub and tts_hub demos. 3 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`import argparse`
			`import os`

			`import librosa`
			`import paddle`
			`import paddlehub as hub`
			`import soundfile as sf`

			`# yapf: disable`
			`parser = argparse.ArgumentParser(__doc__)`
			`parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])`
			`parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])`
			`parser.add_argument("--text", type=str, nargs='+')`
			`parser.add_argument("--output_dir", type=str)`
			`args = parser.parse_args()`
			`# yapf: enable`

			`if __name__ == '__main__':`
			`paddle.set_device(args.device)`

			`output_dir = os.path.abspath(os.path.expanduser(args.output_dir))`
			`if args.lang == 'zh':`
			`t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)`
			`s2t_model = hub.Module(name='u2_conformer_aishell')`
			`else:`
			`t2s_model = hub.Module(`
			`name='fastspeech2_ljspeech', output_dir=output_dir)`
			`s2t_model = hub.Module(name='u2_conformer_librispeech')`

			`if isinstance(args.text, list):`
			`args.text = ' '.join(args.text)`

			`wavs = t2s_model.generate([args.text], device=args.device)`
			`print('[T2S]Wav file has been generated: {}'.format(wavs[0]))`
			`# convert sr to 16k`
			`x, sr = librosa.load(wavs[0])`
			`y = librosa.resample(x, sr, 16000)`
			`wav_16k = wavs[0].replace('.wav', '_16k.wav')`
			`sf.write(wav_16k, y, 16000)`
			`print('[S2T]Resample to 16k: {}'.format(wav_16k))`
			`text = s2t_model.speech_recognize(wav_16k)`
			`print('[S2T]Text recognized from wav file: {}'.format(text))`