diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
index ec24be511..686de9363 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
@@ -28,91 +28,6 @@ from paddlespeech.vector.training.seeding import seed_everything
 logger = Log(__name__).getlog()
-class VectorWrapper:
-    """ VectorWrapper extract the audio embedding,
-        and single audio will get only an embedding
-    """
-    def __init__(self,
-                 device,
-                 config_path,
-                 model_path,):
-        super(VectorWrapper, self).__init__()
-        # stage 0: config the
-        self.device = device
-        self.config_path = config_path
-        self.model_path = model_path
-
-        # stage 1: set the run host device
-        paddle.device.set_device(device)
-
-        # stage 2: read the yaml config and set the seed factor
-        self.read_yaml_config(self.config_path)
-        seed_everything(self.config.seed)
-
-        # stage 3: init the speaker verification model
-        self.init_vector_model(self.config, self.model_path)
-
-    def read_yaml_config(self, config_path):
-        """Read the yaml config from the config path
-
-        Args:
-            config_path (str): yaml config path
-        """
-        config = CfgNode(new_allowed=True)
-
-        if config_path:
-            config.merge_from_file(config_path)
-
-        config.freeze()
-        self.config = config
-
-    def init_vector_model(self, config, model_path):
-        """Init the vector model from yaml config
-
-        Args:
-            config (CfgNode): yaml config
-            model_path (str): pretrained model path and the stored model is named as model.pdparams
-        """
-        # get the backbone network instance
-        ecapa_tdnn = EcapaTdnn(**config.model)
-
-        # get the sid instance
-        model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=config.num_speakers)
-
-        # read the model parameters to sid model
-        model_path = os.path.abspath(os.path.expanduser(model_path))
-        state_dict = paddle.load(os.path.join(model_path, "model.pdparams"))
-        model.set_state_dict(state_dict)
-
-        model.eval()
-        self.model = model
-
-    def extract_audio_embedding(self, audio_path):
-        """Extract the audio embedding
-
-        Args:
-            audio_path (str): audio path, which will be extracted the embedding
-
-        Returns:
-            embedding (numpy.array) : audio embedding
-        """
-        waveform, sr = load_audio(audio_path)
-        feat = melspectrogram(x=waveform,
-                              sr=self.config.sr,
-                              n_mels=self.config.n_mels,
-                              window_size=self.config.window_size,
-                              hop_length=self.config.hop_size)
-        # conver the audio feat to batch shape, which means batch_size is equal to one
-        feat = paddle.to_tensor(feat).unsqueeze(0)
-
-        # in inference period, the lengths is all one without padding
-        lengths = paddle.ones([1])
-        feat = feature_normalize(feat, mean_norm=True, std_norm=False)
-
-        # model backbone network forward the feats and get the embedding
-        embedding = self.model.backbone(feat, lengths).squeeze().numpy()  # (1, emb_size, 1) -> (emb_size)
-
-        return embedding
 def extract_audio_embedding(args, config):
     # stage 0: set the training device, cpu or gpu
@@ -168,7 +83,7 @@ def extract_audio_embedding(args, config):
     # stage 5: do global norm with external mean and std
     rtf = elapsed_time / audio_length
     logger.info(f"{args.device} rft={rtf}")
-    paddle.save(embedding, "emb1")
+    return embedding
@@ -177,7 +92,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(__doc__)
     parser.add_argument('--device',
                         choices=['cpu', 'gpu'],
-                        default="gpu",
+                        default="cpu",
                         help="Select which device to train model, defaults to gpu.")
     parser.add_argument("--config",
                         default=None,
@@ -202,10 +117,3 @@ if __name__ == "__main__":
     print(config)
     extract_audio_embedding(args, config)
-
-    # use the VectorWrapper to extract the audio embedding
-    vector_inst = VectorWrapper(device="gpu",
-                                config_path=args.config,
-                                model_path=args.load_checkpoint)
-
-    embedding = vector_inst.extract_audio_embedding(args.audio_path)
diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index fc3b8248d..f2437eaf5 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -1,49 +1,58 @@
 #!/bin/bash
 set -e
-# Audio classification
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
-paddlespeech cls --input ./cat.wav --topk 10
-
-# Punctuation_restoration
-paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
-
-# Speech_recognition
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
-paddlespeech asr --input ./zh.wav
-paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
-
-# Text To Speech
-paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
-paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
-paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
-paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
-paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
-paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
-paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
-
-
-# Speech Translation (only support linux)
-paddlespeech st --input ./en.wav
-
-
-# batch process
-echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
-
-# shell pipeline
-paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
-
-# stats
-paddlespeech stats --task asr
-paddlespeech stats --task tts
-paddlespeech stats --task cls
+# # Audio classification
+# wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
+# paddlespeech cls --input ./cat.wav --topk 10
+
+# # Punctuation_restoration
+# paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
+
+# # Speech_recognition
+# wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+# paddlespeech asr --input ./zh.wav
+# paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
+
+# # Text To Speech
+# paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
+# paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
+# paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+# paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+# paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
+# paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
+# paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+
+
+# # Speech Translation (only support linux)
+# paddlespeech st --input ./en.wav
+
+
+# # batch process
+# echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
+
+# # shell pipeline
+# paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
+
+# # stats
+# paddlespeech stats --task asr
+# paddlespeech stats --task tts
+# paddlespeech stats --task cls
 # Speaker Verification
 wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
 paddlespeech vector --task spk --input 85236145389.wav
+
+echo "demo 85236145389.wav" > vec.job
+paddlespeech vector --task spk --input vec.job
+
+echo "demo 85236145389.wav" | paddlespeech vector --task spk
+rm 85236145389.wav
+rm vec.job
+
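
Usage note (not part of the patch): since paddle.save(embedding, "emb1") is replaced by return embedding, a caller can now consume the embedding in memory instead of reading the "emb1" file from disk. Below is a minimal sketch, not the project's documented API, assuming the script keeps only the arguments visible in this diff (--device, --config, --load_checkpoint, --audio_path), loading the yaml config the way the removed read_yaml_config helper did; the config and checkpoint paths are placeholders.

import argparse

from yacs.config import CfgNode

from paddlespeech.vector.exps.ecapa_tdnn.extract_emb import extract_audio_embedding

# Hypothetical arguments mirroring the script's own CLI flags; paths are placeholders.
args = argparse.Namespace(
    device="cpu",                                   # matches the new default in this patch
    config="conf/ecapa_tdnn.yaml",                  # placeholder yaml config path
    load_checkpoint="exp/ecapa_tdnn/checkpoints",   # placeholder pretrained model dir
    audio_path="85236145389.wav",                   # sample wav used in test_cli.sh
)

# Load the yaml config the same way the removed read_yaml_config helper did.
config = CfgNode(new_allowed=True)
config.merge_from_file(args.config)
config.freeze()

# With this patch the function returns the embedding rather than saving "emb1";
# the removed wrapper returned a numpy vector, so the same is expected here.
embedding = extract_audio_embedding(args, config)
print(embedding)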