diff --git a/demos/speech_server/cls_client.sh b/demos/speech_server/cls_client.sh new file mode 100644 index 000000000..5797aa204 --- /dev/null +++ b/demos/speech_server/cls_client.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --topk 1 diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 32e1ec4de..ffe721b8f 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -69,7 +69,7 @@ GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/ Model Type | Dataset| Example Link | Pretrained Models | Static Models :-------------:| :------------:| :-----: | :-----: | :-----: -PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) | [panns_cnn6_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz), [panns_cnn10_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz), [panns_cnn14_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz) +PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) | [panns_cnn6_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz)(18M), [panns_cnn10_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz)(19M), [panns_cnn14_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz)(289M) PANN | ESC-50 |[pann-esc50](../../examples/esc50/cls0)|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz) ## Punctuation Restoration Models diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 939779b74..c13aadeb3 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -70,13 +70,10 @@ class TTSClientExecutor(BaseExecutor): choices=[0, 8000, 16000], help='Sampling rate, the default is the same as the model') self.parser.add_argument( - '--output', - type=str, - default="./output.wav", - help='Synthesized audio file') + '--output', type=str, default=None, help='Synthesized audio file') - def postprocess(self, response_dict: dict, outfile: str) -> float: - wav_base64 = response_dict["result"]["audio"] + def postprocess(self, wav_base64: str, outfile: str) -> float: + #wav_base64 = response_dict["result"]["audio"] audio_data_byte = base64.b64decode(wav_base64) # from byte samples, sample_rate = soundfile.read( @@ -93,37 +90,38 @@ class TTSClientExecutor(BaseExecutor): else: logger.error("The format for saving audio only supports wav or pcm") - duration = len(samples) / sample_rate - return duration - def execute(self, argv: List[str]) -> bool: args = self.parser.parse_args(argv) - try: - url = 'http://' + args.server_ip + ":" + str( - args.port) + '/paddlespeech/tts' - request = { - "text": args.input, - "spk_id": args.spk_id, - "speed": args.speed, - "volume": args.volume, - "sample_rate": args.sample_rate, - "save_path": args.output - } - st = time.time() - response = requests.post(url, json.dumps(request)) - time_consume = time.time() - st - - response_dict = response.json() - duration = self.postprocess(response_dict, args.output) + input_ = args.input + server_ip = args.server_ip + port = args.port + spk_id = args.spk_id + speed = args.speed + volume = args.volume + sample_rate = args.sample_rate + output = args.output + try: + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + spk_id=spk_id, + speed=speed, + volume=volume, + sample_rate=sample_rate, + output=output) + time_end = time.time() + time_consume = time_end - time_start + response_dict = res.json() logger.info(response_dict["message"]) - logger.info("Save synthesized audio successfully on %s." % - (args.output)) - logger.info("Audio duration: %f s." % (duration)) + logger.info("Save synthesized audio successfully on %s." % (output)) + logger.info("Audio duration: %f s." % + (response_dict['result']['duration'])) logger.info("Response time: %f s." % (time_consume)) - return True - except BaseException: + except Exception as e: logger.error("Failed to synthesized audio.") return False @@ -136,7 +134,7 @@ class TTSClientExecutor(BaseExecutor): speed: float=1.0, volume: float=1.0, sample_rate: int=0, - output: str="./output.wav"): + output: str=None): """ Python API to call an executor. """ @@ -151,20 +149,11 @@ class TTSClientExecutor(BaseExecutor): "save_path": output } - try: - st = time.time() - response = requests.post(url, json.dumps(request)) - time_consume = time.time() - st - response_dict = response.json() - duration = self.postprocess(response_dict, output) - - print(response_dict["message"]) - print("Save synthesized audio successfully on %s." % (output)) - print("Audio duration: %f s." % (duration)) - print("Response time: %f s." % (time_consume)) - print("RTF: %f " % (time_consume / duration)) - except BaseException: - print("Failed to synthesized audio.") + res = requests.post(url, json.dumps(request)) + response_dict = res.json() + if not output: + self.postprocess(response_dict["result"]["audio"], output) + return res @cli_client_register( @@ -193,24 +182,27 @@ class ASRClientExecutor(BaseExecutor): def execute(self, argv: List[str]) -> bool: args = self.parser.parse_args(argv) - url = 'http://' + args.server_ip + ":" + str( - args.port) + '/paddlespeech/asr' - audio = wav2base64(args.input) - data = { - "audio": audio, - "audio_format": args.audio_format, - "sample_rate": args.sample_rate, - "lang": args.lang, - } - time_start = time.time() + input_ = args.input + server_ip = args.server_ip + port = args.port + sample_rate = args.sample_rate + lang = args.lang + audio_format = args.audio_format + try: - r = requests.post(url=url, data=json.dumps(data)) - # ending Timestamp + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + sample_rate=sample_rate, + lang=lang, + audio_format=audio_format) time_end = time.time() - logger.info(r.json()) - logger.info("time cost %f s." % (time_end - time_start)) + logger.info(res.json()) + logger.info("Response time %f s." % (time_end - time_start)) return True - except BaseException: + except Exception as e: logger.error("Failed to speech recognition.") return False @@ -234,15 +226,9 @@ class ASRClientExecutor(BaseExecutor): "sample_rate": sample_rate, "lang": lang, } - time_start = time.time() - try: - r = requests.post(url=url, data=json.dumps(data)) - # ending Timestamp - time_end = time.time() - print(r.json()) - print("time cost %f s." % (time_end - time_start)) - except BaseException: - print("Failed to speech recognition.") + + res = requests.post(url=url, data=json.dumps(data)) + return res @cli_client_register( @@ -270,22 +256,19 @@ class CLSClientExecutor(BaseExecutor): def execute(self, argv: List[str]) -> bool: args = self.parser.parse_args(argv) - url = 'http://' + args.server_ip + ":" + str( - args.port) + '/paddlespeech/cls' - audio = wav2base64(args.input) - data = { - "audio": audio, - "topk": args.topk, - } - time_start = time.time() + input_ = args.input + server_ip = args.server_ip + port = args.port + topk = args.topk + try: - r = requests.post(url=url, data=json.dumps(data)) - # ending Timestamp + time_start = time.time() + res = self(input=input_, server_ip=server_ip, port=port, topk=topk) time_end = time.time() - logger.info(r.json()) + logger.info(res.json()) logger.info("Response time %f s." % (time_end - time_start)) return True - except BaseException: + except Exception as e: logger.error("Failed to speech classification.") return False @@ -302,12 +285,6 @@ class CLSClientExecutor(BaseExecutor): url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/cls' audio = wav2base64(input) data = {"audio": audio, "topk": topk} - time_start = time.time() - try: - r = requests.post(url=url, data=json.dumps(data)) - # ending Timestamp - time_end = time.time() - print(r.json()) - print("Response time %f s." % (time_end - time_start)) - except BaseException: - print("Failed to speech classification.") + + res = requests.post(url=url, data=json.dumps(data)) + return res diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 6d23fb5be..db8813ba9 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -531,4 +531,4 @@ class TTSEngine(BaseEngine): postprocess_time)) logger.info("RTF: {}".format(rtf)) - return lang, target_sample_rate, wav_base64 + return lang, target_sample_rate, duration, wav_base64 diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index 8d6c7fd17..f153f60b9 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -250,4 +250,4 @@ class TTSEngine(BaseEngine): logger.info("RTF: {}".format(rtf)) logger.info("device: {}".format(self.device)) - return lang, target_sample_rate, wav_base64 + return lang, target_sample_rate, duration, wav_base64 diff --git a/paddlespeech/server/restful/response.py b/paddlespeech/server/restful/response.py index 9b647a63c..ce3f34a77 100644 --- a/paddlespeech/server/restful/response.py +++ b/paddlespeech/server/restful/response.py @@ -54,10 +54,11 @@ class ASRResponse(BaseModel): #****************************************************************************************/ class TTSResult(BaseModel): lang: str = "zh" - sample_rate: int spk_id: int = 0 speed: float = 1.0 volume: float = 1.0 + sample_rate: int + duration: float save_path: str = None audio: str diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py index 0af0f6d07..4e9bbe23e 100644 --- a/paddlespeech/server/restful/tts_api.py +++ b/paddlespeech/server/restful/tts_api.py @@ -98,7 +98,7 @@ def tts(request_body: TTSRequest): tts_engine = engine_pool['tts'] logger.info("Get tts engine successfully.") - lang, target_sample_rate, wav_base64 = tts_engine.run( + lang, target_sample_rate, duration, wav_base64 = tts_engine.run( text, spk_id, speed, volume, sample_rate, save_path) response = { @@ -113,6 +113,7 @@ def tts(request_body: TTSRequest): "speed": speed, "volume": volume, "sample_rate": target_sample_rate, + "duration": duration, "save_path": save_path, "audio": wav_base64 }