From 4e7106d9e2a3eb9ee5ab870dcae3a3c59eac338e Mon Sep 17 00:00:00 2001
From: 0x45f
Date: Wed, 27 Jul 2022 09:32:11 +0000
Subject: [PATCH] Support dy2st

---
 paddlespeech/s2t/exps/u2/model.py                 | 165 +++++++++++++++++-
 paddlespeech/s2t/models/u2/u2.py                  |  42 ++++-
 .../engine/asr/online/python/asr_engine.py        |  17 +-
 3 files changed, 210 insertions(+), 14 deletions(-)

diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index cdad3b8f..b41f320b 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -471,6 +471,165 @@ class U2Tester(U2Trainer):
         infer_model, input_spec = self.load_inferspec()
         assert isinstance(input_spec, list), type(input_spec)
         infer_model.eval()
-        static_model = paddle.jit.to_static(infer_model, input_spec=input_spec)
-        logger.info(f"Export code: {static_model.forward.code}")
-        paddle.jit.save(static_model, self.args.export_path)
+        # static_model = paddle.jit.to_static(infer_model, input_spec=input_spec)
+        # logger.info(f"Export code: {static_model.forward.code}")
+        # paddle.jit.save(static_model, self.args.export_path)
+
+        # # to check outputs
+        # def flatten(out):
+        #     if isinstance(out, paddle.Tensor):
+        #         return [out]
+
+        #     flatten_out = []
+        #     for var in out:
+        #         if isinstance(var, (list, tuple)):
+        #             flatten_out.extend(flatten(var))
+        #         else:
+        #             flatten_out.append(var)
+        #     return flatten_out
+
+        # ######################### infer_model.forward_attention_decoder ########################
+        # a = paddle.full(shape=[10, 8], fill_value=10, dtype='int64')
+        # b = paddle.full(shape=[10], fill_value=8, dtype='int64')
+        # # c = paddle.rand(shape=[1, 20, 512], dtype='float32')
+        # c = paddle.full(shape=[1, 20, 512], fill_value=1, dtype='float32')
+
+        # out1 = infer_model.forward_attention_decoder(a, b, c)
+        # print(out1)
+
+        # input_spec = [paddle.static.InputSpec(shape=[None, None], dtype='int64'),
+        #               paddle.static.InputSpec(shape=[None], dtype='int64'),
+        #               paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]
+        # static_model = paddle.jit.to_static(infer_model.forward_attention_decoder, input_spec=input_spec)
+        # paddle.jit.save(static_model, self.args.export_path)
+        # static_model = paddle.jit.load(self.args.export_path)
+        # out2 = static_model(a, b, c)
+        # # print(out2)
+
+        # out1 = flatten(out1)
+        # out2 = flatten(out2)
+        # for i in range(len(out1)):
+        #     print(np.equal(out1[i].numpy(), out2[i].numpy()).all())
+
+        # ######################### infer_model.forward_encoder_chunk ########################
+        # xs = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        # offset = paddle.to_tensor([80], dtype='int32')
+        # required_cache_size = -16
+        # att_cache = paddle.randn(shape=[12, 8, 80, 128], dtype='float32')
+        # cnn_cache = paddle.randn(shape=[12, 1, 512, 14], dtype='float32')
+        # # out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache, cnn_cache)
+        # # print(out1)
+        # zero_out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache=paddle.zeros([0, 0, 0, 0]), cnn_cache=paddle.zeros([0, 0, 0, 0]))
+        # # print(zero_out1)
+
+        # input_spec = [
+        #     paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'),
+        #     paddle.static.InputSpec(shape=[1], dtype='int32'),
+        #     -16,
+        #     paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'),
+        #     paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]
+        # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec)
+        # paddle.jit.save(static_model, self.args.export_path)
+        # static_model = paddle.jit.load(self.args.export_path)
+        # # out2 = static_model(xs, offset, att_cache, cnn_cache)
+        # # print(out2)
+        # zero_out2 = static_model(xs, offset, paddle.zeros([0, 0, 0, 0]), paddle.zeros([0, 0, 0, 0]))
+
+        # # out1 = flatten(out1)
+        # # out2 = flatten(out2)
+        # # for i in range(len(out1)):
+        # #     print(np.equal(out1[i].numpy(), out2[i].numpy()).all())
+
+        # zero_out1 = flatten(zero_out1)
+        # zero_out2 = flatten(zero_out2)
+        # for i in range(len(zero_out1)):
+        #     print(np.equal(zero_out1[i].numpy(), zero_out2[i].numpy()).all())
+
+        # ######################### infer_model.forward_encoder_chunk zero Tensor online ########################
+        # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        # offset = paddle.to_tensor([0], dtype='int32')
+        # required_cache_size = -16
+        # att_cache = paddle.zeros([0, 0, 0, 0])
+        # cnn_cache=paddle.zeros([0, 0, 0, 0])
+
+        # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache)
+        # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        # offset = paddle.to_tensor([16], dtype='int32')
+        # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache)
+        # # print(out1)
+
+        # input_spec = [
+        #     paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'),
+        #     paddle.static.InputSpec(shape=[1], dtype='int32'),
+        #     -16,
+        #     paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'),
+        #     paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]
+        # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec)
+        # paddle.jit.save(static_model, self.args.export_path)
+        # static_model = paddle.jit.load(self.args.export_path)
+
+        # offset = paddle.to_tensor([0], dtype='int32')
+        # att_cache = paddle.zeros([0, 0, 0, 0])
+        # cnn_cache=paddle.zeros([0, 0, 0, 0])
+        # xs, att_cache, cnn_cache = static_model(xs1, offset, att_cache, cnn_cache)
+        # xs = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        # offset = paddle.to_tensor([16], dtype='int32')
+        # out2 = static_model(xs2, offset, att_cache, cnn_cache)
+        # # print(out2)
+
+        # out1 = flatten(out1)
+        # out2 = flatten(out2)
+        # for i in range(len(out1)):
+        #     print(np.equal(out1[i].numpy(), out2[i].numpy()).all())
+
+        ###################### save/load combine ########################
+        paddle.jit.save(infer_model, '/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', combine_params=True)
+
+        # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        # offset = paddle.to_tensor([0], dtype='int32')
+        # required_cache_size = -16
+        # att_cache = paddle.zeros([0, 0, 0, 0])
+        # cnn_cache=paddle.zeros([0, 0, 0, 0])
+
+        # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache)
+        # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        # offset = paddle.to_tensor([16], dtype='int32')
+        # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache)
+        # # print(out1)
+
+        # from paddle.jit.layer import Layer
+        # layer = Layer()
+        # layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(0))
+
+        # offset = paddle.to_tensor([0], dtype='int32')
+        # att_cache = paddle.zeros([0, 0, 0, 0])
+        # cnn_cache=paddle.zeros([0, 0, 0, 0])
+        # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache)
+        # offset = paddle.to_tensor([16], dtype='int32')
+        # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache)
+        # # print(out2)
+
+        # out1 = flatten(out1)
+        # out2 = flatten(out2)
+        # for i in range(len(out1)):
+        #     print(np.equal(out1[i].numpy(), out2[i].numpy()).all())
\ No newline at end of file
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 76f698e6..9148c737 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -59,6 +59,20 @@ __all__ = ["U2Model", "U2InferModel"]
 
 logger = Log(__name__).getlog()
 
+# input_spec1 = [paddle.static.InputSpec(shape=[None, None], dtype='int64'),
+#                paddle.static.InputSpec(shape=[None], dtype='int64'),
+#                paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]
+
+# input_spec2 = [
+#     paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'),
+#     paddle.static.InputSpec(shape=[1], dtype='int32'),
+#     -16,
+#     paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'),
+#     paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]
+
+# input_spec3 = [paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'),
+#                paddle.static.InputSpec(shape=[1], dtype='int64')]
+
 
 class U2BaseModel(ASRInterface, nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""
@@ -599,7 +613,12 @@ class U2BaseModel(ASRInterface, nn.Layer):
         """
         return self.eos
 
-    @jit.to_static
+    @jit.to_static(input_spec=[
+        paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'),
+        paddle.static.InputSpec(shape=[1], dtype='int32'),
+        -16,
+        paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'),
+        paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')])
     def forward_encoder_chunk(
             self,
             xs: paddle.Tensor,
@@ -655,7 +674,10 @@ class U2BaseModel(ASRInterface, nn.Layer):
         """
         return self.ctc.log_softmax(xs)
 
-    @jit.to_static
+    @jit.to_static(input_spec=[
+        paddle.static.InputSpec(shape=[None, None], dtype='int64'),
+        paddle.static.InputSpec(shape=[None], dtype='int64'),
+        paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')])
     def forward_attention_decoder(
             self,
             hyps: paddle.Tensor,
@@ -918,6 +940,9 @@ class U2InferModel(U2Model):
     def __init__(self, configs: dict):
         super().__init__(configs)
 
+    @jit.to_static(input_spec=[
+        paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'),
+        paddle.static.InputSpec(shape=[1], dtype='int64')])
     def forward(self,
                 feats,
                 feats_lengths,
@@ -933,9 +958,10 @@ class U2InferModel(U2Model):
         Returns:
             List[List[int]]: best path result
         """
-        return self.ctc_greedy_search(
-            feats,
-            feats_lengths,
-            decoding_chunk_size=decoding_chunk_size,
-            num_decoding_left_chunks=num_decoding_left_chunks,
-            simulate_streaming=simulate_streaming)
+        # return self.ctc_greedy_search(
+        #     feats,
+        #     feats_lengths,
+        #     decoding_chunk_size=decoding_chunk_size,
+        #     num_decoding_left_chunks=num_decoding_left_chunks,
+        #     simulate_streaming=simulate_streaming)
+        return feats, feats_lengths
diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index 4df38f09..cd50f157 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -80,6 +80,10 @@ class PaddleASRConnectionHanddler:
         self.init_decoder()
         self.reset()
 
+        from paddle.jit.layer import Layer
+        self.jit_layer = Layer()
+        self.jit_layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(1))
+
     def init_decoder(self):
         if "deepspeech2" in self.model_type:
             assert self.continuous_decoding is False, "ds2 model not support endpoint"
@@ -474,9 +478,16 @@ class PaddleASRConnectionHanddler:
             # cur chunk
             chunk_xs = self.cached_feat[:, cur:end, :]
             # forward chunk
-            (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk(
-                chunk_xs, self.offset, required_cache_size,
-                self.att_cache, self.cnn_cache)
+            # (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk(
+            #     chunk_xs, self.offset, required_cache_size,
+            #     self.att_cache, self.cnn_cache)
+
+            (y, self.att_cache, self.cnn_cache) = self.jit_layer.forward_encoder_chunk(
+                chunk_xs,
+                paddle.to_tensor([self.offset], dtype='int32'),
+                self.att_cache,
+                self.cnn_cache)
+
             outputs.append(y)
 
             # update the global offset, in decoding frame unit
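
Note on the dy2st flow this patch wires up: the @jit.to_static(input_spec=...) decorators in u2.py declare static shapes/dtypes for forward_encoder_chunk, forward_attention_decoder and forward; paddle.jit.save(infer_model, ..., combine_params=True) in model.py exports all decorated methods into one combined-params model; and asr_engine.py loads that model through paddle.jit.layer.Layer and drives it chunk by chunk. The sketch below shows the round trip in one place. It is a minimal sketch, not part of the patch: it assumes a Paddle develop build where combine_params and paddle.jit.layer.Layer are available (both experimental at the time of this patch), an already-constructed infer_model (the U2InferModel returned by load_inferspec()), and a hypothetical export path EXPORT_PREFIX.

    import paddle
    from paddle.jit.layer import Layer  # experimental dev API, as used in asr_engine.py

    # Hypothetical path; the patch hard-codes a /workspace/... path instead.
    EXPORT_PREFIX = '/tmp/conformer/conformer'

    # Export: each @jit.to_static-decorated method of infer_model is traced
    # against its input_spec and saved into a single combined-params file.
    paddle.jit.save(infer_model, EXPORT_PREFIX, combine_params=True)

    # Load in the serving process (asr_engine.py does this in __init__,
    # pinning the layer to a CUDA place).
    layer = Layer()
    layer.load(EXPORT_PREFIX, paddle.CUDAPlace(0))

    # First chunk of a stream: offset 0 and 0-sized caches, matching the
    # "zero Tensor online" check above. required_cache_size (-16) is a plain
    # Python constant in the input_spec, so it is fixed at trace time and the
    # exported method takes only the four tensor arguments.
    xs = paddle.rand(shape=[1, 67, 80], dtype='float32')
    offset = paddle.to_tensor([0], dtype='int32')
    att_cache = paddle.zeros([0, 0, 0, 0])
    cnn_cache = paddle.zeros([0, 0, 0, 0])
    ys, att_cache, cnn_cache = layer.forward_encoder_chunk(xs, offset, att_cache, cnn_cache)

    # Later chunks advance the offset and feed back the returned caches,
    # exactly as the serving loop in asr_engine.py does.
    xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32')
    offset = paddle.to_tensor([16], dtype='int32')
    ys2, att_cache, cnn_cache = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache)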