From fd7431372881d495e74c037af57b430abb655182 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 14 Jul 2023 07:37:55 +0000 Subject: [PATCH] rope for streaming decoding --- examples/aishell/asr1/RESULTS.md | 33 +++++++++++++++++++-------- paddlespeech/s2t/modules/attention.py | 12 +++++++--- paddlespeech/s2t/modules/embedding.py | 1 + paddlespeech/s2t/modules/encoder.py | 2 +- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md index e7a425bf5..87eed7b9d 100644 --- a/examples/aishell/asr1/RESULTS.md +++ b/examples/aishell/asr1/RESULTS.md @@ -4,7 +4,10 @@ paddle version: 2.5.0 paddlespeech version: 1.5.0 -Need set `decoding.decoding_chunk_size=16` when decoding. +Tesla V100-SXM2-32GB: 1 node, 4 cards +Global BatchSize: 32 * 4 +Training Done: 1 day, 12:56:39.639646 +### `decoding.decoding_chunk_size=16` > chunk_size=16, ((16 - 1) * 4 + 7) * 10ms = (16 * 4 + 3) * 10ms = 670ms @@ -15,15 +18,14 @@ Need set `decoding.decoding_chunk_size=16` when decoding. 
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | | -## Conformer -paddle version: 2.2.2 -paddlespeech version: 1.0.1 -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 | +### `decoding.decoding_chunk_size=-1` + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | -1, -1 | - | 5.39 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | -1, -1 | - | 5.51 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | -1, -1 | - | 5.51 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | -1, -1 | - | 4.99 | ## Conformer Streaming @@ -39,6 +41,17 @@ Need set `decoding.decoding_chunk_size=16` when decoding. 
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.051968 | +## Conformer +paddle version: 2.2.2 +paddlespeech version: 1.0.1 +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 | + + ## Transformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 2ab931d64..548564a25 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -15,8 +15,8 @@ # Modified from wenet(https://github.com/wenet-e2e/wenet) """Multi-Head Attention layer definition.""" import math -from typing import Tuple from typing import List +from typing import Tuple import paddle from paddle import nn @@ -428,7 +428,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention): # (B,H,T,D) ndim = tensors[0].dim() - _,H,T,D = tensors[0].shape + _, H, T, D = tensors[0].shape # sinusoidal shape same with tensors[0] # [B,T,D] -> [B,T,H,D/H] -> (B,H,T,D/H) @@ -476,6 +476,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention): where `cache_t == chunk_size * num_decoding_left_chunks` and `head * d_k == size` """ + q, k, v = self.forward_qkv(query, key, value) # q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) @@ -504,7 +505,12 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention): new_cache = paddle.concat((k, v), axis=-1) # f{q,k}(x_m, m) = R^d_{\theta, m} W_{q,k} x_m, m is position index 
- q, k = self.apply_rotary_position_embeddings(pos_emb, q, k) + # q_t always is chunk_size + q_t = q.shape[2] + q = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], q) + # k will increase when in streaming decoding. + k = self.apply_rotary_position_embeddings(pos_emb, k) + # dot(q, k) scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k) return self.forward_attention(v, scores, mask), new_cache diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 0274fa1a3..1e9f01018 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -164,6 +164,7 @@ class RelPositionalEncoding(PositionalEncoding): assert offset + x.shape[ 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) + x = x * self.xscale pos_emb = self.pe[:, offset:offset + x.shape[1]] return self.dropout(x), self.dropout(pos_emb) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 91247d977..27d7ffbd7 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -233,7 +233,7 @@ class BaseEncoder(nn.Layer): xs = self.global_cmvn(xs) # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) + xs, _, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) elayers, _, cache_t1, _ = att_cache.shape