rope for streaming decoding

pull/3407/head
Hui Zhang 12 months ago
parent b56fb85ca0
commit 0a5cc5556e

@ -4,7 +4,10 @@
paddle version: 2.5.0 paddle version: 2.5.0
paddlespeech version: 1.5.0 paddlespeech version: 1.5.0
Need set `decoding.decoding_chunk_size=16` when decoding. Tesla V100-SXM2-32GB: 1 node, 4 card
Global BachSize: 32 * 4
Training Done: 1 day, 12:56:39.639646
### `decoding.decoding_chunk_size=16`
> chunk_size=16, ((16 - 1) * 4 + 7) * 10ms = (16 * 4 + 3) * 10ms = 670ms > chunk_size=16, ((16 - 1) * 4 + 7) * 10ms = (16 * 4 + 3) * 10ms = 670ms
@ -15,15 +18,14 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | |
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | |
## Conformer ### `decoding.decoding_chunk_size=-1`
paddle version: 2.2.2
paddlespeech version: 1.0.1 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| --- | --- | --- | --- | --- | --- | --- | --- | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | -1, -1 | - | 5.39 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | -1, -1 | - | 5.51 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | -1, -1 | - | 5.51 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | -1, -1 | - | 4.99 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
## Conformer Streaming ## Conformer Streaming
@ -39,6 +41,17 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.051968 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.051968 |
## Conformer
paddle version: 2.2.2
paddlespeech version: 1.0.1
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
## Transformer ## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |

@ -15,8 +15,8 @@
# Modified from wenet(https://github.com/wenet-e2e/wenet) # Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Multi-Head Attention layer definition.""" """Multi-Head Attention layer definition."""
import math import math
from typing import Tuple
from typing import List from typing import List
from typing import Tuple
import paddle import paddle
from paddle import nn from paddle import nn
@ -428,7 +428,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
# (B,H,T,D) # (B,H,T,D)
ndim = tensors[0].dim() ndim = tensors[0].dim()
_,H,T,D = tensors[0].shape _, H, T, D = tensors[0].shape
# sinusoidal shape same with tensors[0] # sinusoidal shape same with tensors[0]
# [B,T,D] -> [B,T,H,D/H] -> (B,H,T,D/H) # [B,T,D] -> [B,T,H,D/H] -> (B,H,T,D/H)
@ -476,6 +476,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
where `cache_t == chunk_size * num_decoding_left_chunks` where `cache_t == chunk_size * num_decoding_left_chunks`
and `head * d_k == size` and `head * d_k == size`
""" """
q, k, v = self.forward_qkv(query, key, value) q, k, v = self.forward_qkv(query, key, value)
# q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) # q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k)
@ -504,7 +505,12 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
new_cache = paddle.concat((k, v), axis=-1) new_cache = paddle.concat((k, v), axis=-1)
# f{q,k}(x_m, m) = R^d_{\theta, m} W_{q,k} x_m, m is position index # f{q,k}(x_m, m) = R^d_{\theta, m} W_{q,k} x_m, m is position index
q, k = self.apply_rotary_position_embeddings(pos_emb, q, k) # q_t always is chunk_size
q_t = q.shape[2]
q = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], q)
# k will increase when in streaming decoding.
k = self.apply_rotary_position_embeddings(pos_emb, k)
# dot(q, k) # dot(q, k)
scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k) scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask), new_cache return self.forward_attention(v, scores, mask), new_cache

@ -164,6 +164,7 @@ class RelPositionalEncoding(PositionalEncoding):
assert offset + x.shape[ assert offset + x.shape[
1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
offset, x.shape[1], self.max_len) offset, x.shape[1], self.max_len)
x = x * self.xscale x = x * self.xscale
pos_emb = self.pe[:, offset:offset + x.shape[1]] pos_emb = self.pe[:, offset:offset + x.shape[1]]
return self.dropout(x), self.dropout(pos_emb) return self.dropout(x), self.dropout(pos_emb)

@ -233,7 +233,7 @@ class BaseEncoder(nn.Layer):
xs = self.global_cmvn(xs) xs = self.global_cmvn(xs)
# before embed, xs=(B, T, D1), pos_emb=(B=1, T, D) # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D)
xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) xs, _, _ = self.embed(xs, tmp_masks, offset=offset)
# after embed, xs=(B=1, chunk_size, hidden-dim) # after embed, xs=(B=1, chunk_size, hidden-dim)
elayers, _, cache_t1, _ = att_cache.shape elayers, _, cache_t1, _ = att_cache.shape

Loading…
Cancel
Save