commit
919c8d0607
@ -0,0 +1,15 @@
|
|||||||
|
# Image that builds and installs PaddleSpeech (and its audio sub-package)
# from source on top of the PaddlePaddle 2.2.2 base image.
FROM registry.baidubce.com/paddlepaddle/paddle:2.2.2
LABEL maintainer="paddlesl@baidu.com"

RUN git clone --depth 1 https://github.com/PaddlePaddle/PaddleSpeech.git /home/PaddleSpeech
# Best effort: the trailing `exit 0` keeps the build going when mccabe is absent.
RUN pip3 uninstall mccabe -y ; exit 0;
RUN pip3 install multiprocess==0.70.12 importlib-metadata==4.2.0 dill==0.3.4

# A `cd` inside one RUN does not carry over to the next RUN instruction,
# so the working directory is switched with WORKDIR before each build step.
WORKDIR /home/PaddleSpeech/audio
RUN python setup.py bdist_wheel

WORKDIR /home/PaddleSpeech
RUN python setup.py bdist_wheel
# Relative wheel paths below are resolved against /home/PaddleSpeech.
RUN pip install audio/dist/*.whl dist/*.whl

WORKDIR /home/PaddleSpeech/
|
@ -1,36 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"type": "speed",
|
|
||||||
"params": {
|
|
||||||
"min_speed_rate": 0.9,
|
|
||||||
"max_speed_rate": 1.1,
|
|
||||||
"num_rates": 3
|
|
||||||
},
|
|
||||||
"prob": 0.0
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "shift",
|
|
||||||
"params": {
|
|
||||||
"min_shift_ms": -5,
|
|
||||||
"max_shift_ms": 5
|
|
||||||
},
|
|
||||||
"prob": 1.0
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "specaug",
|
|
||||||
"params": {
|
|
||||||
"W": 0,
|
|
||||||
"warp_mode": "PIL",
|
|
||||||
"F": 10,
|
|
||||||
"n_freq_masks": 2,
|
|
||||||
"T": 50,
|
|
||||||
"n_time_masks": 2,
|
|
||||||
"p": 1.0,
|
|
||||||
"adaptive_number_ratio": 0,
|
|
||||||
"adaptive_size_ratio": 0,
|
|
||||||
"max_n_time_masks": 20,
|
|
||||||
"replace_with_zero": true
|
|
||||||
},
|
|
||||||
"prob": 1.0
|
|
||||||
}
|
|
||||||
]
|
|
@ -0,0 +1,25 @@
|
|||||||
|
process:
|
||||||
|
# extract kaldi fbank from PCM
|
||||||
|
- type: fbank_kaldi
|
||||||
|
fs: 16000
|
||||||
|
n_mels: 161
|
||||||
|
n_shift: 160
|
||||||
|
win_length: 400
|
||||||
|
dither: 0.1
|
||||||
|
- type: cmvn_json
|
||||||
|
cmvn_path: data/mean_std.json
|
||||||
|
# these three processes are a.k.a. SpecAugment
|
||||||
|
- type: time_warp
|
||||||
|
max_time_warp: 5
|
||||||
|
inplace: true
|
||||||
|
mode: PIL
|
||||||
|
- type: freq_mask
|
||||||
|
F: 30
|
||||||
|
n_mask: 2
|
||||||
|
inplace: true
|
||||||
|
replace_with_zero: false
|
||||||
|
- type: time_mask
|
||||||
|
T: 40
|
||||||
|
n_mask: 2
|
||||||
|
inplace: true
|
||||||
|
replace_with_zero: false
|
@ -1,36 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"type": "speed",
|
|
||||||
"params": {
|
|
||||||
"min_speed_rate": 0.9,
|
|
||||||
"max_speed_rate": 1.1,
|
|
||||||
"num_rates": 3
|
|
||||||
},
|
|
||||||
"prob": 0.0
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "shift",
|
|
||||||
"params": {
|
|
||||||
"min_shift_ms": -5,
|
|
||||||
"max_shift_ms": 5
|
|
||||||
},
|
|
||||||
"prob": 1.0
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "specaug",
|
|
||||||
"params": {
|
|
||||||
"W": 0,
|
|
||||||
"warp_mode": "PIL",
|
|
||||||
"F": 10,
|
|
||||||
"n_freq_masks": 2,
|
|
||||||
"T": 50,
|
|
||||||
"n_time_masks": 2,
|
|
||||||
"p": 1.0,
|
|
||||||
"adaptive_number_ratio": 0,
|
|
||||||
"adaptive_size_ratio": 0,
|
|
||||||
"max_n_time_masks": 20,
|
|
||||||
"replace_with_zero": true
|
|
||||||
},
|
|
||||||
"prob": 1.0
|
|
||||||
}
|
|
||||||
]
|
|
@ -0,0 +1,25 @@
|
|||||||
|
process:
|
||||||
|
# extract kaldi fbank from PCM
|
||||||
|
- type: fbank_kaldi
|
||||||
|
fs: 16000
|
||||||
|
n_mels: 161
|
||||||
|
n_shift: 160
|
||||||
|
win_length: 400
|
||||||
|
dither: 0.1
|
||||||
|
- type: cmvn_json
|
||||||
|
cmvn_path: data/mean_std.json
|
||||||
|
# these three processes are a.k.a. SpecAugment
|
||||||
|
- type: time_warp
|
||||||
|
max_time_warp: 5
|
||||||
|
inplace: true
|
||||||
|
mode: PIL
|
||||||
|
- type: freq_mask
|
||||||
|
F: 30
|
||||||
|
n_mask: 2
|
||||||
|
inplace: true
|
||||||
|
replace_with_zero: false
|
||||||
|
- type: time_mask
|
||||||
|
T: 40
|
||||||
|
n_mask: 2
|
||||||
|
inplace: true
|
||||||
|
replace_with_zero: false
|
@ -1,36 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"type": "speed",
|
|
||||||
"params": {
|
|
||||||
"min_speed_rate": 0.9,
|
|
||||||
"max_speed_rate": 1.1,
|
|
||||||
"num_rates": 3
|
|
||||||
},
|
|
||||||
"prob": 0.0
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "shift",
|
|
||||||
"params": {
|
|
||||||
"min_shift_ms": -5,
|
|
||||||
"max_shift_ms": 5
|
|
||||||
},
|
|
||||||
"prob": 1.0
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "specaug",
|
|
||||||
"params": {
|
|
||||||
"W": 5,
|
|
||||||
"warp_mode": "PIL",
|
|
||||||
"F": 30,
|
|
||||||
"n_freq_masks": 2,
|
|
||||||
"T": 40,
|
|
||||||
"n_time_masks": 2,
|
|
||||||
"p": 1.0,
|
|
||||||
"adaptive_number_ratio": 0,
|
|
||||||
"adaptive_size_ratio": 0,
|
|
||||||
"max_n_time_masks": 20,
|
|
||||||
"replace_with_zero": true
|
|
||||||
},
|
|
||||||
"prob": 1.0
|
|
||||||
}
|
|
||||||
]
|
|
@ -0,0 +1,25 @@
|
|||||||
|
process:
|
||||||
|
# extract kaldi fbank from PCM
|
||||||
|
- type: fbank_kaldi
|
||||||
|
fs: 16000
|
||||||
|
n_mels: 161
|
||||||
|
n_shift: 160
|
||||||
|
win_length: 400
|
||||||
|
dither: 0.1
|
||||||
|
- type: cmvn_json
|
||||||
|
cmvn_path: data/mean_std.json
|
||||||
|
# these three processes are a.k.a. SpecAugment
|
||||||
|
- type: time_warp
|
||||||
|
max_time_warp: 5
|
||||||
|
inplace: true
|
||||||
|
mode: PIL
|
||||||
|
- type: freq_mask
|
||||||
|
F: 30
|
||||||
|
n_mask: 2
|
||||||
|
inplace: true
|
||||||
|
replace_with_zero: false
|
||||||
|
- type: time_mask
|
||||||
|
T: 40
|
||||||
|
n_mask: 2
|
||||||
|
inplace: true
|
||||||
|
replace_with_zero: false
|
@ -1,315 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import math
|
|
||||||
|
|
||||||
import paddle
|
|
||||||
from paddle import nn
|
|
||||||
from paddle.nn import functional as F
|
|
||||||
from paddle.nn import initializer as I
|
|
||||||
|
|
||||||
from paddlespeech.s2t.modules.activation import brelu
|
|
||||||
from paddlespeech.s2t.modules.mask import make_non_pad_mask
|
|
||||||
from paddlespeech.s2t.utils.log import Log
|
|
||||||
|
|
||||||
logger = Log(__name__).getlog()
|
|
||||||
|
|
||||||
__all__ = ['RNNStack']
|
|
||||||
|
|
||||||
|
|
||||||
class RNNCell(nn.RNNCellBase):
    r"""
    Elman RNN (SimpleRNN) cell that takes already-projected inputs.

    Given the inputs and the previous state, it computes the outputs and
    updates the state. The formula used is as follows:

    .. math::
        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}

    where :math:`act` is for :attr:`activation`. The cell creates no
    input-hidden weight and always sets ``bias_ih`` to ``None``, so
    :math:`x_{t}` is added to the recurrent term unchanged.

    Args:
        hidden_size (int): Size of the hidden state.
        activation (str): ``"tanh"``, ``"relu"`` or ``"brelu"``.
        weight_ih_attr: Accepted but not used by this cell.
        weight_hh_attr: Attribute passed to ``create_parameter`` for
            ``weight_hh``.
        bias_ih_attr: Accepted but not used by this cell.
        bias_hh_attr: Attribute passed to ``create_parameter`` for
            ``bias_hh``.
        name: Accepted but not used by this cell.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(self,
                 hidden_size: int,
                 activation="tanh",
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        supported = ("tanh", "relu", "brelu")
        if activation not in supported:
            # The message lists every accepted value, including "brelu".
            raise ValueError(
                "activation for RNNCell should be tanh, relu or brelu, "
                "but get {}".format(activation))
        self.activation = activation
        if activation == "tanh":
            self._activation_fn = paddle.tanh
        elif activation == "relu":
            self._activation_fn = F.relu
        else:
            self._activation_fn = brelu

    def forward(self, inputs, states=None):
        """Run one step of the cell.

        Args:
            inputs: Projected input for this step.
            states: Previous hidden state; when ``None`` it is obtained from
                ``get_initial_states``.

        Returns:
            tuple: ``(h, h)``, the new hidden state as both output and state.
        """
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_h = states
        i2h = inputs
        if self.bias_ih is not None:
            # Plain addition (not `+=`) so the caller's tensor is never the
            # target of an augmented assignment.
            i2h = i2h + self.bias_ih
        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h2h = h2h + self.bias_hh
        h = self._activation_fn(i2h + h2h)
        return h, h

    @property
    def state_shape(self):
        """Shape of the hidden state without the batch dimension."""
        return (self.hidden_size, )
|
|
||||||
|
|
||||||
|
|
||||||
class GRUCell(nn.RNNCellBase):
    r"""
    Gated Recurrent Unit (GRU) cell that takes already-projected inputs.

    ``inputs`` is split into three equal parts along axis 1 (reset, update
    and candidate contributions); the cell itself only owns the
    hidden-hidden weight and bias. With :math:`x_{r}, x_{z}, x_{c}` denoting
    those three parts, one step computes:

    .. math::
        r_{t} & = \sigma(x_{r} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(x_{z} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(x_{c} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
        y_{t} & = h_{t}

    where :math:`\sigma` is the sigmoid function, and * is the elementwise
    multiplication operator.

    Args:
        input_size (int): Stored as ``self.input_size``; not otherwise used
            by the cell.
        hidden_size (int): Size of the hidden state.
        weight_ih_attr: Accepted but not used by this cell.
        weight_hh_attr: Attribute passed to ``create_parameter`` for
            ``weight_hh``.
        bias_ih_attr: Accepted but not used by this cell.
        bias_hh_attr: Attribute passed to ``create_parameter`` for
            ``bias_hh``.
        name: Accepted but not used by this cell.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        bound = 1.0 / math.sqrt(hidden_size)
        gate_rows = 3 * hidden_size  # reset, update and candidate stacked

        self.weight_hh = self.create_parameter(
            (gate_rows, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-bound, bound))
        # No input-side bias is created for this cell.
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (gate_rows, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-bound, bound))

        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.tanh

    def forward(self, inputs, states=None):
        """Run one GRU step.

        Args:
            inputs: Projected input for this step; split into three parts
                along axis 1.
            states: Previous hidden state; when ``None`` it is obtained from
                ``get_initial_states``.

        Returns:
            tuple: ``(h, h)``, the new hidden state as both output and state.
        """
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        h_prev = states

        in_gates = inputs
        if self.bias_ih is not None:
            in_gates = in_gates + self.bias_ih

        rec_gates = paddle.matmul(h_prev, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            rec_gates = rec_gates + self.bias_hh

        x_r, x_z, x_c = paddle.split(in_gates, num_or_sections=3, axis=1)
        h_r, h_z, h_c = paddle.split(rec_gates, num_or_sections=3, axis=1)

        reset = self._gate_activation(x_r + h_r)
        update = self._gate_activation(x_z + h_z)
        # The reset gate is applied after the hidden-hidden matmul.
        candidate = self._activation(x_c + reset * h_c)
        # Algebraically equal to update * h_prev + (1 - update) * candidate.
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
        h_new = (h_prev - candidate) * update + candidate

        return h_new, h_new

    @property
    def state_shape(self):
        r"""
        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
        size would be automatically inserted into shape). The shape
        corresponds to the shape of :math:`h_{t-1}`.
        """
        return (self.hidden_size, )
|
|
||||||
|
|
||||||
|
|
||||||
class BiRNNWithBN(nn.Layer):
    """Bidirectional simple RNN layer with batch normalization.

    For each direction the input goes through a bias-free linear projection
    and a batch normalization before entering the recurrent cell, so batch
    normalization is only performed on the input-state projection.

    Args:
        i_size (int): Input feature dimension.
        h_size (int): Hidden size of each direction.
        share_weights (bool): Whether the forward and backward directions
            share the input projection and its batch normalization.
    """

    def __init__(self, i_size: int, h_size: int, share_weights: bool):
        super().__init__()
        self.share_weights = share_weights

        # Forward-direction input projection, followed by batch norm.
        self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
        self.fw_bn = nn.BatchNorm1D(
            h_size, bias_attr=None, data_format='NLC')
        if self.share_weights:
            # Input-hidden weights shared between the two directions.
            self.bw_fc = self.fw_fc
            self.bw_bn = self.fw_bn
        else:
            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.bw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')

        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        # NOTE(review): the reverse RNN wraps `fw_cell`, so `bw_cell` is
        # created but never used. Kept as-is because switching cells would
        # change the results of existing checkpoints -- confirm intent.
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        """Run both directions and concatenate them on the last axis.

        Args:
            x (paddle.Tensor): Input, shape [B, T, D].
            x_len (paddle.Tensor): Passed to both RNNs as `sequence_length`.

        Returns:
            tuple: The concatenated outputs and the unchanged `x_len`.
        """
        fw_in = self.fw_bn(self.fw_fc(x))
        bw_in = self.bw_bn(self.bw_fc(x))
        fw_out, _ = self.fw_rnn(inputs=fw_in, sequence_length=x_len)
        bw_out, _ = self.bw_rnn(inputs=bw_in, sequence_length=x_len)
        return paddle.concat([fw_out, bw_out], axis=-1), x_len
|
|
||||||
|
|
||||||
|
|
||||||
class BiGRUWithBN(nn.Layer):
    """Bidirectional GRU layer with batch normalization.

    For each direction the input goes through a bias-free linear projection
    to ``3 * h_size`` features and a batch normalization before entering the
    GRU cell, so batch normalization is only performed on the input-state
    projection.

    Args:
        i_size (int): Input feature dimension.
        h_size (int): Hidden size of each direction.
    """

    def __init__(self, i_size: int, h_size: int):
        super().__init__()
        # One slice per GRU gate (the cell splits its input in three).
        gate_size = h_size * 3

        self.fw_fc = nn.Linear(i_size, gate_size, bias_attr=False)
        self.fw_bn = nn.BatchNorm1D(
            gate_size, bias_attr=None, data_format='NLC')
        self.bw_fc = nn.Linear(i_size, gate_size, bias_attr=False)
        self.bw_bn = nn.BatchNorm1D(
            gate_size, bias_attr=None, data_format='NLC')

        self.fw_cell = GRUCell(input_size=gate_size, hidden_size=h_size)
        self.bw_cell = GRUCell(input_size=gate_size, hidden_size=h_size)
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        # NOTE(review): the reverse RNN wraps `fw_cell`, so `bw_cell` is
        # created but never used. Kept as-is because switching cells would
        # change the results of existing checkpoints -- confirm intent.
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x, x_len):
        """Run both directions and concatenate them on the last axis.

        Args:
            x: Input, shape [B, T, D].
            x_len: Passed to both RNNs as `sequence_length`.

        Returns:
            tuple: The concatenated outputs and the unchanged `x_len`.
        """
        fw_in = self.fw_bn(self.fw_fc(x))
        bw_in = self.bw_bn(self.bw_fc(x))
        fw_out, _ = self.fw_rnn(inputs=fw_in, sequence_length=x_len)
        bw_out, _ = self.bw_rnn(inputs=bw_in, sequence_length=x_len)
        return paddle.concat([fw_out, bw_out], axis=-1), x_len
|
|
||||||
|
|
||||||
|
|
||||||
class RNNStack(nn.Layer):
    """Stack of bidirectional simple RNN or GRU layers.

    Args:
        i_size (int): Input feature dimension of the first layer; every
            following layer receives ``2 * h_size`` features.
        h_size (int): Hidden size of each direction in every layer.
        num_stacks (int): Number of stacked layers.
        use_gru (bool): Use GRU layers if True, simple RNN layers otherwise.
        share_rnn_weights (bool): Whether to share input-hidden weights
            between the forward and backward directions. Only used when
            ``use_gru`` is False.
    """

    def __init__(self,
                 i_size: int,
                 h_size: int,
                 num_stacks: int,
                 use_gru: bool,
                 share_rnn_weights: bool):
        super().__init__()
        layers = []
        in_dim = i_size
        for _ in range(num_stacks):
            if use_gru:
                # default: GRU using tanh
                layer = BiGRUWithBN(i_size=in_dim, h_size=h_size)
            else:
                layer = BiRNNWithBN(
                    i_size=in_dim,
                    h_size=h_size,
                    share_weights=share_rnn_weights)
            layers.append(layer)
            # Both directions are concatenated, doubling the feature size.
            in_dim = h_size * 2

        self.rnn_stacks = nn.LayerList(layers)

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        """Apply every layer, zeroing padded positions after each one.

        Args:
            x (paddle.Tensor): shape [B, T, D]
            x_len (paddle.Tensor): shape [B]

        Returns:
            tuple: The masked outputs of the last layer and `x_len`.
        """
        for layer in self.rnn_stacks:
            x, x_len = layer(x, x_len)
            non_pad = make_non_pad_mask(x_len).unsqueeze(-1)  # [B, T, 1]
            # TODO(Hui Zhang): not support bool multiply
            x = x.multiply(non_pad.astype(x.dtype))

        return x, x_len
|
|
@ -1,31 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
from .deepspeech2 import DeepSpeech2InferModelOnline
|
|
||||||
from .deepspeech2 import DeepSpeech2ModelOnline
|
|
||||||
from paddlespeech.s2t.utils import dynamic_pip_install
|
|
||||||
import sys
|
|
||||||
|
|
||||||
# The CTC decoders extension is required by this model; when it is missing,
# try to install it on the fly (the installation is skipped on Windows).
try:
    import paddlespeech_ctcdecoders
except ImportError:
    try:
        package_name = 'paddlespeech_ctcdecoders'
        if sys.platform != "win32":
            dynamic_pip_install.install(package_name)
    except Exception as err:
        # Adjacent literals keep the message independent of source
        # indentation; `from err` preserves the installation failure.
        raise RuntimeError(
            "Can not install package paddlespeech_ctcdecoders on your system. "
            "The DeepSpeech2 model is not supported for your system") from err
|
|
||||||
|
|
||||||
__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
|
|
@ -1,33 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import paddle
|
|
||||||
|
|
||||||
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
|
|
||||||
|
|
||||||
|
|
||||||
class Conv2dSubsampling4Online(Conv2dSubsampling4):
    """Conv2dSubsampling4 variant used by the online DeepSpeech2 encoder.

    Adds ``output_dim`` and ``receptive_field_length`` attributes and a
    ``forward`` that takes and returns sequence lengths.

    Args:
        idim (int): Input feature dimension.
        odim (int): Forwarded to the parent; also the factor applied to the
            reduced feature dimension in ``output_dim``.
        dropout_rate (float): Forwarded to the parent constructor.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float):
        super().__init__(idim, odim, dropout_rate, None)
        # Feature bins left after two "(n - 1) // 2" reductions.
        reduced_feat = ((idim - 1) // 2 - 1) // 2
        self.output_dim = reduced_feat * odim
        # stride_1 * (kernel_size_2 - 1) + kernel_size_1
        stride_1, kernel_size_1, kernel_size_2 = 2, 3, 3
        self.receptive_field_length = stride_1 * (
            kernel_size_2 - 1) + kernel_size_1

    def forward(self, x: paddle.Tensor,
                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
        """Subsample `x` and shrink `x_len` with the same two reductions.

        Args:
            x (paddle.Tensor): Input; a channel axis is inserted at dim 1.
            x_len (paddle.Tensor): Lengths matching `x`.

        Returns:
            The flattened convolution output and the reduced lengths.
        """
        feats = self.conv(x.unsqueeze(1))  # (b, c=1, t, f) before the conv
        # b, c, t, f = paddle.shape(x)  # not work under jit
        feats = feats.transpose([0, 2, 1, 3])
        # Keep the first two dims, merge the remaining ones.
        feats = feats.reshape([0, 0, -1])
        out_len = ((x_len - 1) // 2 - 1) // 2
        return feats, out_len
|
|
@ -1,397 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
"""Deepspeech2 ASR Online Model"""
|
|
||||||
import paddle
|
|
||||||
import paddle.nn.functional as F
|
|
||||||
from paddle import nn
|
|
||||||
|
|
||||||
from paddlespeech.s2t.models.ds2_online.conv import Conv2dSubsampling4Online
|
|
||||||
from paddlespeech.s2t.modules.ctc import CTCDecoder
|
|
||||||
from paddlespeech.s2t.utils import layer_tools
|
|
||||||
from paddlespeech.s2t.utils.checkpoint import Checkpoint
|
|
||||||
from paddlespeech.s2t.utils.log import Log
|
|
||||||
logger = Log(__name__).getlog()
|
|
||||||
|
|
||||||
__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
|
|
||||||
|
|
||||||
|
|
||||||
class CRNNEncoder(nn.Layer):
    """Convolution + recurrent encoder of the online DeepSpeech2 model.

    The input is subsampled by ``Conv2dSubsampling4Online``, passed through
    ``num_rnn_layers`` single-layer GRU or LSTM blocks (each followed by a
    LayerNorm) and finally through ``num_fc_layers`` linear layers with ReLU.

    Args:
        feat_size (int): Input feature dimension.
        dict_size (int): Stored as ``self.dict_size``; not otherwise used by
            the encoder.
        num_conv_layers (int): Accepted but not used by the encoder.
        num_rnn_layers (int): Number of recurrent layers.
        rnn_size (int): Hidden size of every recurrent layer.
        rnn_direction (str): ``'forward'``, ``'bidirect'`` or
            ``'bidirectional'``.
        num_fc_layers (int): Number of linear layers after the RNNs.
        fc_layers_size_list (list of int): Output size of each linear layer.
        use_gru (bool): Use GRU layers when True, LSTM layers otherwise.

    Raises:
        Exception: If ``rnn_direction`` is not one of the accepted values.
    """

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=False):
        super().__init__()
        self.rnn_size = rnn_size
        self.feat_size = feat_size  # 161 for linear
        self.dict_size = dict_size
        self.num_rnn_layers = num_rnn_layers
        self.num_fc_layers = num_fc_layers
        self.rnn_direction = rnn_direction
        self.fc_layers_size_list = fc_layers_size_list
        self.use_gru = use_gru
        self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)

        # `output_dim` tracks the feature size after the last layer built.
        self.output_dim = self.conv.output_dim

        i_size = self.conv.output_dim
        self.rnn = nn.LayerList()
        self.layernorm_list = nn.LayerList()
        self.fc_layers_list = nn.LayerList()
        if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
            layernorm_size = 2 * rnn_size
        elif rnn_direction == 'forward':
            layernorm_size = rnn_size
        else:
            raise Exception("Wrong rnn direction")
        for i in range(0, num_rnn_layers):
            # The first layer consumes the conv output, later ones the
            # normalized output of the previous recurrent layer.
            if i == 0:
                rnn_input_size = i_size
            else:
                rnn_input_size = layernorm_size
            if use_gru is True:
                self.rnn.append(
                    nn.GRU(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            else:
                self.rnn.append(
                    nn.LSTM(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
            self.output_dim = layernorm_size

        fc_input_size = layernorm_size
        for i in range(self.num_fc_layers):
            self.fc_layers_list.append(
                nn.Linear(fc_input_size, fc_layers_size_list[i]))
            fc_input_size = fc_layers_size_list[i]
            self.output_dim = fc_layers_size_list[i]

    @property
    def output_size(self):
        """Feature dimension of the encoder output."""
        return self.output_dim

    def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
        """Compute Encoder outputs

        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
                Only read for LSTM layers, and only when `init_state_h_box`
                is given.
        Return:
            x (Tensor): encoder outputs, [B, T, D]
            x_lens (Tensor): encoder length, [B]
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
                For GRU layers this is `init_state_c_box` returned unchanged.
        """
        if init_state_h_box is not None:
            init_state_list = None

            # Split the stacked states into one entry per recurrent layer.
            if self.use_gru is True:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_list = init_state_h_list
            else:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_c_list = paddle.split(
                    init_state_c_box, self.num_rnn_layers, axis=0)
                init_state_list = [(init_state_h_list[i], init_state_c_list[i])
                                   for i in range(self.num_rnn_layers)]
        else:
            init_state_list = [None] * self.num_rnn_layers

        x, x_lens = self.conv(x, x_lens)
        final_chunk_state_list = []
        for i in range(0, self.num_rnn_layers):
            x, final_state = self.rnn[i](x, init_state_list[i],
                                         x_lens)  #[B, T, D]
            final_chunk_state_list.append(final_state)
            x = self.layernorm_list[i](x)

        for i in range(self.num_fc_layers):
            x = self.fc_layers_list[i](x)
            x = F.relu(x)

        if self.use_gru is True:
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_list, axis=0)
            # GRU has no cell state; hand the caller's value back untouched.
            final_chunk_state_c_box = init_state_c_box
        else:
            final_chunk_state_h_list = [
                final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_c_list = [
                final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_h_list, axis=0)
            final_chunk_state_c_box = paddle.concat(
                final_chunk_state_c_list, axis=0)

        return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box

    def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
        """Compute Encoder outputs

        The input is right-padded with zeros, cut into overlapping chunks and
        each chunk is encoded with the final states of the previous chunk as
        initial states.

        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            decoder_chunk_size: The chunk size of decoder
        Returns:
            eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
            eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        subsampling_rate = self.conv.subsampling_rate
        receptive_field_length = self.conv.receptive_field_length
        chunk_size = (decoder_chunk_size - 1
                      ) * subsampling_rate + receptive_field_length
        chunk_stride = subsampling_rate * decoder_chunk_size
        max_len = x.shape[1]
        assert (chunk_size <= max_len)

        eouts_chunk_list = []
        eouts_chunk_lens_list = []
        # Pad so that (length - chunk_size) is a multiple of chunk_stride.
        remainder = (max_len - chunk_size) % chunk_stride
        if remainder != 0:
            padding_len = chunk_stride - remainder
        else:
            padding_len = 0
        # Create the padding with the dtype of `x` so the concat below does
        # not mix dtypes when `x` is not of the default floating type.
        padding = paddle.zeros(
            (x.shape[0], padding_len, x.shape[2]), dtype=x.dtype)
        padded_x = paddle.concat([x, padding], axis=1)
        # Exact by construction of `padding_len`; integer division avoids a
        # float round trip.
        num_chunk = (max_len + padding_len - chunk_size) // chunk_stride + 1
        chunk_state_h_box = None
        chunk_state_c_box = None
        final_state_h_box = None
        final_state_c_box = None
        for i in range(0, num_chunk):
            start = i * chunk_stride
            end = start + chunk_size
            x_chunk = padded_x[:, start:end, :]

            # Valid frames remaining from this chunk's start, floored at 0
            # and capped at chunk_size.
            x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
                                      paddle.zeros_like(x_lens),
                                      x_lens - i * chunk_stride)
            x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
            x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
                                        x_len_left, x_chunk_len_tmp)

            eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
                x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)

            eouts_chunk_list.append(eouts_chunk)
            eouts_chunk_lens_list.append(eouts_chunk_lens)
        final_state_h_box = chunk_state_h_box
        final_state_c_box = chunk_state_c_box
        return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box
|
|
||||||
|
|
||||||
|
|
||||||
class DeepSpeech2ModelOnline(nn.Layer):
    """The DeepSpeech2 network structure for online.

    The model is composed of a ``CRNNEncoder`` followed by a ``CTCDecoder``.

    :param feat_size: feature size for audio.
    :type feat_size: int
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (dimension of RNN cells).
    :type rnn_size: int
    :param rnn_direction: RNN direction, forwarded to the encoder.
    :type rnn_direction: str
    :param num_fc_layers: Number of stacking FC layers.
    :type num_fc_layers: int
    :param fc_layers_size_list: The list of FC layer sizes. Defaults to
        ``[512, 256]`` when not given.
    :type fc_layers_size_list: [int,]
    :param use_gru: Whether the encoder uses GRU cells (forwarded to the
        encoder).
    :type use_gru: bool
    :param blank_id: Blank id forwarded to the CTC decoder.
    :type blank_id: int
    :param ctc_grad_norm_type: Forwarded to the CTC decoder as
        ``grad_norm_type``.
    """

    def __init__(
            self,
            feat_size,
            dict_size,
            num_conv_layers=2,
            num_rnn_layers=4,
            rnn_size=1024,
            rnn_direction='forward',
            num_fc_layers=2,
            fc_layers_size_list=None,
            use_gru=False,
            blank_id=0,
            ctc_grad_norm_type=None, ):
        super().__init__()
        # Build the default per call so that instances never share (and
        # cannot accidentally mutate) one module-level list object.
        if fc_layers_size_list is None:
            fc_layers_size_list = [512, 256]

        self.encoder = CRNNEncoder(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_direction=rnn_direction,
            num_fc_layers=num_fc_layers,
            fc_layers_size_list=fc_layers_size_list,
            rnn_size=rnn_size,
            use_gru=use_gru)

        self.decoder = CTCDecoder(
            odim=dict_size,  # <blank> is in vocab
            enc_n_units=self.encoder.output_size,
            blank_id=blank_id,
            dropout_rate=0.0,
            reduction=True,  # sum
            batch_average=True,  # sum / batch_size
            grad_norm_type=ctc_grad_norm_type)

    def forward(self, audio, audio_len, text, text_len):
        """Compute Model loss

        Args:
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]
            text (Tensor): [B, U]
            text_len (Tensor): [B]

        Returns:
            loss (Tensor): [1]
        """
        # No initial RNN state is supplied; the final states returned by the
        # encoder are not needed to compute the loss.
        eouts, eouts_len, _, _ = self.encoder(audio, audio_len, None, None)
        loss = self.decoder(eouts, eouts_len, text, text_len)
        return loss

    @paddle.no_grad()
    def decode(self, audio, audio_len):
        """Decode a batch of utterances and return the best transcriptions.

        Note (from the original comments): decoders only accept string
        encoded in utf-8, and the decoder must have been initialized before
        this method is called.

        Args:
            audio (Tensor): input features passed to the encoder.
            audio_len (Tensor): valid lengths passed to the encoder.

        Returns:
            The first element returned by ``self.decoder.decode()``.
        """
        eouts, eouts_len, _, _ = self.encoder(audio, audio_len, None, None)
        probs = self.decoder.softmax(eouts)
        batch_size = probs.shape[0]
        self.decoder.reset_decoder(batch_size=batch_size)
        self.decoder.next(probs, eouts_len)
        trans_best, trans_beam = self.decoder.decode()
        return trans_best

    @classmethod
    def from_pretrained(cls, dataloader, config, checkpoint_path):
        """Build a DeepSpeech2ModelOnline model from a pretrained model.

        Parameters
        ----------
        dataloader: paddle.io.DataLoader
            its ``collate_fn`` provides ``feature_size`` and ``vocab_size``

        config: yacs.config.CfgNode
            model configs

        checkpoint_path: Path or str
            the path of pretrained model checkpoint, without extension name

        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from pretrained result.
        """
        model = cls(
            feat_size=dataloader.collate_fn.feature_size,
            dict_size=dataloader.collate_fn.vocab_size,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
        layer_tools.summary(model)
        return model

    @classmethod
    def from_config(cls, config):
        """Build a DeepSpeech2ModelOnline from config

        Parameters
        ----------
        config: yacs.config.CfgNode
            config

        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from config.
        """
        model = cls(
            feat_size=config.input_dim,
            dict_size=config.output_dim,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        return model
|
|
||||||
|
|
||||||
|
|
||||||
class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
    """Inference wrapper of ``DeepSpeech2ModelOnline``.

    ``forward`` runs the encoder on one chunk with explicit RNN states and
    returns softmax probabilities instead of a loss; ``export`` converts the
    layer to a static graph.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                chunk_state_c_box):
        """Run the encoder and softmax on a single chunk.

        Returns:
            tuple: (probs_chunk, eouts_chunk_lens, final_state_h_box,
            final_state_c_box)
        """
        encoder_result = self.encoder(audio_chunk, audio_chunk_lens,
                                      chunk_state_h_box, chunk_state_c_box)
        (eouts_chunk, eouts_chunk_lens, final_state_h_box,
         final_state_c_box) = encoder_result
        probs_chunk = self.decoder.softmax(eouts_chunk)
        return (probs_chunk, eouts_chunk_lens, final_state_h_box,
                final_state_c_box)

    def export(self):
        """Return this layer converted with ``paddle.jit.to_static``."""
        # [B, chunk_size, feat_dim]
        audio_spec = paddle.static.InputSpec(
            shape=[None, None, self.encoder.feat_size], dtype='float32')
        # audio_length, [B]
        length_spec = paddle.static.InputSpec(shape=[None], dtype='int64')
        # The two incoming state boxes, one spec per forward() state argument.
        state_h_spec = paddle.static.InputSpec(
            shape=[None, None, None], dtype='float32')
        state_c_spec = paddle.static.InputSpec(
            shape=[None, None, None], dtype='float32')

        static_model = paddle.jit.to_static(
            self,
            input_spec=[audio_spec, length_spec, state_h_spec, state_c_spec])
        return static_model
|
|
@ -0,0 +1,54 @@
|
|||||||
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "frontend_itf.h"
|
||||||
|
#include "kaldi/feat/feature-window.h"
|
||||||
|
|
||||||
|
namespace ppspeech {
|
||||||
|
|
||||||
|
template <class F>
|
||||||
|
class StreamingFeatureTpl : public FrontendInterface {
|
||||||
|
public:
|
||||||
|
typedef typename F::Options Options;
|
||||||
|
StreamingFeatureTpl(const Options& opts,
|
||||||
|
std::unique_ptr<FrontendInterface> base_extractor);
|
||||||
|
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
|
||||||
|
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
|
||||||
|
|
||||||
|
// the dim_ is the dim of single frame feature
|
||||||
|
virtual size_t Dim() const { return computer_.Dim(); }
|
||||||
|
|
||||||
|
virtual void SetFinished() { base_extractor_->SetFinished(); }
|
||||||
|
|
||||||
|
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
|
||||||
|
|
||||||
|
virtual void Reset() {
|
||||||
|
base_extractor_->Reset();
|
||||||
|
remained_wav_.Resize(0);
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
|
||||||
|
kaldi::Vector<kaldi::BaseFloat>* feats);
|
||||||
|
Options opts_;
|
||||||
|
std::unique_ptr<FrontendInterface> base_extractor_;
|
||||||
|
kaldi::FeatureWindowFunction window_function_;
|
||||||
|
kaldi::Vector<kaldi::BaseFloat> remained_wav_;
|
||||||
|
F computer_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace ppspeech
|
||||||
|
|
||||||
|
#include "frontend/audio/feature_common_inl.h"
|
@ -0,0 +1,95 @@
|
|||||||
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
namespace ppspeech {
|
||||||
|
|
||||||
|
template <class F>
|
||||||
|
StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts,
|
||||||
|
std::unique_ptr<FrontendInterface> base_extractor):
|
||||||
|
opts_(opts),
|
||||||
|
computer_(opts),
|
||||||
|
window_function_(opts.frame_opts) {
|
||||||
|
base_extractor_ = std::move(base_extractor);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class F>
|
||||||
|
void StreamingFeatureTpl<F>::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
|
||||||
|
base_extractor_->Accept(waves);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class F>
|
||||||
|
bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
|
||||||
|
kaldi::Vector<kaldi::BaseFloat> wav(base_extractor_->Dim());
|
||||||
|
bool flag = base_extractor_->Read(&wav);
|
||||||
|
if (flag == false || wav.Dim() == 0) return false;
|
||||||
|
|
||||||
|
// append remaned waves
|
||||||
|
int32 wav_len = wav.Dim();
|
||||||
|
int32 left_len = remained_wav_.Dim();
|
||||||
|
kaldi::Vector<kaldi::BaseFloat> waves(left_len + wav_len);
|
||||||
|
waves.Range(0, left_len).CopyFromVec(remained_wav_);
|
||||||
|
waves.Range(left_len, wav_len).CopyFromVec(wav);
|
||||||
|
|
||||||
|
// compute speech feature
|
||||||
|
Compute(waves, feats);
|
||||||
|
|
||||||
|
// cache remaned waves
|
||||||
|
kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
|
||||||
|
int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
|
||||||
|
int32 frame_shift = frame_opts.WindowShift();
|
||||||
|
int32 left_samples = waves.Dim() - frame_shift * num_frames;
|
||||||
|
remained_wav_.Resize(left_samples);
|
||||||
|
remained_wav_.CopyFromVec(
|
||||||
|
waves.Range(frame_shift * num_frames, left_samples));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute feat
|
||||||
|
template <class F>
|
||||||
|
bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
|
||||||
|
kaldi::Vector<kaldi::BaseFloat>* feats) {
|
||||||
|
const kaldi::FrameExtractionOptions& frame_opts =
|
||||||
|
computer_.GetFrameOptions();
|
||||||
|
int32 num_samples = waves.Dim();
|
||||||
|
int32 frame_length = frame_opts.WindowSize();
|
||||||
|
int32 sample_rate = frame_opts.samp_freq;
|
||||||
|
if (num_samples < frame_length) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
|
||||||
|
feats->Resize(num_frames * Dim());
|
||||||
|
|
||||||
|
kaldi::Vector<kaldi::BaseFloat> window;
|
||||||
|
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
|
||||||
|
for (int32 frame = 0; frame < num_frames; frame++) {
|
||||||
|
kaldi::BaseFloat raw_log_energy = 0.0;
|
||||||
|
kaldi::ExtractWindow(0,
|
||||||
|
waves,
|
||||||
|
frame,
|
||||||
|
frame_opts,
|
||||||
|
window_function_,
|
||||||
|
&window,
|
||||||
|
need_raw_log_energy ? &raw_log_energy : NULL);
|
||||||
|
|
||||||
|
kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
|
||||||
|
computer_.Compute(&window, &this_feature);
|
||||||
|
kaldi::SubVector<kaldi::BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
|
||||||
|
output_row.CopyFromVec(this_feature);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace ppspeech
|
@ -1,5 +1,4 @@
|
|||||||
|
|
||||||
add_library(utils
|
add_library(utils
|
||||||
file_utils.cc
|
file_utils.cc
|
||||||
simdjson.cpp
|
)
|
||||||
)
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue