fix some bugs and complete recog.py

pull/930/head
huangyuxin 3 years ago
parent e4a9328c40
commit 418d85ef73

@@ -28,8 +28,8 @@ from .utils import add_results_to_json
from deepspeech.exps import dynamic_import_tester
from deepspeech.io.reader import LoadInputsAndTargets
from deepspeech.models.asr_interface import ASRInterface
from deepspeech.utils.log import Log
from deepspeech.models.lm.transformer import TransformerLM
from deepspeech.utils.log import Log
# from espnet.asr.asr_utils import get_model_conf
# from espnet.asr.asr_utils import torch_load
# from espnet.nets.lm_interface import dynamic_import_lm
@@ -80,8 +80,7 @@ def recog_v2(args):
sort_in_input_length=False,
preprocess_conf=confs.collator.augmentation_config
if args.preprocess_conf is None else args.preprocess_conf,
preprocess_args={"train": False},
)
preprocess_args={"train": False}, )
if args.rnnlm:
lm_path = args.rnnlm
@@ -120,8 +119,7 @@ def recog_v2(args):
ctc=args.ctc_weight,
lm=args.lm_weight,
ngram=args.ngram_weight,
length_bonus=args.penalty,
)
length_bonus=args.penalty, )
beam_search = BeamSearch(
beam_size=args.beam_size,
vocab_size=len(char_list),
@@ -130,8 +128,7 @@ def recog_v2(args):
sos=model.sos,
eos=model.eos,
token_list=char_list,
pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
)
pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", )
# TODO(karita): make all scorers batchfied
if args.batchsize == 1:
@@ -178,9 +175,10 @@ def recog_v2(args):
logger.info(f'feat: {feat.shape}')
enc = model.encode(paddle.to_tensor(feat).to(dtype))
logger.info(f'eout: {enc.shape}')
nbest_hyps = beam_search(x=enc,
maxlenratio=args.maxlenratio,
minlenratio=args.minlenratio)
nbest_hyps = beam_search(
x=enc,
maxlenratio=args.maxlenratio,
minlenratio=args.minlenratio)
nbest_hyps = [
h.asdict()
for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)]
@@ -190,9 +188,8 @@ def recog_v2(args):
item = new_js[name]['output'][0] # 1-best
ref = item['text']
rec_text = item['rec_text'].replace('▁',
' ').replace('<eos>',
'').strip()
rec_text = item['rec_text'].replace('▁', ' ').replace(
'<eos>', '').strip()
rec_tokenid = list(map(int, item['rec_tokenid'].split()))
f.write({
"utt": name,

@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import Any
from typing import List
from typing import Tuple
@@ -20,12 +21,12 @@ import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from deepspeech.modules.mask import subsequent_mask
from deepspeech.modules.encoder import TransformerEncoder
from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
from deepspeech.models.lm_interface import LMInterface
from deepspeech.modules.encoder import TransformerEncoder
from deepspeech.modules.mask import subsequent_mask
import logging
class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
def __init__(
self,
@@ -37,9 +38,9 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
unit: int=1024,
layer: int=4,
dropout_rate: float=0.5,
emb_dropout_rate: float = 0.0,
att_dropout_rate: float = 0.0,
tie_weights: bool = False,):
emb_dropout_rate: float=0.0,
att_dropout_rate: float=0.0,
tie_weights: bool=False, ):
nn.Layer.__init__(self)
if pos_enc == "sinusoidal":
@@ -84,16 +85,13 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
), "Tie Weights: True need embedding and final dimensions to match"
self.decoder.weight = self.embed.weight
def _target_mask(self, ys_in_pad):
ys_mask = ys_in_pad != 0
m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
return ys_mask.unsqueeze(-2) & m
def forward(
self, x: paddle.Tensor, t: paddle.Tensor
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
def forward(self, x: paddle.Tensor, t: paddle.Tensor
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Compute LM loss value from buffer sequences.
Args:
@@ -119,7 +117,8 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
emb = self.embed(x)
h, _ = self.encoder(emb, xlen)
y = self.decoder(h)
loss = F.cross_entropy(y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
loss = F.cross_entropy(
y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
mask = xm.to(dtype=loss.dtype)
logp = loss * mask.view(-1)
logp = logp.sum()
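
The loss reformatted in this hunk is per-token cross entropy masked so that padding positions contribute nothing; the masked values are summed to get the sequence negative log-likelihood, and normalizing by the token count (as in the sketch below) gives a mean loss. A hedged sketch of that calculation (masked_lm_loss is an illustrative name, not part of the patch):

import paddle
import paddle.nn.functional as F

def masked_lm_loss(logits, targets, target_mask):
    """Cross entropy over (B, T, V) logits, counting only non-padding tokens."""
    loss = F.cross_entropy(
        logits.reshape([-1, logits.shape[-1]]),
        targets.reshape([-1]),
        reduction="none")
    mask = target_mask.astype(loss.dtype).reshape([-1])
    nll = (loss * mask).sum()   # summed negative log-likelihood over real tokens
    count = mask.sum()          # number of non-padding tokens
    return nll / count, nll, count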
@@ -150,16 +149,16 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
emb = self.embed(y)
h, _, cache = self.encoder.forward_one_step(
emb, self._target_mask(y), cache=state
)
emb, self._target_mask(y), cache=state)
h = self.decoder(h[:, -1])
logp = F.log_softmax(h).squeeze(0)
return logp, cache
# batch beam search API (see BatchScorerInterface)
def batch_score(
self, ys: paddle.Tensor, states: List[Any], xs: paddle.Tensor
) -> Tuple[paddle.Tensor, List[Any]]:
def batch_score(self,
ys: paddle.Tensor,
states: List[Any],
xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
"""Score new token batch (required).
Args:
@@ -193,13 +192,13 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
# batch decoding
h, _, states = self.encoder.forward_one_step(
emb, self._target_mask(ys), cache=batch_state
)
emb, self._target_mask(ys), cache=batch_state)
h = self.decoder(h[:, -1])
logp = F.log_softmax(h)
# transpose state of [layer, batch] into [batch, layer]
state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
state_list = [[states[i][b] for i in range(n_layers)]
for b in range(n_batch)]
return logp, state_list
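
The comprehension reformatted above is half of the usual batch-scorer cache bookkeeping: per-hypothesis caches stored as [batch][layer] are stacked into [layer][batch] before the batched forward step and regrouped afterwards. A rough sketch under that assumption (merge_states and split_states are illustrative names, not code from this patch):

import paddle

def merge_states(states, n_layers):
    """Stack per-hypothesis caches [batch][layer] into per-layer batches."""
    if states[0] is None:
        return None  # first decoding step: no cache yet
    return [
        paddle.stack([states[b][i] for b in range(len(states))])
        for i in range(n_layers)
    ]

def split_states(batch_state, n_layers, n_batch):
    """Regroup per-layer batched caches back into per-hypothesis lists."""
    return [[batch_state[i][b] for i in range(n_layers)]
            for b in range(n_batch)]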
@@ -214,17 +213,17 @@ if __name__ == "__main__":
layer=16,
dropout_rate=0.5, )
# n_vocab: int,
# pos_enc: str=None,
# embed_unit: int=128,
# att_unit: int=256,
# head: int=2,
# unit: int=1024,
# layer: int=4,
# dropout_rate: float=0.5,
# emb_dropout_rate: float = 0.0,
# att_dropout_rate: float = 0.0,
# tie_weights: bool = False,):
# n_vocab: int,
# pos_enc: str=None,
# embed_unit: int=128,
# att_unit: int=256,
# head: int=2,
# unit: int=1024,
# layer: int=4,
# dropout_rate: float=0.5,
# emb_dropout_rate: float = 0.0,
# att_dropout_rate: float = 0.0,
# tie_weights: bool = False,):
paddle.set_device("cpu")
model_dict = paddle.load("transformerLM.pdparams")
tlm.set_state_dict(model_dict)
@@ -236,11 +235,11 @@ if __name__ == "__main__":
state = None
output, state = tlm.score(input2, state, None)
input3 = np.array([5,10])
input3 = np.array([5, 10])
input3 = paddle.to_tensor(input3)
output, state = tlm.score(input3, state, None)
input4 = np.array([5,10,0])
input4 = np.array([5, 10, 0])
input4 = paddle.to_tensor(input4)
output, state = tlm.score(input4, state, None)
print("output", output)

@@ -24,9 +24,9 @@ from deepspeech.modules.activation import get_activation
from deepspeech.modules.attention import MultiHeadedAttention
from deepspeech.modules.attention import RelPositionMultiHeadedAttention
from deepspeech.modules.conformer_convolution import ConvolutionModule
from deepspeech.modules.embedding import NoPositionalEncoding
from deepspeech.modules.embedding import PositionalEncoding
from deepspeech.modules.embedding import RelPositionalEncoding
from deepspeech.modules.embedding import NoPositionalEncoding
from deepspeech.modules.encoder_layer import ConformerEncoderLayer
from deepspeech.modules.encoder_layer import TransformerEncoderLayer
from deepspeech.modules.mask import add_optional_chunk_mask
@@ -378,8 +378,7 @@ class TransformerEncoder(BaseEncoder):
self,
xs: paddle.Tensor,
masks: paddle.Tensor,
cache=None,
) -> Tuple[paddle.Tensor, paddle.Tensor]:
cache=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Encode input frame.
Args:
@@ -397,9 +396,11 @@ class TransformerEncoder(BaseEncoder):
if isinstance(self.embed, Conv2dSubsampling):
#TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
xs, pos_emb, masks = self.embed(
xs, masks.astype(xs.dtype), offset=0)
else:
xs , pos_emb, masks= self.embed(xs, masks.astype(xs.dtype), offset=0)
xs, pos_emb, masks = self.embed(
xs, masks.astype(xs.dtype), offset=0)
#TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
masks = masks.astype(paddle.bool)
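
For context, forward_one_step is the path that TransformerLM.score and batch_score drive on every decoding step: embed the prefix, run one cached encoder step under a causal target mask, and project the last position onto the vocabulary. A sketch assuming exactly the interfaces shown in the hunks above (lm_step is an illustrative name, not code from this patch):

import paddle
import paddle.nn.functional as F

def lm_step(lm, prefix_ids, cache=None):
    """One incremental TransformerLM step for a (1, T) prefix of token ids."""
    emb = lm.embed(prefix_ids)
    # Padding mask combined with a subsequent (causal) mask, as in _target_mask.
    h, _, cache = lm.encoder.forward_one_step(
        emb, lm._target_mask(prefix_ids), cache=cache)
    # Log-probabilities for the next token, taken from the last time step only.
    logp = F.log_softmax(lm.decoder(h[:, -1]), axis=-1)
    return logp, cache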
