test can run

pull/578/head
Hui Zhang 5 years ago
parent abd2f5891b
commit e5347c48ef

@@ -617,10 +617,52 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 58,
    "id": "pregnant-modem",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "def get_default_args(fn):\n",
+    "    if fn is None:\n",
+    "        return {}\n",
+    "\n",
+    "    signature = inspect.signature(fn)\n",
+    "    return {\n",
+    "        k: v.default\n",
+    "        for k, v in signature.parameters.items()\n",
+    "        if v.default is not inspect.Parameter.empty\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "light-drill",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'inspect' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-59-8f782d06097f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_default_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m<ipython-input-58-cb5a8794f39f>\u001b[0m in \u001b[0;36mget_default_args\u001b[0;34m(fn)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0msignature\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minspect\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msignature\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m return {\n\u001b[1;32m 7\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefault\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'inspect' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "get_default_args(io.open)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "protective-belgium",
+   "metadata": {},
+   "outputs": [],
    "source": []
   }
  ],
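
Note on the failing cell above: the NameError is just the missing `import inspect`. A minimal self-contained version of the same helper, using only the standard library, runs like this:

# Standalone sketch of the notebook cell with the missing import added.
import inspect
import io


def get_default_args(fn):
    """Return {param_name: default_value} for parameters that have defaults."""
    if fn is None:
        return {}

    signature = inspect.signature(fn)
    return {
        k: v.default
        for k, v in signature.parameters.items()
        if v.default is not inspect.Parameter.empty
    }


print(get_default_args(io.open))
# e.g. {'mode': 'r', 'buffering': -1, 'encoding': None, ...}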

@@ -179,6 +179,11 @@ if not hasattr(paddle.Tensor, 'eq'):
     )
     paddle.Tensor.eq = eq
 
+if not hasattr(paddle, 'eq'):
+    logger.warn(
+        "override eq of paddle if exists or register, remove this when fixed!")
+    paddle.eq = eq
+
 
 def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
     return xs
@@ -256,13 +261,14 @@ if not hasattr(paddle.Tensor, 'masked_fill'):
 def masked_fill_(xs: paddle.Tensor,
                  mask: paddle.Tensor,
-                 value: Union[float, int]):
+                 value: Union[float, int]) -> paddle.Tensor:
     assert is_broadcastable(xs.shape, mask.shape) is True
     bshape = paddle.broadcast_shape(xs.shape, mask.shape)
     mask = mask.broadcast_to(bshape)
     trues = paddle.ones_like(xs) * value
     ret = paddle.where(mask, trues, xs)
     paddle.assign(ret.detach(), output=xs)
+    return xs
 
 
 if not hasattr(paddle.Tensor, 'masked_fill_'):
@@ -271,9 +277,10 @@ if not hasattr(paddle.Tensor, 'masked_fill_'):
     paddle.Tensor.masked_fill_ = masked_fill_
 
 
-def fill_(xs: paddle.Tensor, value: Union[float, int]):
+def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
     val = paddle.full_like(xs, value)
     paddle.assign(val.detach(), output=xs)
+    return xs
 
 
 if not hasattr(paddle.Tensor, 'fill_'):
@@ -317,7 +324,7 @@ if not hasattr(paddle.Tensor, 'type_as'):
 def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
     assert len(args) == 1
-    if isinstace(args[0], str): # dtype
+    if isinstance(args[0], str): # dtype
         return x.astype(args[0])
     elif isinstance(args[0], paddle.Tensor): #Tensor
         return x.astype(args[0].dtype)
@@ -338,6 +345,16 @@ if not hasattr(paddle.Tensor, 'float'):
     logger.warn("register user float to paddle.Tensor, remove this when fixed!")
     setattr(paddle.Tensor, 'float', func_float)
 
+
+def tolist(x: paddle.Tensor) -> List[Any]:
+    return x.numpy().tolist()
+
+
+if not hasattr(paddle.Tensor, 'tolist'):
+    logger.warn(
+        "register user tolist to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'tolist', tolist)
+
 
 ########### hcak paddle.nn.functional #############
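
All of the patches above follow the same register-if-missing pattern. A minimal standalone sketch of that pattern, using the `tolist` patch from this commit (not the repo module itself, assumes any recent paddle build):

# Register a method on paddle.Tensor only when the installed paddle lacks it.
from typing import Any, List

import paddle


def tolist(x: paddle.Tensor) -> List[Any]:
    # Round-trip through numpy, same as the patch added in this commit.
    return x.numpy().tolist()


if not hasattr(paddle.Tensor, 'tolist'):
    # Harmless if paddle later ships its own tolist: the hasattr check skips it.
    setattr(paddle.Tensor, 'tolist', tolist)

print(paddle.to_tensor([1, 2, 3]).tolist())  # [1, 2, 3]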

@@ -13,6 +13,7 @@
 # limitations under the License.
 """Contains U2 model."""
+import sys
 import time
 import logging
 import numpy as np
@@ -256,11 +257,19 @@ class U2Tester(U2Trainer):
         cutoff_prob=1.0,  # Cutoff probability for pruning.
         cutoff_top_n=40,  # Cutoff number for pruning.
         lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-        decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+        decoding_method='attention',  # Decoding method. Options: 'attention', 'ctc_greedy_search',
+        # 'ctc_prefix_beam_search', 'attention_rescoring'
         error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
         num_proc_bsearch=8,  # # of CPUs for beam search.
-        beam_size=500,  # Beam search width.
-        batch_size=128,  # decoding batch size
+        beam_size=10,  # Beam search width.
+        batch_size=16,  # decoding batch size
+        ctc_weight=0.0,  # ctc weight for attention rescoring decode mode.
+        decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
+        # <0: for decoding, use full chunk.
+        # >0: for decoding, use fixed chunk size as set.
+        # 0: used for training, it's prohibited here.
+        num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
+        simulate_streaming=False,  # simulate streaming inference. Defaults to False.
     ))
 
     if config is not None:
@@ -279,19 +288,19 @@ class U2Tester(U2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans
 
-    def compute_metrics(self, audio, texts, audio_len, texts_len):
+    def compute_metrics(self, audio, audio_len, texts, texts_len, fout=None):
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
         error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
 
-        vocab_list = self.test_loader.dataset.vocab_list
+        text_feature = self.test_loader.dataset.text_feature
 
         target_transcripts = self.ordid2token(texts, texts_len)
         result_transcripts = self.model.decode(
             audio,
             audio_len,
-            vocab_list,
+            text_feature=text_feature,
             decoding_method=cfg.decoding_method,
             lang_model_path=cfg.lang_model_path,
             beam_alpha=cfg.alpha,
@@ -299,13 +308,19 @@ class U2Tester(U2Trainer):
             beam_size=cfg.beam_size,
             cutoff_prob=cfg.cutoff_prob,
             cutoff_top_n=cfg.cutoff_top_n,
-            num_processes=cfg.num_proc_bsearch)
+            num_processes=cfg.num_proc_bsearch,
+            ctc_weight=cfg.ctc_weight,
+            decoding_chunk_size=cfg.decoding_chunk_size,
+            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
+            simulate_streaming=cfg.simulate_streaming)
 
         for target, result in zip(target_transcripts, result_transcripts):
             errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
+            if fout:
+                fout.write(result + "\n")
             self.logger.info(
                 "\nTarget Transcription: %s\nOutput Transcription: %s" %
                 (target, result))
@@ -322,6 +337,7 @@ class U2Tester(U2Trainer):
     @mp_tools.rank_zero_only
     @paddle.no_grad()
     def test(self):
+        assert self.args.result_file
         self.model.eval()
         self.logger.info(
             f"Test Total Examples: {len(self.test_loader.dataset)}")
@@ -329,14 +345,16 @@ class U2Tester(U2Trainer):
         error_rate_type = None
         errors_sum, len_refs, num_ins = 0.0, 0, 0
-        for i, batch in enumerate(self.test_loader):
-            metrics = self.compute_metrics(*batch)
-            errors_sum += metrics['errors_sum']
-            len_refs += metrics['len_refs']
-            num_ins += metrics['num_ins']
-            error_rate_type = metrics['error_rate_type']
-            self.logger.info("Error rate [%s] (%d/?) = %f" %
-                             (error_rate_type, num_ins, errors_sum / len_refs))
+        with open(self.args.result_file, 'w') as fout:
+            for i, batch in enumerate(self.test_loader):
+                metrics = self.compute_metrics(*batch, fout=fout)
+                errors_sum += metrics['errors_sum']
+                len_refs += metrics['len_refs']
+                num_ins += metrics['num_ins']
+                error_rate_type = metrics['error_rate_type']
+                self.logger.info(
+                    "Error rate [%s] (%d/?) = %f" %
+                    (error_rate_type, num_ins, errors_sum / len_refs))
 
         # logging
         msg = "Test: "
@@ -351,24 +369,34 @@ class U2Tester(U2Trainer):
         try:
             self.test()
         except KeyboardInterrupt:
-            exit(-1)
+            sys.exit(-1)
 
-    def export(self):
+    def load_inferspec(self):
+        """infer model and input spec.
+
+        Returns:
+            nn.Layer: inference model
+            List[paddle.static.InputSpec]: input spec.
+        """
         from deepspeech.models.u2 import U2InferModel
         infer_model = U2InferModel.from_pretrained(self.test_loader.dataset,
                                                    self.config.model.clone(),
                                                    self.args.checkpoint_path)
-        infer_model.eval()
         feat_dim = self.test_loader.dataset.feature_size
-        static_model = paddle.jit.to_static(
-            infer_model,
-            input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, feat_dim, None],
-                    dtype='float32'),  # audio, [B,D,T]
-                paddle.static.InputSpec(shape=[None],
-                                        dtype='int64'),  # audio_length, [B]
-            ])
+        input_spec = [
+            paddle.static.InputSpec(
+                shape=[None, feat_dim, None],
+                dtype='float32'),  # audio, [B,D,T]
+            paddle.static.InputSpec(shape=[None],
+                                    dtype='int64'),  # audio_length, [B]
+        ]
+        return infer_model, input_spec
+
+    def export(self):
+        infer_model, input_spec = self.load_inferspec()
+        assert isinstance(input_spec, list), type(input_spec)
+        infer_model.eval()
+        static_model = paddle.jit.to_static(infer_model, input_spec=input_spec)
         logger.info(f"Export code: {static_model.forward.code}")
         paddle.jit.save(static_model, self.args.export_path)
@@ -376,7 +404,7 @@ class U2Tester(U2Trainer):
         try:
             self.export()
         except KeyboardInterrupt:
-            exit(-1)
+            sys.exit(-1)
 
     def setup(self):
         """Setup the experiment.

@@ -107,8 +107,8 @@ class SpeechFeaturizer(object):
     def vocab_size(self):
         """Return the vocabulary size.
 
-        :return: Vocabulary size.
-        :rtype: int
+        Returns:
+            int: Vocabulary size.
         """
         return self._text_featurizer.vocab_size
@@ -116,16 +116,34 @@ class SpeechFeaturizer(object):
     def vocab_list(self):
         """Return the vocabulary in list.
 
-        :return: Vocabulary in list.
-        :rtype: list
+        Returns:
+            List[str]:
         """
         return self._text_featurizer.vocab_list
 
+    @property
+    def vocab_dict(self):
+        """Return the vocabulary in dict.
+
+        Returns:
+            Dict[str, int]:
+        """
+        return self._text_featurizer.vocab_dict
+
     @property
     def feature_size(self):
         """Return the audio feature size.
 
-        :return: audio feature size.
-        :rtype: int
+        Returns:
+            int: audio feature size.
         """
         return self._audio_featurizer.feature_size
+
+    @property
+    def text_feature(self):
+        """Return the text feature object.
+
+        Returns:
+            TextFeaturizer: object.
+        """
+        return self._text_featurizer

@@ -16,6 +16,7 @@
 import sentencepiece as spm
 
 from deepspeech.frontend.utility import UNK
+from deepspeech.frontend.utility import EOS
 
 
 class TextFeaturizer(object):
@@ -32,10 +33,12 @@ class TextFeaturizer(object):
             spm_model_prefix (str, optional): spm model prefix. Defaults to None.
         """
         assert unit_type in ('char', 'spm', 'word')
-        self.unk = UNK
         self.unit_type = unit_type
-        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
+        self._vocab_dict, self._id2token, self._vocab_list = self._load_vocabulary_from_file(
             vocab_filepath)
+        self.unk = UNK
+        self.unk_id = self._vocab_list.index(self.unk)
+        self.eos_id = self._vocab_list.index(EOS)
 
         if unit_type == 'spm':
             spm_model = spm_model_prefix + '.model'
@@ -51,14 +54,23 @@ class TextFeaturizer(object):
             tokens = self.spm_tokenize(text)
         return tokens
 
+    def detokenize(self, tokens):
+        if self.unit_type == 'char':
+            text = self.char_detokenize(tokens)
+        elif self.unit_type == 'word':
+            text = self.word_detokenize(tokens)
+        else:  # spm
+            text = self.spm_detokenize(tokens)
+        return text
+
     def featurize(self, text):
-        """Convert text string to a list of token indices in char-level.Note
-        that the token indexing order follows the given vocabulary file.
+        """Convert text string to a list of token indices.
 
-        :param text: Text to process.
-        :type text: str
-        :return: List of char-level token indices.
-        :rtype: List[int]
+        Args:
+            text (str): Text to process.
+
+        Returns:
+            List[int]: List of token indices.
         """
         tokens = self.tokenize(text)
         ids = []
@@ -67,6 +79,24 @@ class TextFeaturizer(object):
                 ids.append(self._vocab_dict[token])
         return ids
 
+    def defeaturize(self, idxs):
+        """Convert a list of token indices to text string,
+        ignore index after eos_id.
+
+        Args:
+            idxs (List[int]): List of token indices.
+
+        Returns:
+            str: Text to process.
+        """
+        tokens = []
+        for idx in idxs:
+            if idx == self.eos_id:
+                break
+            tokens.append(self._id2token[idx])
+        text = self.detokenize(tokens)
+        return text
+
     @property
     def vocab_size(self):
         """Return the vocabulary size.
@@ -80,19 +110,50 @@ class TextFeaturizer(object):
     def vocab_list(self):
         """Return the vocabulary in list.
 
-        :return: Vocabulary in list.
-        :rtype: list
+        Returns:
+            List[str]: tokens.
         """
         return self._vocab_list
 
+    @property
+    def vocab_dict(self):
+        """Return the vocabulary in dict.
+
+        Returns:
+            Dict[str, int]: token str -> int
+        """
+        return self._vocab_dict
+
     def char_tokenize(self, text):
-        """Character tokenizer."""
+        """Character tokenizer.
+
+        Args:
+            text (str): text string.
+
+        Returns:
+            List[str]: tokens.
+        """
         return list(text.strip())
 
+    def char_detokenize(self, tokens):
+        """Character detokenizer.
+
+        Args:
+            tokens (List[str]): tokens.
+
+        Returns:
+            str: text string.
+        """
+        return "".join(tokens)
+
     def word_tokenize(self, text):
-        """Word tokenizer, spearte by <space>."""
+        """Word tokenizer, separate by <space>."""
         return text.strip().split()
 
+    def word_detokenize(self, tokens):
+        """Word detokenizer, separate by <space>."""
+        return " ".join(tokens)
+
     def spm_tokenize(self, text):
         """spm tokenize.
@@ -125,12 +186,34 @@ class TextFeaturizer(object):
         enc_line = encode_line(text)
         return enc_line
 
+    def spm_detokenize(self, tokens, input_format='piece'):
+        """spm detokenize.
+
+        Args:
+            ids (List[str]): tokens.
+
+        Returns:
+            str: text
+        """
+        if input_format == "piece":
+
+            def decode(l):
+                return "".join(self.sp.DecodePieces(l))
+        elif input_format == "id":
+
+            def decode(l):
+                return "".join(self.sp.DecodeIds(l))
+
+        return decode(tokens)
+
     def _load_vocabulary_from_file(self, vocab_filepath):
         """Load vocabulary from file."""
         vocab_lines = []
         with open(vocab_filepath, 'r', encoding='utf-8') as file:
             vocab_lines.extend(file.readlines())
         vocab_list = [line[:-1] for line in vocab_lines]
-        vocab_dict = dict(
-            [(token, id) for (id, token) in enumerate(vocab_list)])
-        return vocab_dict, vocab_list
+        id2token = dict(
+            [(idx, token) for (idx, token) in enumerate(vocab_list)])
+        token2id = dict(
+            [(token, idx) for (idx, token) in enumerate(vocab_list)])
+        return token2id, id2token, vocab_list
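
The new `defeaturize` is the inverse of `featurize`: ids are mapped back to tokens, cut off at the first `eos_id`, then joined by the unit-specific detokenizer. A minimal char-level sketch of that round trip, independent of the repo (the tiny vocabulary is made up for illustration):

# Char-level featurize/defeaturize round trip with a toy vocabulary.
vocab_list = ['<unk>', ' ', 'a', 'b', 'c', '<eos>']
token2id = {token: idx for idx, token in enumerate(vocab_list)}
id2token = {idx: token for idx, token in enumerate(vocab_list)}
eos_id = token2id['<eos>']


def featurize(text):
    # One token per character; unknown characters map to <unk>.
    return [token2id.get(ch, token2id['<unk>']) for ch in text.strip()]


def defeaturize(idxs):
    # Stop at the first eos, then join the characters back into a string.
    tokens = []
    for idx in idxs:
        if idx == eos_id:
            break
        tokens.append(id2token[idx])
    return "".join(tokens)


ids = featurize("ab cab")
print(ids)                                # [2, 3, 1, 4, 2, 3]
print(defeaturize(ids + [eos_id, 0, 0]))  # 'ab cab'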

@@ -233,22 +233,20 @@ class ManifestDataset(Dataset):
     @property
     def vocab_size(self):
-        """Return the vocabulary size.
-
-        :return: Vocabulary size.
-        :rtype: int
-        """
         return self._speech_featurizer.vocab_size
 
     @property
     def vocab_list(self):
-        """Return the vocabulary in list.
-
-        :return: Vocabulary in list.
-        :rtype: list
-        """
         return self._speech_featurizer.vocab_list
 
+    @property
+    def vocab_dict(self):
+        return self._speech_featurizer.vocab_dict
+
+    @property
+    def text_feature(self):
+        return self._speech_featurizer.text_feature
+
     @property
     def feature_size(self):
         return self._speech_featurizer.feature_size

@@ -16,10 +16,11 @@ Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recogni
 (https://arxiv.org/pdf/2012.05481.pdf)
 """
+import sys
 from collections import defaultdict
 import logging
 from yacs.config import CfgNode
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict
 
 import paddle
 from paddle import jit
@@ -132,6 +133,7 @@ class U2BaseModel(nn.Module):
             smoothing=lsm_weight,
             normalize_length=length_normalized_loss, )
 
+    @jit.export
     def forward(
             self,
             speech: paddle.Tensor,
@@ -158,7 +160,7 @@ class U2BaseModel(nn.Module):
         encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
         #TODO(Hui Zhang): sum not support bool type
         #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
-        encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int64).sum(
+        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
             1) #[B, 1, T] -> [B]
 
         # 2a. Attention-decoder branch
@@ -301,14 +303,15 @@ class U2BaseModel(nn.Module):
         # log scale score
         scores = paddle.to_tensor(
             [0.0] + [-float('inf')] * (beam_size - 1), dtype=paddle.float)
-        scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to(
+        scores = scores.to(device).repeat(batch_size).unsqueeze(1).to(
             device)  # (B*N, 1)
         end_flag = paddle.zeros_like(scores, dtype=paddle.bool)  # (B*N, 1)
         cache: Optional[List[paddle.Tensor]] = None
         # 2. Decoder forward step by step
         for i in range(1, maxlen + 1):
             # Stop if all batch and all beam produce eos
-            if end_flag.sum() == running_size:
+            # TODO(Hui Zhang): if end_flag.sum() == running_size:
+            if end_flag.cast(paddle.int64).sum() == running_size:
                 break
 
             # 2.1 Forward decoder step
@@ -333,7 +336,7 @@ class U2BaseModel(nn.Module):
             # regard top_k_index as (B*N*N),regard offset_k_index as (B*N),
             # then find offset_k_index in top_k_index
             base_k_index = paddle.arange(batch_size).view(-1, 1).repeat(
-                [1, beam_size])  # (B, N)
+                1, beam_size)  # (B, N)
             base_k_index = base_k_index * beam_size * beam_size
             best_k_index = base_k_index.view(-1) + offset_k_index.view(
                 -1)  # (B*N)
@@ -678,6 +681,108 @@ class U2BaseModel(nn.Module):
         decoder_out = paddle.nn.functional.log_softmax(decoder_out, dim=-1)
         return decoder_out
 
+    @paddle.no_grad()
+    def decode(self,
+               feats: paddle.Tensor,
+               feats_lengths: paddle.Tensor,
+               text_feature: Dict[str, int],
+               decoding_method: str,
+               lang_model_path: str,
+               beam_alpha: float,
+               beam_beta: float,
+               beam_size: int,
+               cutoff_prob: float,
+               cutoff_top_n: int,
+               num_processes: int,
+               ctc_weight: float=0.0,
+               decoding_chunk_size: int=-1,
+               num_decoding_left_chunks: int=-1,
+               simulate_streaming: bool=False):
+        """u2 decoding.
+
+        Args:
+            feats (Tensor): audio features, (B, T, D)
+            feats_lengths (Tensor): (B)
+            text_feature (TextFeaturizer): text feature object.
+            decoding_method (str): decoding mode, e.g.
+                'attention', 'ctc_greedy_search',
+                'ctc_prefix_beam_search', 'attention_rescoring'
+            lang_model_path (str): lm path.
+            beam_alpha (float): lm weight.
+            beam_beta (float): length penalty.
+            beam_size (int): beam size for search
+            cutoff_prob (float): for prune.
+            cutoff_top_n (int): for prune.
+            num_processes (int):
+            ctc_weight (float, optional): ctc weight for attention rescoring decode mode. Defaults to 0.0.
+            decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1.
+                <0: for decoding, use full chunk.
+                >0: for decoding, use fixed chunk size as set.
+                0: used for training, it's prohibited here.
+            num_decoding_left_chunks (int, optional):
+                number of left chunks for decoding. Defaults to -1.
+            simulate_streaming (bool, optional): simulate streaming inference. Defaults to False.
+
+        Raises:
+            ValueError: when not support decoding_method.
+
+        Returns:
+            List[str]: transcripts.
+        """
+        batch_size = feats.size(0)
+        if decoding_method in ['ctc_prefix_beam_search',
+                               'attention_rescoring'] and batch_size > 1:
+            logger.fatal(
+                f'decoding mode {decoding_method} must be running with batch_size == 1'
+            )
+            sys.exit(1)
+
+        if decoding_method == 'attention':
+            hyps = self.recognize(
+                feats,
+                feats_lengths,
+                beam_size=beam_size,
+                decoding_chunk_size=decoding_chunk_size,
+                num_decoding_left_chunks=num_decoding_left_chunks,
+                simulate_streaming=simulate_streaming)
+            hyps = [hyp.tolist() for hyp in hyps]
+        elif decoding_method == 'ctc_greedy_search':
+            hyps = self.ctc_greedy_search(
+                feats,
+                feats_lengths,
+                decoding_chunk_size=decoding_chunk_size,
+                num_decoding_left_chunks=num_decoding_left_chunks,
+                simulate_streaming=simulate_streaming)
+        # ctc_prefix_beam_search and attention_rescoring only return one
+        # result in List[int], change it to List[List[int]] for compatible
+        # with other batch decoding mode
+        elif decoding_method == 'ctc_prefix_beam_search':
+            assert feats.size(0) == 1
+            hyp = self.ctc_prefix_beam_search(
+                feats,
+                feats_lengths,
+                beam_size,
+                decoding_chunk_size=decoding_chunk_size,
+                num_decoding_left_chunks=num_decoding_left_chunks,
+                simulate_streaming=simulate_streaming)
+            hyps = [hyp]
+        elif decoding_method == 'attention_rescoring':
+            assert feats.size(0) == 1
+            hyp = self.attention_rescoring(
+                feats,
+                feats_lengths,
+                beam_size,
+                decoding_chunk_size=decoding_chunk_size,
+                num_decoding_left_chunks=num_decoding_left_chunks,
+                ctc_weight=ctc_weight,
+                simulate_streaming=simulate_streaming)
+            hyps = [hyp]
+        else:
+            raise ValueError(f"Not support decoding method: {decoding_method}")
+
+        res = [text_feature.defeaturize(hyp) for hyp in hyps]
+        return res
+
 
 class U2Model(U2BaseModel):
     def __init__(self, configs: dict):
@@ -779,14 +884,24 @@ class U2InferModel(U2Model):
     def __init__(self, configs: dict):
         super().__init__(configs)
 
-    def forward(self, audio, audio_len):
+    def forward(self,
+                feats,
+                feats_lengths,
+                decoding_chunk_size=-1,
+                num_decoding_left_chunks=-1,
+                simulate_streaming=False):
         """export model function
 
         Args:
-            audio (Tensor): [B, T, D]
-            audio_len (Tensor): [B]
+            feats (Tensor): [B, T, D]
+            feats_lengths (Tensor): [B]
 
         Returns:
-            probs: probs after softmax
+            List[List[int]]: best path result
         """
-        raise NotImplementedError("U2Model infer")
+        return self.ctc_greedy_search(
+            feats,
+            feats_lengths,
+            decoding_chunk_size=decoding_chunk_size,
+            num_decoding_left_chunks=num_decoding_left_chunks,
+            simulate_streaming=simulate_streaming)
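
Putting the pieces together, `compute_metrics` in the tester drives the new `decode` entry point roughly as in the sketch below; the argument objects (`model`, `feats`, `feats_lengths`, `text_feature`, `cfg`) stand in for the real ones built by the experiment code and are not defined here:

# Hedged sketch of how the tester calls U2BaseModel.decode.
def run_decode(model, feats, feats_lengths, text_feature, cfg):
    """Return one transcript string per utterance in the batch."""
    return model.decode(
        feats,                      # (B, T, D) padded audio features
        feats_lengths,              # (B,) valid lengths
        text_feature=text_feature,  # TextFeaturizer: ids -> text via defeaturize
        decoding_method=cfg.decoding_method,  # 'attention', 'ctc_greedy_search',
                                              # 'ctc_prefix_beam_search', 'attention_rescoring'
        lang_model_path=cfg.lang_model_path,  # not consumed by the branches above
        beam_alpha=cfg.alpha,
        beam_beta=cfg.beta,
        beam_size=cfg.beam_size,
        cutoff_prob=cfg.cutoff_prob,
        cutoff_top_n=cfg.cutoff_top_n,
        num_processes=cfg.num_proc_bsearch,
        ctc_weight=cfg.ctc_weight,
        decoding_chunk_size=cfg.decoding_chunk_size,  # -1 = full chunk
        num_decoding_left_chunks=cfg.num_decoding_left_chunks,
        simulate_streaming=cfg.simulate_streaming)

# Note: 'ctc_prefix_beam_search' and 'attention_rescoring' require batch_size == 1.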

@@ -224,9 +224,28 @@ class CTCDecoder(nn.Layer):
     def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
                      lang_model_path, beam_alpha, beam_beta, beam_size,
                      cutoff_prob, cutoff_top_n, num_processes):
-        """ probs: activation after softmax
-        logits_len: audio output lens
+        """ctc decoding with probs.
+
+        Args:
+            probs (Tensor): activation after softmax
+            logits_lens (Tensor): audio output lens
+            vocab_list (List[str]): vocabulary in list.
+            decoding_method (str): 'ctc_greedy' or 'ctc_beam_search'.
+            lang_model_path (str): filepath for language model.
+            beam_alpha (float): lm weight.
+            beam_beta (float): length penalty.
+            beam_size (int): beam search width.
+            cutoff_prob (float): cutoff probability for pruning.
+            cutoff_top_n (int): cutoff number for pruning.
+            num_processes (int): number of CPUs for beam search.
+
+        Raises:
+            ValueError: when decoding_method not support.
+
+        Returns:
+            List[str]: transcripts.
         """
         probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
         if decoding_method == "ctc_greedy":
             result_transcripts = self._decode_batch_greedy(

@@ -176,5 +176,5 @@ class TransformerDecoder(nn.Module):
         else:
             y = x[:, -1]
         if self.use_output_layer:
-            y = paddle.log_softmax(self.output_layer(y), dim=-1)
+            y = paddle.log_softmax(self.output_layer(y), axis=-1)
         return y, new_cache

@@ -101,12 +101,17 @@ class DecoderLayer(nn.Module):
             tgt_q_mask = tgt_mask
         else:
             # compute only the last frame query keeping dim: max_time_out -> 1
-            assert cache.shape == (
-                tgt.shape[0], tgt.shape[1] - 1, self.size,
-            ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
+            assert cache.shape == [
+                tgt.shape[0],
+                tgt.shape[1] - 1,
+                self.size,
+            ], f"{cache.shape} == {[tgt.shape[0], tgt.shape[1] - 1, self.size]}"
             tgt_q = tgt[:, -1:, :]
             residual = residual[:, -1:, :]
-            tgt_q_mask = tgt_mask[:, -1:, :]
+            # TODO(Hui Zhang): slice not support bool type
+            # tgt_q_mask = tgt_mask[:, -1:, :]
+            tgt_q_mask = tgt_mask.cast(paddle.int64)[:, -1:, :].cast(
+                paddle.bool)
 
         if self.concat_after:
             tgt_concat = paddle.cat(

@@ -41,6 +41,7 @@ def sequence_mask(x_len, max_len=None, dtype='float32'):
          [[1., 1., 0., 0.],
           [1., 1., 1., 1.]]
     """
+    assert x_len.dim() == 1
     max_len = max_len or x_len.max()
     x_len = paddle.unsqueeze(x_len, -1)
     row_vector = paddle.arange(max_len)
@@ -65,6 +66,7 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
          [0, 0, 0, 1, 1],
          [0, 0, 1, 1, 1]]
     """
+    assert lengths.dim() == 1
    batch_size = int(lengths.shape[0])
    max_len = int(lengths.max())
    seq_range = paddle.arange(0, max_len, dtype=paddle.int64)
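
Both helpers boil down to one broadcast comparison between a position range and the batch lengths. A standalone sketch that reproduces the docstring matrices for lengths [5, 3, 2], using paddle directly rather than the repo helpers:

# Broadcast comparison behind make_pad_mask / make_non_pad_mask.
import paddle

lengths = paddle.to_tensor([5, 3, 2])
max_len = int(lengths.max())
seq_range = paddle.arange(0, max_len, dtype=paddle.int64)  # [0, 1, 2, 3, 4]

# Broadcast [1, T] against [B, 1]: True where the position is padding.
pad_mask = seq_range.unsqueeze(0) >= lengths.unsqueeze(-1)
non_pad_mask = paddle.logical_not(pad_mask)

print(pad_mask.cast('int64').numpy().tolist())
# [[0, 0, 0, 0, 0],
#  [0, 0, 0, 1, 1],
#  [0, 0, 1, 1, 1]]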

@@ -57,6 +57,9 @@ def default_argument_parser():
     # save jit model to
     parser.add_argument("--export_path", type=str, help="path of the jit model to save")
 
+    # save asr result to
+    parser.add_argument("--result_file", type=str, help="path of save the asr result")
+
     # running
     parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
                         help="device type to use, cpu and gpu are supported.")

@@ -126,16 +126,24 @@ training:
   lr_decay: 1.0
   log_interval: 100
 
 decoding:
-  batch_size: 128
+  batch_size: 16
   error_rate_type: wer
-  decoding_method: ctc_beam_search
+  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
   lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
   alpha: 2.5
   beta: 0.3
-  beam_size: 500
+  beam_size: 10
   cutoff_prob: 1.0
-  cutoff_top_n: 40
+  cutoff_top_n: 0
   num_proc_bsearch: 8
+  ctc_weight: 0.0  # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1  # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here.
+  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False  # simulate streaming inference. Defaults to False.
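
These keys are consumed in the tester as `cfg = self.config.decoding`. A minimal sketch of loading such a section with yacs, assuming yacs accepts new keys on an empty node via `new_allowed=True` (the yaml path is illustrative):

# Read the decoding section of a config like the one above.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.merge_from_file('conf/conformer.yaml')

cfg = config.decoding
print(cfg.decoding_method, cfg.beam_size, cfg.decoding_chunk_size)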

@@ -11,6 +11,7 @@ python3 -u ${BIN_DIR}/test.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/conformer.yaml \
+--result_file data/asr.result \
 --output ckpt
 
 if [ $? -ne 0 ]; then

@@ -17,14 +17,23 @@ import numpy as np
 import unittest
 
 from deepspeech.modules.mask import sequence_mask
 from deepspeech.modules.mask import make_non_pad_mask
+from deepspeech.modules.mask import make_pad_mask
 
 
 class TestU2Model(unittest.TestCase):
     def setUp(self):
         paddle.set_device('cpu')
         self.lengths = paddle.to_tensor([5, 3, 2])
-        self.masks = np.array(
-            [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [1, 1, 0, 0, 0]], )
+        self.masks = np.array([
+            [1, 1, 1, 1, 1],
+            [1, 1, 1, 0, 0],
+            [1, 1, 0, 0, 0],
+        ])
+        self.pad_masks = np.array([
+            [0, 0, 0, 0, 0],
+            [0, 0, 0, 1, 1],
+            [0, 0, 1, 1, 1],
+        ])
 
     def test_sequence_mask(self):
         res = sequence_mask(self.lengths)
@@ -32,7 +41,13 @@ class TestU2Model(unittest.TestCase):
     def test_make_non_pad_mask(self):
         res = make_non_pad_mask(self.lengths)
+        res1 = sequence_mask(self.lengths)
         self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
+        self.assertSequenceEqual(res.numpy().tolist(), res1.numpy().tolist())
+
+    def test_make_pad_mask(self):
+        res = make_pad_mask(self.lengths)
+        self.assertSequenceEqual(res.numpy().tolist(), self.pad_masks.tolist())
 
 
 if __name__ == '__main__':
