pull/4021/head
co63oc 6 months ago
parent 201275e7d2
commit 6d330bd477

@@ -79,7 +79,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
     # assuming trailing dimensions and type of all the Tensors
     # in sequences are same and fetching those from sequences[0]
     max_size = paddle.shape(sequences[0])
-    # (TODO Hui Zhang): slice not supprot `end==start`
+    # (TODO Hui Zhang): slice not support `end==start`
     # trailing_dims = max_size[1:]
     trailing_dims = tuple(
         max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
@@ -93,7 +93,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
         length = tensor.shape[0]
         # use index notation to prevent duplicate references to the tensor
         if batch_first:
-            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # TODO (Hui Zhang): set_value op not support `end==start`
             # TODO (Hui Zhang): set_value op not support int16
             # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
             # out_tensor[i, :length, ...] = tensor
@@ -102,7 +102,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
             else:
                 out_tensor[i, length] = tensor
         else:
-            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # TODO (Hui Zhang): set_value op not support `end==start`
             # out_tensor[:length, i, ...] = tensor
             if length != 0:
                 out_tensor[:length, i] = tensor

@@ -80,7 +80,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
     # assuming trailing dimensions and type of all the Tensors
     # in sequences are same and fetching those from sequences[0]
     max_size = paddle.shape(sequences[0])
-    # (TODO Hui Zhang): slice not supprot `end==start`
+    # (TODO Hui Zhang): slice not support `end==start`
     # trailing_dims = max_size[1:]
     trailing_dims = tuple(
         max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
@@ -98,7 +98,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
             f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}"
         )
         if batch_first:
-            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # TODO (Hui Zhang): set_value op not support `end==start`
             # TODO (Hui Zhang): set_value op not support int16
             # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
             # out_tensor[i, :length, ...] = tensor
@@ -107,7 +107,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
             else:
                 out_tensor[i, length] = tensor
         else:
-            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # TODO (Hui Zhang): set_value op not support `end==start`
             # out_tensor[:length, i, ...] = tensor
             if length != 0:
                 out_tensor[:length, i] = tensor
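Context for both pad_sequence hunks above: the `if length != 0` guard exists because Paddle's set_value op rejects empty slices (`end == start`), per the TODO comments. A minimal usage sketch, assuming Paddle's dynamic-graph mode; the tensors and shapes are illustrative, not from the patch:

    import paddle

    # Pad two variable-length 1-D tensors into a [batch, max_len] tensor,
    # mirroring the batch_first branch of pad_sequence above.
    a = paddle.to_tensor([1, 2, 3])
    b = paddle.to_tensor([4, 5])
    out = paddle.zeros([2, 3], dtype=a.dtype)
    for i, t in enumerate([a, b]):
        length = t.shape[0]
        if length != 0:  # guard against set_value's `end == start` limitation
            out[i, :length] = t
    print(out.numpy())  # [[1 2 3], [4 5 0]]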

@@ -156,8 +156,8 @@ class Analysis:
         return self.text[self.pos]
     # Check whether the character is a Chinese character (excluding Chinese punctuation)
-    def isChineseChar(self, charater):
-        return 0x4e00 <= ord(charater) < 0x9fa6
+    def isChineseChar(self, character):
+        return 0x4e00 <= ord(character) < 0x9fa6
     # Check whether the character is ASCII
     def isASCIIChar(self, ch):
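Renamed parameter aside, isChineseChar tests whether a code point falls in the CJK Unified Ideographs block (U+4E00 up to, but not including, U+9FA6), which is why Chinese punctuation is excluded. A standalone illustration of the same check:

    # Same range test as isChineseChar above, shown outside the class.
    def is_chinese_char(character: str) -> bool:
        return 0x4e00 <= ord(character) < 0x9fa6

    assert is_chinese_char('中')        # U+4E2D: a CJK ideograph
    assert not is_chinese_char('A')     # ASCII
    assert not is_chinese_char('。')    # Chinese punctuation is excluded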

@@ -66,8 +66,8 @@ config_file=./conf/application.yaml
 server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
 port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
-echo "Sevice ip: $server_ip" | tee ./log/test_result.log
-echo "Sevice port: $port" | tee -a ./log/test_result.log
+echo "Service ip: $server_ip" | tee ./log/test_result.log
+echo "Service port: $port" | tee -a ./log/test_result.log
 # whether a process is listening on $port
 pid=`lsof -i :"$port"|grep -v "PID" | awk '{print $2}'`
@@ -190,7 +190,7 @@ echo "**************************************************************************
 echo "All tests completed." | tee -a ./log/test_result.log
-# sohw all the test results
+# show all the test results
 echo "***************** Here are all the test results ********************"
 cat ./log/test_result.log

@@ -76,8 +76,8 @@ config_file=./conf/application.yaml
 server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
 port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
-echo "Sevice ip: $server_ip" | tee $log/test_result.log
-echo "Sevice port: $port" | tee -a $log/test_result.log
+echo "Service ip: $server_ip" | tee $log/test_result.log
+echo "Service port: $port" | tee -a $log/test_result.log
 # whether a process is listening on $port
 pid=`lsof -i :"$port"|grep -v "PID" | awk '{print $2}'`
@@ -307,7 +307,7 @@ echo "**************************************************************************
 echo "All tests completed." | tee -a $log/test_result.log
-# sohw all the test results
+# show all the test results
 echo "***************** Here are all the test results ********************"
 cat $log/test_result.log
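Both test scripts scrape the host and port out of conf/application.yaml with grep/awk. For reference, a hedged Python equivalent, assuming PyYAML is installed and the file has top-level host: and port: keys (the awk version matches any line containing those words):

    import yaml

    # Read the service address the same way the grep/awk pipeline intends to.
    with open('./conf/application.yaml') as f:
        config = yaml.safe_load(f)
    print('Service ip: {}'.format(config['host']))
    print('Service port: {}'.format(config['port']))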

@@ -23,7 +23,7 @@ cd ..
 (
   [ ! -z "${LIBLBFGS}" ] && \
-    echo >&2 "LIBLBFGS variable is aleady defined. Undefining..." && \
+    echo >&2 "LIBLBFGS variable is already defined. Undefining..." && \
     unset LIBLBFGS
   [ -f ./env.sh ] && . ./env.sh

@@ -68,7 +68,7 @@ make || exit
 cd ..
 (
   [ ! -z "${SRILM}" ] && \
-    echo >&2 "SRILM variable is aleady defined. Undefining..." && \
+    echo >&2 "SRILM variable is already defined. Undefining..." && \
     unset SRILM
   [ -f ./env.sh ] && . ./env.sh

@@ -32,7 +32,7 @@ def main(args):
         # leaving `token`
         print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
         node += 1
-    # Fianl node
+    # Final node
     print('0')
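For context on the corrected comment: the script emits an FST in OpenFst's text format, where an arc line is `src dest ilabel olabel [weight]` and a line holding just a state id marks that state as final, which is what the trailing `print('0')` does. A toy sketch of the same output pattern (the states and labels here are hypothetical):

    # Emit two <eps>:<eps> arcs into state 2, then mark state 0 as final.
    for node in (1, 3):
        print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
    print('0')  # final node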

@@ -21,7 +21,7 @@ cp -r $src_lang $tgt_lang
 # eps2disambig.pl: replace epsilons on the input side with the special disambiguation symbol #0.
 # s2eps.pl: replaces <s> and </s> with <eps> (on both input and output sides), for the G.fst acceptor.
 # G.fst, the disambiguation symbol #0 only appears on the input side
-# do eps2disambig.pl and s2eps.pl maybe just for fallowing `fstrmepsilon`.
+# do eps2disambig.pl and s2eps.pl maybe just for following `fstrmepsilon`.
 cat $arpa_lm | \
    grep -v '<s> <s>' | \
    grep -v '</s> <s>' | \
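To make the corrected comment concrete: eps2disambig.pl rewrites input-side epsilons to the disambiguation symbol #0 so the following `fstrmepsilon` does not remove them. A hedged sketch of that per-arc rewrite on text-format FST lines, a simplification of the actual Perl script, not its full behavior:

    # Replace <eps> on the input side (third field of an arc line) with #0;
    # output labels and final-state lines are left untouched.
    def eps2disambig(line: str) -> str:
        fields = line.split()
        if len(fields) >= 4 and fields[2] == '<eps>':
            fields[2] = '#0'
        return ' '.join(fields)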

@@ -3,7 +3,7 @@
 '''
 Merge training configs into a single inference config.
 The single inference config is for CLI, which only takes a single config to do inferencing.
-The trainig configs includes: model config, preprocess config, decode config, vocab file and cmvn file.
+The training configs includes: model config, preprocess config, decode config, vocab file and cmvn file.
 Process:
 # step 1: prepare dir
@@ -11,7 +11,7 @@
 cp -r exp conf data release_dir
 cd release_dir
-# step 2: get "model.yaml" which conatains all configuration info.
+# step 2: get "model.yaml" which contains all configuration info.
 # if does not contain preprocess.yaml file. e.g ds2:
 python generate_infer_yaml.py --cfg_pth conf/deepspeech2_online.yaml --dcd_pth conf/tuning/chunk_decode.yaml --vb_pth data/lang_char/vocab.txt --cmvn_pth data/mean_std.json --save_pth model.yaml --pre_pth null
 # if contains preprocess.yaml file. e.g u2:
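A hedged sketch of the merge step the docstring describes; merge_configs is a hypothetical helper, not the actual generate_infer_yaml.py logic, and it assumes PyYAML plus flat, later-wins override semantics:

    import yaml

    def merge_configs(*paths):
        # Later files override earlier ones at the top level.
        merged = {}
        for path in paths:
            with open(path) as f:
                merged.update(yaml.safe_load(f) or {})
        return merged

    config = merge_configs('conf/deepspeech2_online.yaml',
                           'conf/tuning/chunk_decode.yaml')
    with open('model.yaml', 'w') as f:
        yaml.safe_dump(config, f)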

@@ -37,7 +37,7 @@ fi
 # the text should be properly pre-processed, e.g:
 # cleand, normalized and possibly word-segmented
-# get rid off irrelavent symbols
+# get rid off irrelevant symbols
 grep -v '<eps>' $symbol_table \
     | grep -v '#0' \
     | grep -v '<unk>' | grep -v '<UNK>' \
@@ -51,7 +51,7 @@ grep -v '<eps>' $symbol_table \
 #
 # TL;DR reason:
 # Unlike SRILM's -limit-vocab, kenlm's --limit_vocab_file option
-# spcifies a *valid* set of vocabulary, whereas *valid but unseen*
+# specifies a *valid* set of vocabulary, whereas *valid but unseen*
 # words are discarded in final arpa.
 # So the trick is,
 # we explicitly add kaldi's vocab(one word per line) to training text,
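The trick the comment describes can be sketched as plain file concatenation (the file names here are hypothetical): prepending the vocabulary, one word per line, to the training text guarantees every valid word is seen at least once, so it survives kenlm's --limit_vocab_file filtering as a unigram in the final arpa:

    # Prepend the vocab to the training text before running kenlm training.
    with open('text.with_vocab', 'w') as out:
        for path in ('vocab.txt', 'train_text.txt'):
            with open(path) as f:
                out.writelines(f)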

@@ -1288,7 +1288,7 @@ def normalize_corpus(corpus,
 def char_token(s: Text) -> List[Text]:
-    """chinese charactor
+    """chinese character
     Args:
         s (Text): "我爱中国“
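Given the docstring's example, a hedged sketch of what char_token presumably does: split a Chinese string into single-character tokens.

    from typing import List, Text

    def char_token(s: Text) -> List[Text]:
        """Split a string into single-character tokens."""
        return list(s)

    assert char_token('我爱中国') == ['我', '爱', '中', '国']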
