add aishell eg & add json parser & add write cmvn binary

pull/1676/head
Yang Zhou 2 years ago
parent e75b906e11
commit ec33f8d73b

@ -2,7 +2,9 @@
data=$1
feat_scp=$2
numsplit=$3
split_feat_name=$3
numsplit=$4
if ! [ "$numsplit" -gt 0 ]; then
echo "Invalid num-split argument";
@ -10,7 +12,7 @@ if ! [ "$numsplit" -gt 0 ]; then
fi
directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/feats.scp; done)
feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_feat_name}; done)
echo $feat_split_scp
# if this mkdir fails due to argument-list being too long, iterate.
if ! mkdir -p $directories >&/dev/null; then

@ -22,54 +22,60 @@ if [ ! -d ../paddle_asr_model ]; then
fi
mkdir -p data
if [ ! -d ./test ]; then
data=$PWD/data
aishell_wav_scp=aishell_test.scp
if [ ! -d $data/test ]; then
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
unzip aishell_test.zip
realpath ./test/*/*.wav > wavlist
awk -F '/' '{ print $(NF) }' wavlist | awk -F '.' '{ print $1 }' > utt_id
paste utt_id wavlist > aishell_test.scp
unzip -d $data aishell_test.zip
realpath $data/test/*/*.wav > $data/wavlist
awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi
if [ ! -d aishell_ds2_online_model ]; then
mkdir -p aishell_ds2_online_model
wget -P ./aishell_ds2_online_model -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz
tar xzfv ./aishell_ds2_online_model/aishell_ds2_online_cer8.00_release.tar.gz -C ./aishell_ds2_online_model
model_dir=$PWD/aishell_ds2_online_model
if [ ! -d $model_dir ]; then
mkdir -p $model_dir
wget -P $model_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $model_dir
fi
# 3. make feature
aishell_wav_scp=./aishell_test.scp
aishell_online_model=./aishell_ds2_online_model/exp/deepspeech2_online/checkpoints
model_dir=../paddle_asr_model
feat_ark=./feats.ark
feat_scp=./aishell_feat.scp
cmvn=./cmvn.ark
aishell_online_model=$model_dir/exp/deepspeech2_online/checkpoints
lm_model_dir=../paddle_asr_model
label_file=./aishell_result
wer=./aishell_wer
nj=40
export GLOG_logtostderr=1
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
data=$PWD/data
# 3. gen linear feat
linear_spectrogram_main \
--wav_rspecifier=scp:$aishell_wav_scp \
--feature_wspecifier=ark,scp:$feat_ark,$feat_scp \
--cmvn_write_path=$cmvn \
--streaming_chunk=10
cmvn=$PWD/cmvn.ark
cmvn_json2binary_main --json_file=$model_dir/data/mean_std.json --cmvn_write_path=$cmvn
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat_log \
linear_spectrogram_without_db_norm_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
--cmvn_file=$cmvn \
--streaming_chunk=0.36
nj=10
data=./data
text=./test/text
# recognizer
./local/split_data.sh data aishell_feat.scp $nj
text=$data/test/text
# 4. recognizer
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log \
offline_decoder_sliding_chunk_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feats.scp \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$aishell_online_model/avg_1.jit.pdmodel \
--param_path=$aishell_online_model/avg_1.jit.pdiparams \
--dict_file=$model_dir/vocab.txt \
--lm_path=$model_dir/avg_1.jit.klm \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--dict_file=$lm_model_dir/vocab.txt \
--lm_path=$lm_model_dir/avg_1.jit.klm \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result
cat $data/split${nj}/*/result > $label_file
local/compute-wer.py --char=1 --v=1 $label_file $text > $wer
tail $wer

@ -34,6 +34,12 @@ DEFINE_int32(receptive_field_length,
DEFINE_int32(downsampling_rate,
4,
"two CNN(kernel=5) module downsampling rate.");
DEFINE_string(model_output_names,
"save_infer_model/scale_0.tmp_1,save_infer_model/"
"scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
"scale_3.tmp_1",
"model output names");
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
using kaldi::BaseFloat;
using kaldi::Matrix;
@ -68,7 +74,8 @@ int main(int argc, char* argv[]) {
ppspeech::ModelOptions model_opts;
model_opts.model_path = model_graph;
model_opts.params_path = model_params;
model_opts.cache_shape = "5-1-1024,5-1-1024";
model_opts.cache_shape = FLAGS_model_cache_names;
model_opts.output_names = FLAGS_model_output_names;
std::shared_ptr<ppspeech::PaddleNnet> nnet(
new ppspeech::PaddleNnet(model_opts));
std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());

@ -7,4 +7,12 @@ target_link_libraries(mfcc-test kaldi-mfcc)
add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc)
target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
add_executable(linear_spectrogram_without_db_norm_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_without_db_norm_main.cc)
target_include_directories(linear_spectrogram_without_db_norm_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(linear_spectrogram_without_db_norm_main frontend kaldi-util kaldi-feat-common gflags glog)
add_executable(cmvn_json2binary_main ${CMAKE_CURRENT_SOURCE_DIR}/cmvn_json2binary_main.cc)
target_include_directories(cmvn_json2binary_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(cmvn_json2binary_main utils kaldi-util kaldi-matrix gflags glog)

@ -182,6 +182,7 @@ int main(int argc, char* argv[]) {
ppspeech::LinearSpectrogramOptions opt;
opt.frame_opts.frame_length_ms = 20;
opt.frame_opts.frame_shift_ms = 10;
opt.streaming_chunk = FLAGS_streaming_chunk;
opt.frame_opts.dither = 0.0;
opt.frame_opts.remove_dc_offset = false;
opt.frame_opts.window_type = "hanning";
@ -257,6 +258,7 @@ int main(int argc, char* argv[]) {
}
}
feat_writer.Write(utt, features);
feature_cache.Reset();
if (num_done % 50 == 0 && num_done != 0)
KALDI_VLOG(2) << "Processed " << num_done << " utterances";

@ -21,15 +21,20 @@ using kaldi::BaseFloat;
using kaldi::VectorBase;
using kaldi::Vector;
AudioCache::AudioCache(int buffer_size)
AudioCache::AudioCache(int buffer_size, bool convert2PCM32)
: finished_(false),
capacity_(buffer_size),
size_(0),
offset_(0),
timeout_(1) {
timeout_(1),
convert2PCM32_(convert2PCM32) {
ring_buffer_.resize(capacity_);
}
BaseFloat AudioCache::Convert2PCM32(BaseFloat val) {
return val * (1. / std::pow(2.0, 15));
}
void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
std::unique_lock<std::mutex> lock(mutex_);
while (size_ + waves.Dim() > ring_buffer_.size()) {
@ -38,6 +43,8 @@ void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
for (size_t idx = 0; idx < waves.Dim(); ++idx) {
int32 buffer_idx = (idx + offset_) % ring_buffer_.size();
ring_buffer_[buffer_idx] = waves(idx);
if (convert2PCM32_)
ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
}
size_ += waves.Dim();
}

@ -23,7 +23,8 @@ namespace ppspeech {
// waves cache
class AudioCache : public FrontendInterface {
public:
explicit AudioCache(int buffer_size = 100*kint16max);
explicit AudioCache(int buffer_size = 1000 * kint16max,
bool convert2PCM32 = false);
virtual void Accept(const kaldi::VectorBase<BaseFloat>& waves);
@ -46,6 +47,8 @@ class AudioCache : public FrontendInterface {
}
private:
kaldi::BaseFloat Convert2PCM32(kaldi::BaseFloat val);
std::vector<kaldi::BaseFloat> ring_buffer_;
size_t offset_; // offset in ring_buffer_
size_t size_; // samples in ring_buffer_ now
@ -54,6 +57,7 @@ class AudioCache : public FrontendInterface {
mutable std::mutex mutex_;
std::condition_variable ready_feed_condition_;
kaldi::int32 timeout_; // millisecond
bool convert2PCM32_;
DISALLOW_COPY_AND_ASSIGN(AudioCache);
};

@ -46,7 +46,10 @@ class LinearSpectrogram : public FrontendInterface {
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() { base_extractor_->Reset(); }
virtual void Reset() {
base_extractor_->Reset();
reminded_wav_.Resize(0);
}
private:
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,

@ -1,4 +1,5 @@
add_library(utils
file_utils.cc
simdjson.cpp
)

@ -31,4 +31,14 @@ bool ReadFileToVector(const std::string& filename,
return true;
}
}
std::string ReadFile2String(const std::string& path) {
std::ifstream input_file(path);
if (!input_file.is_open()) {
std::cerr << "please input a valid file" << std::endl;
}
return std::string((std::istreambuf_iterator<char>(input_file)),
std::istreambuf_iterator<char>());
}
}

@ -18,4 +18,7 @@ namespace ppspeech {
bool ReadFileToVector(const std::string& filename,
std::vector<std::string>* data);
std::string ReadFile2String(const std::string& path);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save