diff --git a/speechx/examples/CMakeLists.txt b/speechx/examples/CMakeLists.txt index 7f1543c2..3c274a20 100644 --- a/speechx/examples/CMakeLists.txt +++ b/speechx/examples/CMakeLists.txt @@ -1,7 +1,4 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) -add_subdirectory(feat) -add_subdirectory(nnet) -add_subdirectory(decoder) - -add_subdirectory(glog) \ No newline at end of file +add_subdirectory(ds2_ol) +add_subdirectory(dev) \ No newline at end of file diff --git a/speechx/examples/aishell/local/compute-wer.py b/speechx/examples/aishell/local/compute-wer.py deleted file mode 100755 index a3eefc0d..00000000 --- a/speechx/examples/aishell/local/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for 
j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - 
self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN 
LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . / - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 
'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] 
= normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - 
print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # 
begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/speechx/examples/aishell/utils b/speechx/examples/aishell/utils deleted file mode 120000 index 973afe67..00000000 --- a/speechx/examples/aishell/utils +++ /dev/null @@ -1 +0,0 @@ -../../../utils \ No newline at end of file diff --git a/speechx/examples/decoder/CMakeLists.txt b/speechx/examples/decoder/CMakeLists.txt deleted file mode 100644 index d446a671..00000000 --- a/speechx/examples/decoder/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - -add_executable(offline_decoder_sliding_chunk_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_decoder_sliding_chunk_main.cc) -target_include_directories(offline_decoder_sliding_chunk_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(offline_decoder_sliding_chunk_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) - -add_executable(offline_decoder_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_decoder_main.cc) -target_include_directories(offline_decoder_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(offline_decoder_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) - -add_executable(offline_wfst_decoder_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_wfst_decoder_main.cc) -target_include_directories(offline_wfst_decoder_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(offline_wfst_decoder_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder ${DEPS}) - -add_executable(decoder_test_main ${CMAKE_CURRENT_SOURCE_DIR}/decoder_test_main.cc) 
-target_include_directories(decoder_test_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(decoder_test_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) - diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc deleted file mode 100644 index 9a9c14a0..00000000 --- a/speechx/examples/decoder/offline_decoder_main.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// todo refactor, repalce with gtest - -#include "base/flags.h" -#include "base/log.h" -#include "decoder/ctc_beam_search_decoder.h" -#include "frontend/audio/data_cache.h" -#include "kaldi/util/table-types.h" -#include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" - -DEFINE_string(feature_respecifier, "", "feature matrix rspecifier"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); -DEFINE_string(lm_path, "lm.klm", "language model"); -DEFINE_int32(chunk_size, 35, "feat chunk size"); - - -using kaldi::BaseFloat; -using kaldi::Matrix; -using std::vector; - -// test decoder by feeding speech feature, deprecated. 
-int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - kaldi::SequentialBaseFloatMatrixReader feature_reader( - FLAGS_feature_respecifier); - std::string model_graph = FLAGS_model_path; - std::string model_params = FLAGS_param_path; - std::string dict_file = FLAGS_dict_file; - std::string lm_path = FLAGS_lm_path; - int32 chunk_size = FLAGS_chunk_size; - LOG(INFO) << "model path: " << model_graph; - LOG(INFO) << "model param: " << model_params; - LOG(INFO) << "dict path: " << dict_file; - LOG(INFO) << "lm path: " << lm_path; - LOG(INFO) << "chunk size (frame): " << chunk_size; - - int32 num_done = 0, num_err = 0; - - // frontend + nnet is decodable - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_graph; - model_opts.params_path = model_params; - std::shared_ptr nnet( - new ppspeech::PaddleNnet(model_opts)); - std::shared_ptr raw_data(new ppspeech::DataCache()); - std::shared_ptr decodable( - new ppspeech::Decodable(nnet, raw_data)); - LOG(INFO) << "Init decodeable."; - - // init decoder - ppspeech::CTCBeamSearchOptions opts; - opts.dict_file = dict_file; - opts.lm_path = lm_path; - ppspeech::CTCBeamSearch decoder(opts); - LOG(INFO) << "Init decoder."; - - decoder.InitDecoder(); - for (; !feature_reader.Done(); feature_reader.Next()) { - string utt = feature_reader.Key(); - const kaldi::Matrix feature = feature_reader.Value(); - LOG(INFO) << "utt: " << utt; - - // feat dim - raw_data->SetDim(feature.NumCols()); - LOG(INFO) << "dim: " << raw_data->Dim(); - - int32 row_idx = 0; - int32 num_chunks = feature.NumRows() / chunk_size; - LOG(INFO) << "n chunks: " << num_chunks; - for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { - // feat chunk - kaldi::Vector feature_chunk(chunk_size * - feature.NumCols()); - for (int row_id = 0; row_id < chunk_size; ++row_id) { - kaldi::SubVector feat_one_row(feature, - row_idx); - kaldi::SubVector f_chunk_tmp( - 
feature_chunk.Data() + row_id * feature.NumCols(), - feature.NumCols()); - f_chunk_tmp.CopyFromVec(feat_one_row); - row_idx++; - } - // feed to raw cache - raw_data->Accept(feature_chunk); - if (chunk_idx == num_chunks - 1) { - raw_data->SetFinished(); - } - // decode step - decoder.AdvanceDecode(decodable); - } - - std::string result; - result = decoder.GetFinalBestPath(); - KALDI_LOG << " the result of " << utt << " is " << result; - decodable->Reset(); - decoder.Reset(); - ++num_done; - } - - KALDI_LOG << "Done " << num_done << " utterances, " << num_err - << " with errors."; - return (num_done != 0 ? 0 : 1); -} diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh deleted file mode 100755 index ddda8970..00000000 --- a/speechx/examples/decoder/run.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -set +x -set -e - -. path.sh - -# 1. compile -if [ ! -d ${SPEECHX_EXAMPLES} ]; then - pushd ${SPEECHX_ROOT} - bash build.sh - popd -fi - - -# 2. download model -if [ ! -d ../paddle_asr_model ]; then - wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/paddle_asr_model.tar.gz - tar xzfv paddle_asr_model.tar.gz - mv ./paddle_asr_model ../ - # produce wav scp - echo "utt1 " $PWD/../paddle_asr_model/BAC009S0764W0290.wav > ../paddle_asr_model/wav.scp -fi - -model_dir=../paddle_asr_model -feat_wspecifier=./feats.ark -cmvn=./cmvn.ark - - -export GLOG_logtostderr=1 - -# 3. gen linear feat -linear_spectrogram_main \ - --wav_rspecifier=scp:$model_dir/wav.scp \ - --feature_wspecifier=ark,t:$feat_wspecifier \ - --cmvn_write_path=$cmvn - -# 4. 
run decoder -offline_decoder_main \ - --feature_respecifier=ark:$feat_wspecifier \ - --model_path=$model_dir/avg_1.jit.pdmodel \ - --param_path=$model_dir/avg_1.jit.pdparams \ - --dict_file=$model_dir/vocab.txt \ - --lm_path=$model_dir/avg_1.jit.klm diff --git a/speechx/examples/dev/CMakeLists.txt b/speechx/examples/dev/CMakeLists.txt new file mode 100644 index 00000000..c8445fb8 --- /dev/null +++ b/speechx/examples/dev/CMakeLists.txt @@ -0,0 +1,3 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +add_subdirectory(glog) diff --git a/speechx/examples/glog/CMakeLists.txt b/speechx/examples/dev/glog/CMakeLists.txt similarity index 100% rename from speechx/examples/glog/CMakeLists.txt rename to speechx/examples/dev/glog/CMakeLists.txt diff --git a/speechx/examples/glog/README.md b/speechx/examples/dev/glog/README.md similarity index 100% rename from speechx/examples/glog/README.md rename to speechx/examples/dev/glog/README.md diff --git a/speechx/examples/glog/glog_logtostderr_test.cc b/speechx/examples/dev/glog/glog_logtostderr_test.cc similarity index 100% rename from speechx/examples/glog/glog_logtostderr_test.cc rename to speechx/examples/dev/glog/glog_logtostderr_test.cc diff --git a/speechx/examples/glog/glog_test.cc b/speechx/examples/dev/glog/glog_test.cc similarity index 100% rename from speechx/examples/glog/glog_test.cc rename to speechx/examples/dev/glog/glog_test.cc diff --git a/speechx/examples/nnet/path.sh b/speechx/examples/dev/glog/path.sh similarity index 85% rename from speechx/examples/nnet/path.sh rename to speechx/examples/dev/glog/path.sh index f70e70ee..1a96a861 100644 --- a/speechx/examples/nnet/path.sh +++ b/speechx/examples/dev/glog/path.sh @@ -1,14 +1,15 @@ # This contains the locations of binarys build required for running the examples. -SPEECHX_ROOT=$PWD/../.. 
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples +SPEECHX_ROOT=$PWD/../../../ SPEECHX_TOOLS=$SPEECHX_ROOT/tools TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } -export LC_AL=C +SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples +[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } -SPEECHX_BIN=$SPEECHX_EXAMPLES/nnet +SPEECHX_BIN=$SPEECHX_EXAMPLES/dev/glog export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN + +export LC_AL=C diff --git a/speechx/examples/glog/run.sh b/speechx/examples/dev/glog/run.sh similarity index 100% rename from speechx/examples/glog/run.sh rename to speechx/examples/dev/glog/run.sh diff --git a/speechx/examples/ds2_ol/CMakeLists.txt b/speechx/examples/ds2_ol/CMakeLists.txt new file mode 100644 index 00000000..89cbd0ef --- /dev/null +++ b/speechx/examples/ds2_ol/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +add_subdirectory(feat) +add_subdirectory(nnet) +add_subdirectory(decoder) \ No newline at end of file diff --git a/speechx/examples/ds2_ol/README.md b/speechx/examples/ds2_ol/README.md new file mode 100644 index 00000000..4904a883 --- /dev/null +++ b/speechx/examples/ds2_ol/README.md @@ -0,0 +1,11 @@ +# Deepspeech2 Streaming + +Please go to `aishell` to test it. + +* aishell +Deepspeech2 Streaming Decoding under aishell dataset. 
+ +The below is for developing and offline testing: +* nnet +* feat +* decoder \ No newline at end of file diff --git a/speechx/examples/aishell/local/split_data.sh b/speechx/examples/ds2_ol/aishell/local/split_data.sh similarity index 100% rename from speechx/examples/aishell/local/split_data.sh rename to speechx/examples/ds2_ol/aishell/local/split_data.sh diff --git a/speechx/examples/decoder/path.sh b/speechx/examples/ds2_ol/aishell/path.sh similarity index 94% rename from speechx/examples/decoder/path.sh rename to speechx/examples/ds2_ol/aishell/path.sh index a0e7c9ae..b74ffbec 100644 --- a/speechx/examples/decoder/path.sh +++ b/speechx/examples/ds2_ol/aishell/path.sh @@ -1,6 +1,6 @@ # This contains the locations of binarys build required for running the examples. -SPEECHX_ROOT=$PWD/../.. +SPEECHX_ROOT=$PWD/../../../ SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples SPEECHX_TOOLS=$SPEECHX_ROOT/tools diff --git a/speechx/examples/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh similarity index 82% rename from speechx/examples/aishell/run.sh rename to speechx/examples/ds2_ol/aishell/run.sh index 8a16a865..a002ebb8 100755 --- a/speechx/examples/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -11,18 +11,20 @@ if [ ! -d ${SPEECHX_EXAMPLES} ]; then popd fi - -# 2. download model -if [ ! -d ../paddle_asr_model ]; then - wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/paddle_asr_model.tar.gz - tar xzfv paddle_asr_model.tar.gz - mv ./paddle_asr_model ../ - # produce wav scp - echo "utt1 " $PWD/../paddle_asr_model/BAC009S0764W0290.wav > ../paddle_asr_model/wav.scp -fi - +# input mkdir -p data data=$PWD/data + +ckpt_dir=$data/model +model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ +vocb_dir=$ckpt_dir/data/lang_char/ + +lm=$data/zh_giga.no_cna_cmn.prune01244.klm + +# output +mkdir -p exp +exp=$PWD/exp + aishell_wav_scp=aishell_test.scp if [ ! 
-d $data/test ]; then wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip @@ -39,9 +41,15 @@ if [ ! -d $model_dir ]; then tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $model_dir fi +if [ ! -f $lm ]; then + pushd $data + wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm + popd +fi + + # 3. make feature aishell_online_model=$model_dir/exp/deepspeech2_online/checkpoints -lm_model_dir=../paddle_asr_model label_file=./aishell_result wer=./aishell_wer @@ -71,11 +79,11 @@ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log \ --model_path=$aishell_online_model/avg_1.jit.pdmodel \ --param_path=$aishell_online_model/avg_1.jit.pdiparams \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ - --dict_file=$lm_model_dir/vocab.txt \ + --dict_file=$vocb_dir/vocab.txt \ --result_wspecifier=ark,t:$data/split${nj}/JOB/result cat $data/split${nj}/*/result > ${label_file} -local/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer} +utils/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer} # 4. decode with lm utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_lm \ @@ -84,12 +92,14 @@ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_lm \ --model_path=$aishell_online_model/avg_1.jit.pdmodel \ --param_path=$aishell_online_model/avg_1.jit.pdiparams \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ - --dict_file=$lm_model_dir/vocab.txt \ - --lm_path=$lm_model_dir/avg_1.jit.klm \ + --dict_file=$vocb_dir/vocab.txt \ + --lm_path=$lm \ --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm + cat $data/split${nj}/*/result_lm > ${label_file}_lm -local/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm +utils/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm + graph_dir=./aishell_graph if [ ! -d $ ]; then @@ -97,6 +107,7 @@ if [ ! -d $ ]; then unzip -d aishell_graph.zip fi + # 5. 
test TLG decoder utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_tlg \ offline_wfst_decoder_main \ @@ -109,5 +120,6 @@ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_tlg \ --acoustic_scale=1.2 \ --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg + cat $data/split${nj}/*/result_tlg > ${label_file}_tlg -local/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg \ No newline at end of file +utils/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg \ No newline at end of file diff --git a/speechx/examples/ds2_ol/aishell/utils b/speechx/examples/ds2_ol/aishell/utils new file mode 120000 index 00000000..c2519a9d --- /dev/null +++ b/speechx/examples/ds2_ol/aishell/utils @@ -0,0 +1 @@ +../../../../utils/ \ No newline at end of file diff --git a/speechx/examples/ds2_ol/decoder/.gitignore b/speechx/examples/ds2_ol/decoder/.gitignore new file mode 100644 index 00000000..bbd86a25 --- /dev/null +++ b/speechx/examples/ds2_ol/decoder/.gitignore @@ -0,0 +1,2 @@ +data +exp diff --git a/speechx/examples/ds2_ol/decoder/CMakeLists.txt b/speechx/examples/ds2_ol/decoder/CMakeLists.txt new file mode 100644 index 00000000..6139ebfa --- /dev/null +++ b/speechx/examples/ds2_ol/decoder/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +set(bin_name ctc-prefix-beam-search-decoder-ol) +add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) +target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + + +set(bin_name wfst-decoder-ol) +add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) +target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder ${DEPS}) + + +set(bin_name 
nnet-logprob-decoder-test) +add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) +target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + diff --git a/speechx/examples/ds2_ol/decoder/README.md b/speechx/examples/ds2_ol/decoder/README.md new file mode 100644 index 00000000..be5b0f9b --- /dev/null +++ b/speechx/examples/ds2_ol/decoder/README.md @@ -0,0 +1,12 @@ +# ASR Decoder + +ASR Decoder test bins. We use these bins to test the CTC BeamSearch decoder and the WFST decoder. + +* decoder_test_main.cc +feed nnet output logprob, and only test the decoder + +* offline_decoder_sliding_chunk_main.cc +feed streaming audio feature, decode in a streaming manner. + +* offline_wfst_decoder_main.cc +feed streaming audio feature, decode using WFST in a streaming manner. \ No newline at end of file diff --git a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc b/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc similarity index 94% rename from speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc rename to speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc index 40092de3..20034efc 100644 --- a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc +++ b/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc @@ -34,10 +34,11 @@ DEFINE_int32(receptive_field_length, DEFINE_int32(downsampling_rate, 4, "two CNN(kernel=5) module downsampling rate."); +DEFINE_string(model_input_names, + "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", + "model input names"); DEFINE_string(model_output_names, - "save_infer_model/scale_0.tmp_1,save_infer_model/" - "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/" - "scale_3.tmp_1", + "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", "model output names"); 
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names"); @@ -50,9 +51,13 @@ int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + CHECK(FLAGS_result_wspecifier != ""); + CHECK(FLAGS_feature_rspecifier != ""); + kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); + std::string model_graph = FLAGS_model_path; std::string model_params = FLAGS_param_path; std::string dict_file = FLAGS_dict_file; @@ -73,6 +78,7 @@ int main(int argc, char* argv[]) { model_opts.model_path = model_graph; model_opts.params_path = model_params; model_opts.cache_shape = FLAGS_model_cache_names; + model_opts.input_names = FLAGS_model_input_names; model_opts.output_names = FLAGS_model_output_names; std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); diff --git a/speechx/examples/decoder/local/model.sh b/speechx/examples/ds2_ol/decoder/local/model.sh similarity index 100% rename from speechx/examples/decoder/local/model.sh rename to speechx/examples/ds2_ol/decoder/local/model.sh diff --git a/speechx/examples/decoder/decoder_test_main.cc b/speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc similarity index 100% rename from speechx/examples/decoder/decoder_test_main.cc rename to speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc diff --git a/speechx/examples/aishell/path.sh b/speechx/examples/ds2_ol/decoder/path.sh similarity index 79% rename from speechx/examples/aishell/path.sh rename to speechx/examples/ds2_ol/decoder/path.sh index a0e7c9ae..8e26e6e7 100644 --- a/speechx/examples/aishell/path.sh +++ b/speechx/examples/ds2_ol/decoder/path.sh @@ -1,6 +1,6 @@ # This contains the locations of binarys build required for running the examples. -SPEECHX_ROOT=$PWD/../.. 
+SPEECHX_ROOT=$PWD/../../../ SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples SPEECHX_TOOLS=$SPEECHX_ROOT/tools @@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C -SPEECHX_BIN=$SPEECHX_EXAMPLES/decoder:$SPEECHX_EXAMPLES/feat +SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN diff --git a/speechx/examples/ds2_ol/decoder/run.sh b/speechx/examples/ds2_ol/decoder/run.sh new file mode 100755 index 00000000..de76e979 --- /dev/null +++ b/speechx/examples/ds2_ol/decoder/run.sh @@ -0,0 +1,79 @@ +#!/bin/bash +set +x +set -e + +. path.sh + +# 1. compile +if [ ! -d ${SPEECHX_EXAMPLES} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + +# input +mkdir -p data +data=$PWD/data +ckpt_dir=$data/model +model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ +vocb_dir=$ckpt_dir/data/lang_char/ + +lm=$data/zh_giga.no_cna_cmn.prune01244.klm + +# output +exp_dir=./exp +mkdir -p $exp_dir + +# 2. download model +if [[ ! -f data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]]; then + mkdir -p data/model + pushd data/model + wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz + tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz + popd +fi + +# produce wav scp +if [ ! -f data/wav.scp ]; then + pushd data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd +fi + +# download lm +if [ ! -f $lm ]; then + pushd data + wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm + popd +fi + + +feat_wspecifier=$exp_dir/feats.ark +cmvn=$exp_dir/cmvn.ark + +export GLOG_logtostderr=1 + +# dump json cmvn to kaldi +cmvn-json2kaldi \ + --json_file $ckpt_dir/data/mean_std.json \ + --cmvn_write_path $exp_dir/cmvn.ark \ + --binary=false +echo "convert json cmvn to kaldi ark." 
+ + +# generate linear feature as streaming +linear-spectrogram-wo-db-norm-ol \ + --wav_rspecifier=scp:$data/wav.scp \ + --feature_wspecifier=ark,t:$feat_wspecifier \ + --cmvn_file=$exp_dir/cmvn.ark +echo "compute linear spectrogram feature." + +# run ctc beam search decoder as streaming +ctc-prefix-beam-search-decoder-ol \ + --result_wspecifier=ark,t:$exp_dir/result.txt \ + --feature_rspecifier=ark:$feat_wspecifier \ + --model_path=$model_dir/avg_1.jit.pdmodel \ + --param_path=$model_dir/avg_1.jit.pdiparams \ + --dict_file=$vocb_dir/vocab.txt \ + --lm_path=$lm \ No newline at end of file diff --git a/speechx/examples/decoder/valgrind.sh b/speechx/examples/ds2_ol/decoder/valgrind.sh similarity index 100% rename from speechx/examples/decoder/valgrind.sh rename to speechx/examples/ds2_ol/decoder/valgrind.sh diff --git a/speechx/examples/decoder/offline_wfst_decoder_main.cc b/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc similarity index 99% rename from speechx/examples/decoder/offline_wfst_decoder_main.cc rename to speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc index 06460a45..dc4030b6 100644 --- a/speechx/examples/decoder/offline_wfst_decoder_main.cc +++ b/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc @@ -28,6 +28,7 @@ DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); DEFINE_string(graph_path, "TLG", "decoder graph"); + DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); DEFINE_int32(max_active, 7500, "decoder graph"); DEFINE_int32(receptive_field_length, diff --git a/speechx/examples/ds2_ol/feat/.gitignore b/speechx/examples/ds2_ol/feat/.gitignore new file mode 100644 index 00000000..566f2d97 --- /dev/null +++ b/speechx/examples/ds2_ol/feat/.gitignore @@ -0,0 +1,2 @@ +exp +data diff --git a/speechx/examples/ds2_ol/feat/CMakeLists.txt b/speechx/examples/ds2_ol/feat/CMakeLists.txt new file 
mode 100644 index 00000000..90cb9c8e --- /dev/null +++ b/speechx/examples/ds2_ol/feat/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +set(bin_name linear-spectrogram-wo-db-norm-ol) +add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) +target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(${bin_name} frontend kaldi-util kaldi-feat-common gflags glog) + + +set(bin_name cmvn-json2kaldi) +add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) +target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog) diff --git a/speechx/examples/ds2_ol/feat/README.md b/speechx/examples/ds2_ol/feat/README.md new file mode 100644 index 00000000..a2d206d8 --- /dev/null +++ b/speechx/examples/ds2_ol/feat/README.md @@ -0,0 +1,8 @@ +# Deepspeech2 Streaming Audio Feature + +ASR audio feature test bins. We use these bins to test linear/fbank/mfcc ASR features in a streaming manner. + +* linear_spectrogram_without_db_norm_main.cc + +compute linear spectrogram w/o db norm in a streaming manner. + diff --git a/speechx/examples/feat/cmvn_json2binary_main.cc b/speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc similarity index 72% rename from speechx/examples/feat/cmvn_json2binary_main.cc rename to speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc index e77f983a..e90b8ee4 100644 --- a/speechx/examples/feat/cmvn_json2binary_main.cc +++ b/speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Note: Do not print/log ondemand object. 
+ #include "base/flags.h" #include "base/log.h" #include "kaldi/matrix/kaldi-matrix.h" @@ -28,23 +30,30 @@ using namespace simdjson; int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + + LOG(INFO) << "cmvn josn path: " << FLAGS_json_file ; + padded_string json = padded_string::load(FLAGS_json_file); ondemand::parser parser; - padded_string json = padded_string::load(FLAGS_json_file); - ondemand::document val = parser.iterate(json); - ondemand::object doc = val; - kaldi::int32 frame_num = uint64_t(doc["frame_num"]); - auto mean_stat = doc["mean_stat"]; + ondemand::document doc = parser.iterate(json); + ondemand::value val = doc; + + ondemand::array mean_stat = val["mean_stat"]; std::vector mean_stat_vec; for (double x : mean_stat) { mean_stat_vec.push_back(x); } - auto var_stat = doc["var_stat"]; + // LOG(INFO) << mean_stat; this line will casue simdjson::simdjson_error("Objects and arrays can only be iterated when they are first encountered") + + ondemand::array var_stat = val["var_stat"]; std::vector var_stat_vec; for (double x : var_stat) { var_stat_vec.push_back(x); } + kaldi::int32 frame_num = uint64_t(val["frame_num"]); + LOG(INFO) << "nframe: " << frame_num; + size_t mean_size = mean_stat_vec.size(); kaldi::Matrix cmvn_stats(2, mean_size + 1); for (size_t idx = 0; idx < mean_size; ++idx) { @@ -52,7 +61,10 @@ int main(int argc, char* argv[]) { cmvn_stats(1, idx) = var_stat_vec[idx]; } cmvn_stats(0, mean_size) = frame_num; + LOG(INFO) << cmvn_stats; + kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary); - LOG(INFO) << "the json file have write into " << FLAGS_cmvn_write_path; + LOG(INFO) << "cmvn stats have write into: " << FLAGS_cmvn_write_path; + LOG(INFO) << "Binary: " << FLAGS_binary; return 0; } \ No newline at end of file diff --git a/speechx/examples/feat/linear_spectrogram_without_db_norm_main.cc 
b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc similarity index 99% rename from speechx/examples/feat/linear_spectrogram_without_db_norm_main.cc rename to speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc index 5b875a3e..27ca6f9f 100644 --- a/speechx/examples/feat/linear_spectrogram_without_db_norm_main.cc +++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc @@ -32,6 +32,7 @@ DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); + int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); diff --git a/speechx/examples/feat/path.sh b/speechx/examples/ds2_ol/feat/path.sh similarity index 85% rename from speechx/examples/feat/path.sh rename to speechx/examples/ds2_ol/feat/path.sh index 8ab7ee29..ad2b6a4e 100644 --- a/speechx/examples/feat/path.sh +++ b/speechx/examples/ds2_ol/feat/path.sh @@ -1,6 +1,6 @@ # This contains the locations of binarys build required for running the examples. -SPEECHX_ROOT=$PWD/../.. +SPEECHX_ROOT=$PWD/../../../ SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples SPEECHX_TOOLS=$SPEECHX_ROOT/tools @@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C -SPEECHX_BIN=$SPEECHX_EXAMPLES/feat +SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/feat export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN diff --git a/speechx/examples/ds2_ol/feat/run.sh b/speechx/examples/ds2_ol/feat/run.sh new file mode 100755 index 00000000..75777927 --- /dev/null +++ b/speechx/examples/ds2_ol/feat/run.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set +x +set -e + +. ./path.sh + +# 1. compile +if [ ! -d ${SPEECHX_EXAMPLES} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + +# 2. download model +if [ ! 
-e data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]; then + mkdir -p data/model + pushd data/model + wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz + tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz + popd +fi + +# produce wav scp +if [ ! -f data/wav.scp ]; then + mkdir -p data + pushd data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd +fi + + +# input +data_dir=./data +exp_dir=./exp +model_dir=$data_dir/model/ + +mkdir -p $exp_dir + + +# 3. run feat +export GLOG_logtostderr=1 + +cmvn-json2kaldi \ + --json_file $model_dir/data/mean_std.json \ + --cmvn_write_path $exp_dir/cmvn.ark \ + --binary=false +echo "convert json cmvn to kaldi ark." + + +linear-spectrogram-wo-db-norm-ol \ + --wav_rspecifier=scp:$data_dir/wav.scp \ + --feature_wspecifier=ark,t:$exp_dir/feats.ark \ + --cmvn_file=$exp_dir/cmvn.ark +echo "compute linear spectrogram feature." 
+ + diff --git a/speechx/examples/feat/valgrind.sh b/speechx/examples/ds2_ol/feat/valgrind.sh similarity index 100% rename from speechx/examples/feat/valgrind.sh rename to speechx/examples/ds2_ol/feat/valgrind.sh diff --git a/speechx/examples/ds2_ol/nnet/.gitignore b/speechx/examples/ds2_ol/nnet/.gitignore new file mode 100644 index 00000000..bbd86a25 --- /dev/null +++ b/speechx/examples/ds2_ol/nnet/.gitignore @@ -0,0 +1,2 @@ +data +exp diff --git a/speechx/examples/ds2_ol/nnet/CMakeLists.txt b/speechx/examples/ds2_ol/nnet/CMakeLists.txt new file mode 100644 index 00000000..6745a51a --- /dev/null +++ b/speechx/examples/ds2_ol/nnet/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +set(bin_name ds2-model-ol-test) +add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) +target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(${bin_name} PUBLIC nnet gflags glog ${DEPS}) \ No newline at end of file diff --git a/speechx/examples/ds2_ol/nnet/README.md b/speechx/examples/ds2_ol/nnet/README.md new file mode 100644 index 00000000..2bca0745 --- /dev/null +++ b/speechx/examples/ds2_ol/nnet/README.md @@ -0,0 +1,3 @@ +# Deepspeech2 Streaming NNet Test + +Used for the ds2 streaming nnet inference test. \ No newline at end of file diff --git a/speechx/examples/nnet/pp-model-test.cc b/speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc similarity index 90% rename from speechx/examples/nnet/pp-model-test.cc rename to speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc index 2db354a7..752b6dcf 100644 --- a/speechx/examples/nnet/pp-model-test.cc +++ b/speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include +// deepspeech2 online model info + +#include "base/flags.h" +#include "base/log.h" #include #include #include @@ -25,16 +28,19 @@ using std::cout; using std::endl; -DEFINE_string(model_path, "avg_1.jit.pdmodel", "xxx.pdmodel"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "xxx.pdiparams"); + +DEFINE_string(model_path, "", "xxx.pdmodel"); +DEFINE_string(param_path, "", "xxx.pdiparams"); +DEFINE_int32(chunk_size, 35, "feature chunk size, unit:frame"); +DEFINE_int32(feat_dim, 161, "feature dim"); void produce_data(std::vector>* data); void model_forward_test(); void produce_data(std::vector>* data) { - int chunk_size = 35; // chunk_size in frame - int col_size = 161; // feat dim + int chunk_size = FLAGS_chunk_size; // chunk_size in frame + int col_size = FLAGS_feat_dim; // feat dim cout << "chunk size: " << chunk_size << endl; cout << "feat dim: " << col_size << endl; @@ -57,6 +63,8 @@ void model_forward_test() { ; std::string model_graph = FLAGS_model_path; std::string model_params = FLAGS_param_path; + CHECK(model_graph != ""); + CHECK(model_params != ""); cout << "model path: " << model_graph << endl; cout << "model param path : " << model_params << endl; @@ -106,7 +114,7 @@ void model_forward_test() { // state_h std::unique_ptr chunk_state_h_box = predictor->GetInputHandle(input_names[2]); - std::vector chunk_state_h_box_shape = {3, 1, 1024}; + std::vector chunk_state_h_box_shape = {5, 1, 1024}; chunk_state_h_box->Reshape(chunk_state_h_box_shape); int chunk_state_h_box_size = std::accumulate(chunk_state_h_box_shape.begin(), @@ -119,7 +127,7 @@ void model_forward_test() { // state_c std::unique_ptr chunk_state_c_box = predictor->GetInputHandle(input_names[3]); - std::vector chunk_state_c_box_shape = {3, 1, 1024}; + std::vector chunk_state_c_box_shape = {5, 1, 1024}; chunk_state_c_box->Reshape(chunk_state_c_box_shape); int chunk_state_c_box_size = std::accumulate(chunk_state_c_box_shape.begin(), @@ -187,7 +195,9 @@ void model_forward_test() { } int 
main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + model_forward_test(); return 0; } diff --git a/speechx/examples/glog/path.sh b/speechx/examples/ds2_ol/nnet/path.sh similarity index 85% rename from speechx/examples/glog/path.sh rename to speechx/examples/ds2_ol/nnet/path.sh index e2c7b2fc..0ee8b478 100644 --- a/speechx/examples/glog/path.sh +++ b/speechx/examples/ds2_ol/nnet/path.sh @@ -1,6 +1,6 @@ # This contains the locations of binarys build required for running the examples. -SPEECHX_ROOT=$PWD/../.. +SPEECHX_ROOT=$PWD/../../../ SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples SPEECHX_TOOLS=$SPEECHX_ROOT/tools @@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C -SPEECHX_BIN=$SPEECHX_EXAMPLES/glog +SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/nnet export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN diff --git a/speechx/examples/ds2_ol/nnet/run.sh b/speechx/examples/ds2_ol/nnet/run.sh new file mode 100755 index 00000000..10029f7e --- /dev/null +++ b/speechx/examples/ds2_ol/nnet/run.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set +x +set -e + +. path.sh + +# 1. compile +if [ ! -d ${SPEECHX_EXAMPLES} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + +# 2. download model +if [ ! -f data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]; then + mkdir -p data/model + pushd data/model + wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz + tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz + popd +fi + +# produce wav scp +if [ ! 
-f data/wav.scp ]; then + mkdir -p data + pushd data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd +fi + +ckpt_dir=./data/model +model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ + +ds2-model-ol-test \ + --model_path=$model_dir/avg_1.jit.pdmodel \ + --param_path=$model_dir/avg_1.jit.pdiparams + diff --git a/speechx/examples/nnet/valgrind.sh b/speechx/examples/ds2_ol/nnet/valgrind.sh similarity index 100% rename from speechx/examples/nnet/valgrind.sh rename to speechx/examples/ds2_ol/nnet/valgrind.sh diff --git a/speechx/examples/feat/CMakeLists.txt b/speechx/examples/feat/CMakeLists.txt deleted file mode 100644 index d6fdb9bc..00000000 --- a/speechx/examples/feat/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - - -add_executable(mfcc-test ${CMAKE_CURRENT_SOURCE_DIR}/feature-mfcc-test.cc) -target_include_directories(mfcc-test PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(mfcc-test kaldi-mfcc) - -add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc) -target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog) - -add_executable(linear_spectrogram_without_db_norm_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_without_db_norm_main.cc) -target_include_directories(linear_spectrogram_without_db_norm_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(linear_spectrogram_without_db_norm_main frontend kaldi-util kaldi-feat-common gflags glog) - -add_executable(cmvn_json2binary_main ${CMAKE_CURRENT_SOURCE_DIR}/cmvn_json2binary_main.cc) -target_include_directories(cmvn_json2binary_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(cmvn_json2binary_main utils kaldi-util kaldi-matrix gflags glog) diff --git 
a/speechx/examples/feat/feature-mfcc-test.cc b/speechx/examples/feat/feature-mfcc-test.cc deleted file mode 100644 index 48a9e1c2..00000000 --- a/speechx/examples/feat/feature-mfcc-test.cc +++ /dev/null @@ -1,719 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// feat/feature-mfcc-test.cc - -// Copyright 2009-2011 Karel Vesely; Petr Motlicek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- - -#include - -#include "base/kaldi-math.h" -#include "feat/feature-mfcc.h" -#include "feat/wave-reader.h" -#include "matrix/kaldi-matrix-inl.h" - -using namespace kaldi; - -static void UnitTestReadWave() { - std::cout << "=== UnitTestReadWave() ===\n"; - - Vector v, v2; - - std::cout << "<<<=== Reading waveform\n"; - - { - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - const Matrix data(wave.Data()); - KALDI_ASSERT(data.NumRows() == 1); - v.Resize(data.NumCols()); - v.CopyFromVec(data.Row(0)); - } - - std::cout - << "<<<=== Reading Vector waveform, prepared by matlab\n"; - std::ifstream input("test_data/test_matlab.ascii"); - KALDI_ASSERT(input.good()); - v2.Read(input, false); - input.close(); - - std::cout - << "<<<=== Comparing freshly read waveform to 'libsndfile' waveform\n"; - KALDI_ASSERT(v.Dim() == v2.Dim()); - for (int32 i = 0; i < v.Dim(); i++) { - KALDI_ASSERT(v(i) == v2(i)); - } - std::cout << "<<<=== Comparing done\n"; - - // std::cout << "== The Waveform Samples == \n"; - // std::cout << v; - - std::cout << "Test passed :)\n\n"; -} - - -/** - */ -static void UnitTestSimple() { - std::cout << "=== UnitTestSimple() ===\n"; - - Vector v(100000); - Matrix m; - - // init with noise - for (int32 i = 0; i < v.Dim(); i++) { - v(i) = (abs(i * 433024253) % 65535) - (65535 / 2); - } - - std::cout << "<<<=== Just make sure it runs... Nothing is compared\n"; - // the parametrization object - MfccOptions op; - // trying to have same opts as baseline. - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "rectangular"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.mel_opts.htk_mode = true; - op.htk_compat = true; - - Mfcc mfcc(op); - // use default parameters - - // compute mfccs. 
- mfcc.Compute(v, 1.0, &m); - - // possibly dump - // std::cout << "== Output features == \n" << m; - std::cout << "Test passed :)\n\n"; -} - - -static void UnitTestHTKCompare1() { - std::cout << "=== UnitTestHTKCompare1() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.1", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.mel_opts.htk_mode = true; - op.htk_compat = true; - op.use_energy = false; // C0 not energy. - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
- // print the non-matching data only once per-line - if (i_old != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " - << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " - << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - } - } - } - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float) * kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.1", - std::ios::out | std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.1"); -} - - -static void UnitTestHTKCompare2() { - std::cout << "=== UnitTestHTKCompare2() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.2", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.mel_opts.htk_mode = true; - op.htk_compat = true; - op.use_energy = true; // Use energy. 
- - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! - // print the non-matching data only once per-line - if (i_old != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " - << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " - << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - } - } - } - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float) * kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.2", - std::ios::out | std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.2"); -} - - -static void UnitTestHTKCompare3() { - std::cout << "=== UnitTestHTKCompare3() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - 
std::ifstream is("test_data/test.wav.fea_htk.3", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.htk_compat = true; - op.use_energy = true; // Use energy. - op.mel_opts.low_freq = 20.0; - // op.mel_opts.debug_mel = true; - op.mel_opts.htk_mode = true; - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
- // print the non-matching data only once per-line - if (static_cast(i_old) != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " - << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " - << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - } - } - } - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float) * kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.3", - std::ios::out | std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.3"); -} - - -static void UnitTestHTKCompare4() { - std::cout << "=== UnitTestHTKCompare4() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.4", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.htk_compat = true; - op.use_energy = true; // Use energy. 
- op.mel_opts.htk_mode = true; - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! - // print the non-matching data only once per-line - if (static_cast(i_old) != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " - << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " - << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - } - } - } - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float) * kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.4", - std::ios::out | std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.4"); -} - - -static void UnitTestHTKCompare5() { - std::cout << "=== UnitTestHTKCompare5() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK 
features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.5", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.htk_compat = true; - op.use_energy = true; // Use energy. - op.mel_opts.low_freq = 0.0; - op.mel_opts.vtln_low = 100.0; - op.mel_opts.vtln_high = 7500.0; - op.mel_opts.htk_mode = true; - - BaseFloat vtln_warp = - 1.1; // our approach identical to htk for warp factor >1, - // differs slightly for higher mel bins if warp_factor <0.9 - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, vtln_warp, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
- // print the non-matching data only once per-line - if (static_cast(i_old) != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " - << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " - << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - } - } - } - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float) * kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.5", - std::ios::out | std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.5"); -} - -static void UnitTestHTKCompare6() { - std::cout << "=== UnitTestHTKCompare6() ===\n"; - - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.6", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.97; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.num_bins = 24; - op.mel_opts.low_freq = 125.0; - op.mel_opts.high_freq = 7800.0; - op.htk_compat = true; - op.use_energy = false; // C0 not energy. 
- - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! - // print the non-matching data only once per-line - if (static_cast(i_old) != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " - << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " - << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - } - } - } - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float) * kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.6", - std::ios::out | std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.6"); -} - -void UnitTestVtln() { - // Test the function VtlnWarpFreq. 
- BaseFloat low_freq = 10, high_freq = 7800, vtln_low_cutoff = 20, - vtln_high_cutoff = 7400; - - for (size_t i = 0; i < 100; i++) { - BaseFloat freq = 5000, warp_factor = 0.9 + RandUniform() * 0.2; - AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, - vtln_high_cutoff, - low_freq, - high_freq, - warp_factor, - freq), - freq / warp_factor); - - AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, - vtln_high_cutoff, - low_freq, - high_freq, - warp_factor, - low_freq), - low_freq); - AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, - vtln_high_cutoff, - low_freq, - high_freq, - warp_factor, - high_freq), - high_freq); - BaseFloat freq2 = low_freq + (high_freq - low_freq) * RandUniform(), - freq3 = freq2 + - (high_freq - freq2) * RandUniform(); // freq3>=freq2 - BaseFloat w2 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, - vtln_high_cutoff, - low_freq, - high_freq, - warp_factor, - freq2); - BaseFloat w3 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, - vtln_high_cutoff, - low_freq, - high_freq, - warp_factor, - freq3); - KALDI_ASSERT(w3 >= w2); // increasing function. - BaseFloat w3dash = MelBanks::VtlnWarpFreq( - vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, 1.0, freq3); - AssertEqual(w3dash, freq3); - } -} - -static void UnitTestFeat() { - UnitTestVtln(); - UnitTestReadWave(); - UnitTestSimple(); - UnitTestHTKCompare1(); - UnitTestHTKCompare2(); - // commenting out this one as it doesn't compare right now I normalized - // the way the FFT bins are treated (removed offset of 0.5)... this seems - // to relate to the way frequency zero behaves. 
- UnitTestHTKCompare3(); - UnitTestHTKCompare4(); - UnitTestHTKCompare5(); - UnitTestHTKCompare6(); - std::cout << "Tests succeeded.\n"; -} - - -int main() { - try { - for (int i = 0; i < 5; i++) UnitTestFeat(); - std::cout << "Tests succeeded.\n"; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return 1; - } -} diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc deleted file mode 100644 index 2e70386d..00000000 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ /dev/null @@ -1,270 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// todo refactor, repalce with gtest - -#include "base/flags.h" -#include "base/log.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" -#include "kaldi/util/table-types.h" - -#include "frontend/audio/audio_cache.h" -#include "frontend/audio/data_cache.h" -#include "frontend/audio/feature_cache.h" -#include "frontend/audio/frontend_itf.h" -#include "frontend/audio/linear_spectrogram.h" -#include "frontend/audio/normalizer.h" - -DEFINE_string(wav_rspecifier, "", "test wav scp path"); -DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); -DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn"); -DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); - - -std::vector mean_{ - -13730251.531853663, -12982852.199316509, -13673844.299583456, - -13089406.559646806, -12673095.524938712, -12823859.223276224, - -13590267.158903603, -14257618.467152044, -14374605.116185192, - -14490009.21822485, -14849827.158924166, -15354435.470563512, - -15834149.206532761, -16172971.985514281, -16348740.496746974, - -16423536.699409386, -16556246.263649225, -16744088.772748645, - -16916184.08510357, -17054034.840031497, -17165612.509455364, - -17255955.470915023, -17322572.527648456, -17408943.862033736, - -17521554.799865916, -17620623.254924215, -17699792.395918526, - -17723364.411134344, -17741483.4433254, -17747426.888704527, - -17733315.928209435, -17748780.160905756, -17808336.883775543, - -17895918.671983004, -18009812.59173023, -18098188.66548325, - -18195798.958462656, -18293617.62980999, -18397432.92077201, - -18505834.787318766, -18585451.8100908, -18652438.235649142, - -18700960.306275308, -18734944.58792185, -18737426.313365128, - -18735347.165987637, -18738813.444170244, -18737086.848890636, - -18731576.2474336, -18717405.44095871, -18703089.25545657, - -18691014.546456724, -18692460.568905357, -18702119.628629155, - -18727710.621126678, -18761582.72034647, -18806745.835547544, - -18850674.8692112, -18884431.510951452, 
-18919999.992506847, - -18939303.799078144, -18952946.273760635, -18980289.22996379, - -19011610.17803294, -19040948.61805145, -19061021.429847397, - -19112055.53768819, -19149667.414264943, -19201127.05091321, - -19270250.82564605, -19334606.883057203, -19390513.336589377, - -19444176.259208687, -19502755.000038862, -19544333.014549147, - -19612668.183176614, -19681902.19006569, -19771969.951249883, - -19873329.723376893, -19996752.59235844, -20110031.131400537, - -20231658.612529557, -20319378.894054495, -20378534.45718066, - -20413332.089584175, -20438147.844177883, -20443710.248040095, - -20465457.02238927, -20488610.969337028, -20516295.16424432, - -20541423.795738827, -20553192.874953747, -20573605.50701977, - -20577871.61936797, -20571807.008916274, -20556242.38912231, - -20542199.30819195, -20521239.063551214, -20519150.80004532, - -20527204.80248933, -20536933.769257784, -20543470.522332076, - -20549700.089992985, -20551525.24958494, -20554873.406493705, - -20564277.65794227, -20572211.740052115, -20574305.69550465, - -20575494.450104576, -20567092.577932164, -20549302.929608088, - -20545445.11878376, -20546625.326603737, -20549190.03499401, - -20554824.947828256, -20568341.378989458, -20577582.331383612, - -20577980.519402675, -20566603.03458152, -20560131.592262644, - -20552166.469060015, -20549063.06763577, -20544490.562339947, - -20539817.82346569, -20528747.715731595, -20518026.24576161, - -20510977.844974525, -20506874.36087992, -20506731.11977665, - -20510482.133420516, -20507760.92101862, -20494644.834457114, - -20480107.89304893, -20461312.091867123, -20442941.75080173, - -20426123.02834838, -20424607.675283, -20426810.369107097, - -20434024.50097819, -20437404.75544205, -20447688.63916367, - -20460893.335563846, -20482922.735127095, -20503610.119434915, - -20527062.76448319, -20557830.035128627, -20593274.72068722, - -20632528.452965066, -20673637.471334763, -20733106.97143075, - -20842921.0447562, -21054357.83621519, -21416569.534189366, - 
-21978460.272811692, -22753170.052172784, -23671344.10563395, - -24613499.293358143, -25406477.12230188, -25884377.82156489, - -26049040.62791664, -26996879.104431007}; -std::vector variance_{ - 213747175.10846674, 188395815.34302503, 212706429.10966414, - 199109025.81461075, 189235901.23864496, 194901336.53253657, - 217481594.29306737, 238689869.12327808, 243977501.24115244, - 248479623.6431067, 259766741.47116545, 275516766.7790273, - 291271202.3691234, 302693239.8220509, 308627358.3997694, - 311143911.38788426, 315446105.07731867, 321705430.9341829, - 327458907.4659941, 332245072.43223983, 336251717.5935284, - 339694069.7639722, 342188204.4322228, 345587110.31313115, - 349903086.2875232, 353660214.20643026, 356700344.5270885, - 357665362.3529641, 358493352.05658793, 358857951.620328, - 358375239.52774596, 358899733.6342954, 361051818.3511561, - 364361716.05025816, 368750322.3771452, 372047800.6462831, - 375655861.1349018, 379358519.1980013, 383327605.3935181, - 387458599.282341, 390434692.3406868, 392994486.35057056, - 394874418.04603153, 396230525.79763395, 396365592.0414835, - 396334819.8242737, 396488353.19250053, 396438877.00744957, - 396197980.4459586, 395590921.6672991, 395001107.62072515, - 394528291.7318225, 394593110.424006, 395018405.59353715, - 396110577.5415993, 397506704.0371068, 399400197.4657644, - 401243568.2468382, 402687134.7805103, 404136047.2872507, - 404883170.001883, 405522253.219517, 406660365.3626476, - 407919346.0991902, 409045348.5384909, 409759588.7889818, - 411974821.8564483, 413489718.78201455, 415535392.56684107, - 418466481.97674364, 421104678.35678065, 423405392.5200779, - 425550570.40798235, 427929423.9579701, 429585274.253478, - 432368493.55181056, 435193587.13513297, 438886855.20476013, - 443058876.8633751, 448181232.5093362, 452883835.6332396, - 458056721.77926534, 461816531.22735566, 464363620.1970998, - 465886343.5057493, 466928872.0651, 467180536.42647296, - 468111848.70714295, 469138695.3071312, 470378429.6930793, - 
471517958.7132626, 472109050.4262365, 473087417.0177867, - 473381322.04648733, 473220195.85483915, 472666071.8998819, - 472124669.87879956, 471298571.411737, 471251033.2902761, - 471672676.43128747, 472177147.2193172, 472572361.7711908, - 472968783.7751127, 473156295.4164052, 473398034.82676554, - 473897703.5203811, 474328271.33112127, 474452670.98002136, - 474549003.99284613, 474252887.13567275, 473557462.909069, - 473483385.85193115, 473609738.04855174, 473746944.82085115, - 474016729.91696435, 474617321.94138587, 475045097.237122, - 475125402.586558, 474664112.9824912, 474426247.5800283, - 474104075.42796475, 473978219.7273978, 473773171.7798875, - 473578534.69508696, 473102924.16904145, 472651240.5232615, - 472374383.1810912, 472209479.6956096, 472202298.8921673, - 472370090.76781124, 472220933.99374026, 471625467.37106377, - 470994646.51883453, 470182428.9637543, 469348211.5939578, - 468570387.4467277, 468540442.7225135, 468672018.90414184, - 468994346.9533251, 469138757.58201426, 469553915.95710236, - 470134523.38582784, 471082421.62055486, 471962316.51804745, - 472939745.1708408, 474250621.5944825, 475773933.43199486, - 477465399.71087736, 479218782.61382693, 481752299.7930922, - 486608947.8984568, 496119403.2067917, 512730085.5704984, - 539048915.2641417, 576285298.3548826, 621610270.2240586, - 669308196.4436442, 710656993.5957186, 736344437.3725077, - 745481288.0241544, 801121432.9925804}; -int count_ = 912592; - -void WriteMatrix() { - kaldi::Matrix cmvn_stats(2, mean_.size() + 1); - for (size_t idx = 0; idx < mean_.size(); ++idx) { - cmvn_stats(0, idx) = mean_[idx]; - cmvn_stats(1, idx) = variance_[idx]; - } - cmvn_stats(0, mean_.size()) = count_; - kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false); -} - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - kaldi::SequentialTableReader wav_reader( - FLAGS_wav_rspecifier); - kaldi::BaseFloatMatrixWriter 
feat_writer(FLAGS_feature_wspecifier); - WriteMatrix(); - - - int32 num_done = 0, num_err = 0; - - // feature pipeline: wave cache --> decibel_normalizer --> hanning - // window -->linear_spectrogram --> global cmvn -> feat cache - - // std::unique_ptr data_source(new - // ppspeech::DataCache()); - std::unique_ptr data_source( - new ppspeech::AudioCache()); - - ppspeech::DecibelNormalizerOptions db_norm_opt; - std::unique_ptr db_norm( - new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); - - ppspeech::LinearSpectrogramOptions opt; - opt.frame_opts.frame_length_ms = 20; - opt.frame_opts.frame_shift_ms = 10; - opt.streaming_chunk = FLAGS_streaming_chunk; - opt.frame_opts.dither = 0.0; - opt.frame_opts.remove_dc_offset = false; - opt.frame_opts.window_type = "hanning"; - opt.frame_opts.preemph_coeff = 0.0; - LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; - LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; - - std::unique_ptr linear_spectrogram( - new ppspeech::LinearSpectrogram(opt, std::move(db_norm))); - - std::unique_ptr cmvn(new ppspeech::CMVN( - FLAGS_cmvn_write_path, std::move(linear_spectrogram))); - - ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn)); - LOG(INFO) << "feat dim: " << feature_cache.Dim(); - - int sample_rate = 16000; - float streaming_chunk = FLAGS_streaming_chunk; - int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; - LOG(INFO) << "chunk size (s): " << streaming_chunk; - LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - - - for (; !wav_reader.Done(); wav_reader.Next()) { - std::string utt = wav_reader.Key(); - const kaldi::WaveData& wave_data = wav_reader.Value(); - LOG(INFO) << "process utt: " << utt; - - int32 this_channel = 0; - kaldi::SubVector waveform(wave_data.Data(), - this_channel); - int tot_samples = waveform.Dim(); - LOG(INFO) << "wav len (sample): " << tot_samples; - - int sample_offset = 0; - std::vector> 
feats; - int feature_rows = 0; - while (sample_offset < tot_samples) { - int cur_chunk_size = - std::min(chunk_sample_size, tot_samples - sample_offset); - - kaldi::Vector wav_chunk(cur_chunk_size); - for (int i = 0; i < cur_chunk_size; ++i) { - wav_chunk(i) = waveform(sample_offset + i); - } - - kaldi::Vector features; - feature_cache.Accept(wav_chunk); - if (cur_chunk_size < chunk_sample_size) { - feature_cache.SetFinished(); - } - feature_cache.Read(&features); - if (features.Dim() == 0) break; - - feats.push_back(features); - sample_offset += cur_chunk_size; - feature_rows += features.Dim() / feature_cache.Dim(); - } - - int cur_idx = 0; - kaldi::Matrix features(feature_rows, - feature_cache.Dim()); - for (auto feat : feats) { - int num_rows = feat.Dim() / feature_cache.Dim(); - for (int row_idx = 0; row_idx < num_rows; ++row_idx) { - for (size_t col_idx = 0; col_idx < feature_cache.Dim(); - ++col_idx) { - features(cur_idx, col_idx) = - feat(row_idx * feature_cache.Dim() + col_idx); - } - ++cur_idx; - } - } - feat_writer.Write(utt, features); - feature_cache.Reset(); - - if (num_done % 50 == 0 && num_done != 0) - KALDI_VLOG(2) << "Processed " << num_done << " utterances"; - num_done++; - } - KALDI_LOG << "Done " << num_done << " utterances, " << num_err - << " with errors."; - return (num_done != 0 ? 0 : 1); -} diff --git a/speechx/examples/feat/run.sh b/speechx/examples/feat/run.sh deleted file mode 100755 index 29c49d32..00000000 --- a/speechx/examples/feat/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -set +x -set -e - -. ./path.sh - -# 1. compile -if [ ! -d ${SPEECHX_EXAMPLES} ]; then - pushd ${SPEECHX_ROOT} - bash build.sh - popd -fi - -# 2. download model -if [ ! 
-d ../paddle_asr_model ]; then - wget https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/paddle_asr_model.tar.gz - tar xzfv paddle_asr_model.tar.gz - mv ./paddle_asr_model ../ - # produce wav scp - echo "utt1 " $PWD/../paddle_asr_model/BAC009S0764W0290.wav > ../paddle_asr_model/wav.scp -fi - -model_dir=../paddle_asr_model -feat_wspecifier=./feats.ark -cmvn=./cmvn.ark - -# 3. run feat -export GLOG_logtostderr=1 -linear_spectrogram_main \ - --wav_rspecifier=scp:$model_dir/wav.scp \ - --feature_wspecifier=ark,t:$feat_wspecifier \ - --cmvn_write_path=$cmvn diff --git a/speechx/examples/ngram/.gitignore b/speechx/examples/ngram/.gitignore new file mode 100644 index 00000000..bbd86a25 --- /dev/null +++ b/speechx/examples/ngram/.gitignore @@ -0,0 +1,2 @@ +data +exp diff --git a/speechx/examples/ngram/README.md b/speechx/examples/ngram/README.md new file mode 100644 index 00000000..c830663c --- /dev/null +++ b/speechx/examples/ngram/README.md @@ -0,0 +1,3 @@ +# NGram Train + + diff --git a/speechx/examples/nnet/CMakeLists.txt b/speechx/examples/nnet/CMakeLists.txt deleted file mode 100644 index 20f4008c..00000000 --- a/speechx/examples/nnet/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - -add_executable(pp-model-test ${CMAKE_CURRENT_SOURCE_DIR}/pp-model-test.cc) -target_include_directories(pp-model-test PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(pp-model-test PUBLIC nnet gflags ${DEPS}) \ No newline at end of file diff --git a/speechx/examples/nnet/run.sh b/speechx/examples/nnet/run.sh deleted file mode 100755 index 4d67d198..00000000 --- a/speechx/examples/nnet/run.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -set +x -set -e - -. path.sh - -# 1. compile -if [ ! -d ${SPEECHX_EXAMPLES} ]; then - pushd ${SPEECHX_ROOT} - bash build.sh - popd -fi - -# 2. download model -if [ ! 
-d ../paddle_asr_model ]; then - wget https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/paddle_asr_model.tar.gz - tar xzfv paddle_asr_model.tar.gz - mv ./paddle_asr_model ../ - # produce wav scp - echo "utt1 " $PWD/../paddle_asr_model/BAC009S0764W0290.wav > ../paddle_asr_model/wav.scp -fi - -model_dir=../paddle_asr_model - -# 4. run decoder -pp-model-test \ - --model_path=$model_dir/avg_1.jit.pdmodel \ - --param_path=$model_dir/avg_1.jit.pdparams - diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/paddle_nnet.h index 906994d0..9d954809 100644 --- a/speechx/speechx/nnet/paddle_nnet.h +++ b/speechx/speechx/nnet/paddle_nnet.h @@ -45,8 +45,7 @@ struct ModelOptions { thread_num(2), use_gpu(false), input_names( - "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_" - "box"), + "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box"), output_names( "save_infer_model/scale_0.tmp_1,save_infer_model/" "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"