[speechx] rm openblas && refactor kaldi-matrix, kaldi-vector (#2824)

* rm openblas && refactor kaldi-matrix kaldi-vector
pull/2854/head
YangZhou 2 years ago committed by GitHub
parent c1b1ae0515
commit ee7c266f13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -53,9 +53,6 @@ include(gflags)
include(glog)
#openblas
include(openblas)
# openfst
include(openfst)
add_dependencies(openfst gflags glog)

@ -14,7 +14,7 @@
#include "decoder/ctc_prefix_beam_search_decoder.h"
#include "base/common.h"
#include "frontend/audio/data_cache.h"
#include "frontend/data_cache.h"
#include "fst/symbol-table.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"

@ -14,7 +14,7 @@
#include "base/common.h"
#include "kaldi/decoder/decodable-itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "matrix/kaldi-matrix.h"
#include "nnet/nnet_itf.h"
#include "nnet/nnet_producer.h"

@ -15,7 +15,6 @@
#include "base/basic_types.h"
#include "kaldi/base/kaldi-types.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
DECLARE_int32(subsampling_rate);

@ -13,10 +13,10 @@
// limitations under the License.
#include "nnet/nnet_producer.h"
#include "matrix/kaldi-matrix.h"
namespace ppspeech {
using kaldi::Vector;
using std::vector;
using kaldi::BaseFloat;

@ -16,7 +16,7 @@
#include "base/common.h"
#include "base/safe_queue.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
#include "nnet/nnet_itf.h"
namespace ppspeech {

@ -18,7 +18,7 @@
#pragma once
#include "base/common.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "matrix/kaldi-matrix.h"
#include "nnet/nnet_itf.h"
#include "paddle/extension.h"
#include "paddle/jit/all.h"

@ -15,8 +15,8 @@
#include "base/common.h"
#include "decoder/param.h"
#include "frontend/audio/assembler.h"
#include "frontend/audio/data_cache.h"
#include "frontend/assembler.h"
#include "frontend/data_cache.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/u2_nnet.h"

@ -15,7 +15,7 @@ set(TEST_BINS
foreach(bin_name IN LISTS TEST_BINS)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-feat-common)
target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util)
target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})

@ -18,7 +18,7 @@
#include "decoder/ctc_beam_search_opt.h"
#include "decoder/ctc_prefix_beam_search_decoder.h"
#include "decoder/decoder_itf.h"
#include "frontend/audio/feature_pipeline.h"
#include "frontend/feature_pipeline.h"
#include "fst/fstlib.h"
#include "fst/symbol-table.h"
#include "nnet/decodable.h"

@ -13,7 +13,7 @@
// limitations under the License.
#include "decoder/param.h"
#include "kaldi/feat/wave-reader.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/table-types.h"
#include "recognizer/u2_recognizer.h"

@ -14,7 +14,7 @@
#include "recognizer/u2_recognizer.h"
#include "decoder/param.h"
#include "kaldi/feat/wave-reader.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/table-types.h"
DEFINE_string(wav_rspecifier, "", "test feature rspecifier");

@ -4,6 +4,8 @@ ${CMAKE_CURRENT_SOURCE_DIR}/../
)
# Pull the utils/ and matrix/ subdirectories into the build; each one is
# processed through its own CMakeLists.txt.
add_subdirectory(utils)
add_subdirectory(matrix)
# Directory-scoped header search path pointing at frontend/ under this
# directory. It applies to targets defined by this CMakeLists.txt and is
# inherited by subdirectories added after this call (utils/ and matrix/ above
# are added before it).
# NOTE(review): a target-scoped target_include_directories() would be narrower,
# but the consuming targets are not visible from here.
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/frontend
)

@ -1,2 +1,28 @@
# Library target built from the fbank feature-extraction sources listed below.
# No STATIC/SHARED keyword is given, so the library type follows
# BUILD_SHARED_LIBS.
add_library(kaldi-native-fbank-core
    feature-fbank.cc
    feature-functions.cc
    feature-window.cc
    fftsg.c
    mel-computations.cc
    rfft.cc
)
add_subdirectory(audio)
# Static library made of the front-end sources in this directory.
add_library(frontend STATIC
    cmvn.cc
    audio_cache.cc
    feature_cache.cc
    feature_pipeline.cc
    assembler.cc
    wave-reader.cc
)

# PUBLIC: whatever links against frontend also links kaldi-native-fbank-core
# and utils.
target_link_libraries(frontend
    PUBLIC
        kaldi-native-fbank-core
        utils
)
# Command-line tools built from this directory; each entry <name> is compiled
# from <name>.cc located next to this CMakeLists.txt.
set(BINS
    compute_fbank_main
)

foreach(bin_name IN LISTS BINS)
    # Quote the source path so a checkout location containing spaces or
    # semicolons is still passed as a single argument.
    add_executable(${bin_name} "${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc")
    target_include_directories(${bin_name}
        PRIVATE
            ${SPEECHX_ROOT}
            ${SPEECHX_ROOT}/kaldi
    )
    # Executables have no consumers, so their dependencies are PRIVATE; the
    # resulting link line is the same as with PUBLIC.
    target_link_libraries(${bin_name}
        PRIVATE
            frontend
            utils
            kaldi-util
            gflags
            glog
    )
endforeach()

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/assembler.h"
#include "frontend/assembler.h"
namespace ppspeech {

@ -15,7 +15,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
namespace ppspeech {

@ -1,27 +0,0 @@
add_library(kaldi-native-fbank-core
feature-fbank.cc
feature-functions.cc
feature-window.cc
fftsg.c
mel-computations.cc
rfft.cc
)
add_library(frontend STATIC
cmvn.cc
audio_cache.cc
feature_cache.cc
feature_pipeline.cc
assembler.cc
)
target_link_libraries(frontend PUBLIC kaldi-native-fbank-core utils)
set(BINS
compute_fbank_main
)
foreach(bin_name IN LISTS BINS)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(${bin_name} PUBLIC frontend utils kaldi-util gflags glog kaldi-feat-common)
endforeach()

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/audio_cache.h"
#include "frontend/audio_cache.h"
#include "kaldi/base/timer.h"

@ -16,7 +16,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
namespace ppspeech {

@ -13,7 +13,7 @@
// limitations under the License.
#include "frontend/audio/cmvn.h"
#include "frontend/cmvn.h"
#include "utils/file_utils.h"
#include "utils/picojson.h"

@ -15,8 +15,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "frontend/frontend_itf.h"
#include "kaldi/util/options-itf.h"
namespace ppspeech {

@ -16,13 +16,13 @@
#include "base/flags.h"
#include "base/log.h"
#include "frontend/audio/audio_cache.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/normalizer.h"
#include "kaldi/feat/wave-reader.h"
#include "frontend/audio_cache.h"
#include "frontend/data_cache.h"
#include "frontend/fbank.h"
#include "frontend/feature_cache.h"
#include "frontend/frontend_itf.h"
#include "frontend/normalizer.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h"

@ -16,7 +16,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
using std::vector;

@ -15,8 +15,8 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/feature-fbank.h"
#include "frontend/feature_common.h"
#include "frontend/feature-fbank.h"
namespace ppspeech {

@ -18,11 +18,11 @@
// This file is copied/modified from kaldi/src/feat/feature-fbank.cc
//
#include "frontend/audio/feature-fbank.h"
#include "frontend/feature-fbank.h"
#include <cmath>
#include "frontend/audio/feature-functions.h"
#include "frontend/feature-functions.h"
namespace knf {

@ -23,9 +23,9 @@
#include <map>
#include "frontend/audio/feature-window.h"
#include "frontend/audio/mel-computations.h"
#include "frontend/audio/rfft.h"
#include "frontend/feature-window.h"
#include "frontend/mel-computations.h"
#include "frontend/rfft.h"
namespace knf {

@ -18,7 +18,7 @@
// This file is copied/modified from kaldi/src/feat/feature-functions.cc
#include "frontend/audio/feature-functions.h"
#include "frontend/feature-functions.h"
#include <cstdint>
#include <vector>

@ -4,7 +4,7 @@
// This file is copied/modified from kaldi/src/feat/feature-window.cc
#include "frontend/audio/feature-window.h"
#include "frontend/feature-window.h"
#include <cmath>
#include <vector>

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/feature_cache.h"
#include "frontend/feature_cache.h"
namespace ppspeech {

@ -15,7 +15,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
namespace ppspeech {

@ -15,7 +15,7 @@
#pragma once
#include "frontend_itf.h"
#include "frontend/audio/feature-window.h"
#include "frontend/feature-window.h"
namespace ppspeech {
@ -52,4 +52,4 @@ class StreamingFeatureTpl : public FrontendInterface {
} // namespace ppspeech
#include "frontend/audio/feature_common_inl.h"
#include "frontend/feature_common_inl.h"

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/feature_pipeline.h"
#include "frontend/feature_pipeline.h"
namespace ppspeech {

@ -16,13 +16,13 @@
#pragma once
#include "frontend/audio/assembler.h"
#include "frontend/audio/audio_cache.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/normalizer.h"
#include "frontend/assembler.h"
#include "frontend/audio_cache.h"
#include "frontend/data_cache.h"
#include "frontend/fbank.h"
#include "frontend/feature_cache.h"
#include "frontend/frontend_itf.h"
#include "frontend/cmvn.h"
// feature
DECLARE_bool(fill_zero);

@ -15,7 +15,7 @@
#pragma once
#include "base/basic_types.h"
#include "kaldi/matrix/kaldi-vector.h"
#include "matrix/kaldi-vector.h"
namespace ppspeech {

@ -18,12 +18,12 @@
// This file is copied/modified from kaldi/src/feat/mel-computations.cc
#include "frontend/audio/mel-computations.h"
#include "frontend/mel-computations.h"
#include <algorithm>
#include <sstream>
#include "frontend/audio/feature-window.h"
#include "frontend/feature-window.h"
namespace knf {

@ -22,7 +22,7 @@
#include <cmath>
#include <string>
#include "frontend/audio/feature-window.h"
#include "frontend/feature-window.h"
namespace knf {

@ -14,5 +14,4 @@
#pragma once
#include "frontend/audio/cmvn.h"
#include "frontend/audio/db_norm.h"
#include "frontend/cmvn.h"

@ -16,7 +16,7 @@
* limitations under the License.
*/
#include "frontend/audio/rfft.h"
#include "frontend/rfft.h"
#include <cmath>
#include <vector>

@ -25,7 +25,7 @@
#include <sstream>
#include <vector>
#include "feat/wave-reader.h"
#include "frontend/wave-reader.h"
#include "base/kaldi-error.h"
#include "base/kaldi-utils.h"

@ -0,0 +1,7 @@
# Trimmed-down Kaldi matrix/vector library (no BLAS backend is linked here).
# No STATIC/SHARED keyword is given, so the library type follows
# BUILD_SHARED_LIBS.
add_library(kaldi-matrix
    kaldi-matrix.cc
    kaldi-vector.cc
)

# Explicit PUBLIC keeps the transitive linking of the keyword-less signature
# and makes the visibility intent visible.
target_link_libraries(kaldi-matrix PUBLIC kaldi-base)

@ -28,7 +28,7 @@ namespace kaldi {
// Default constructor: forwards a NULL data pointer and zeros for the three
// remaining arguments to the MatrixBase constructor; the body itself does
// nothing, so no storage is set up here.
template<typename Real>
Matrix<Real>::Matrix(): MatrixBase<Real>(NULL, 0, 0, 0) { }
/*
template<>
template<>
void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float> &ra, const VectorBase<float> &rb);
@ -36,6 +36,7 @@ void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float> &ra
template<>
template<>
void MatrixBase<double>::AddVecVec(const double alpha, const VectorBase<double> &ra, const VectorBase<double> &rb);
*/
template<typename Real>
inline std::ostream & operator << (std::ostream & os, const MatrixBase<Real> & M) {

@ -23,17 +23,9 @@
// limitations under the License.
#include "matrix/kaldi-matrix.h"
#include "matrix/sp-matrix.h"
#include "matrix/jama-svd.h"
#include "matrix/jama-eig.h"
#include "matrix/compressed-matrix.h"
#include "matrix/sparse-matrix.h"
static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans),
"kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!");
namespace kaldi {
/*
template<typename Real>
void MatrixBase<Real>::Invert(Real *log_det, Real *det_sign,
bool inverse_needed) {
@ -206,29 +198,30 @@ void MatrixBase<Real>::SetMatMatDivMat(const MatrixBase<Real>& A,
}
}
}
*/
template<typename Real>
void MatrixBase<Real>::CopyLowerToUpper() {
KALDI_ASSERT(num_rows_ == num_cols_);
Real *data = data_;
MatrixIndexT num_rows = num_rows_, stride = stride_;
for (int32 i = 0; i < num_rows; i++)
for (int32 j = 0; j < i; j++)
data[j * stride + i ] = data[i * stride + j];
}
//template<typename Real>
//void MatrixBase<Real>::CopyLowerToUpper() {
//KALDI_ASSERT(num_rows_ == num_cols_);
//Real *data = data_;
//MatrixIndexT num_rows = num_rows_, stride = stride_;
//for (int32 i = 0; i < num_rows; i++)
//for (int32 j = 0; j < i; j++)
//data[j * stride + i ] = data[i * stride + j];
//}
template<typename Real>
void MatrixBase<Real>::CopyUpperToLower() {
KALDI_ASSERT(num_rows_ == num_cols_);
Real *data = data_;
MatrixIndexT num_rows = num_rows_, stride = stride_;
for (int32 i = 0; i < num_rows; i++)
for (int32 j = 0; j < i; j++)
data[i * stride + j] = data[j * stride + i];
}
//template<typename Real>
//void MatrixBase<Real>::CopyUpperToLower() {
//KALDI_ASSERT(num_rows_ == num_cols_);
//Real *data = data_;
//MatrixIndexT num_rows = num_rows_, stride = stride_;
//for (int32 i = 0; i < num_rows; i++)
//for (int32 j = 0; j < i; j++)
//data[i * stride + j] = data[j * stride + i];
//}
/*
template<typename Real>
void MatrixBase<Real>::SymAddMat2(const Real alpha,
const MatrixBase<Real> &A,
@ -734,7 +727,7 @@ void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
}
#endif
*/
// Copy constructor. Copies data to newly allocated memory.
template<typename Real>
Matrix<Real>::Matrix (const MatrixBase<Real> & M,
@ -898,6 +891,7 @@ template
void MatrixBase<double>::CopyFromMat(const MatrixBase<double> & M,
MatrixTransposeType Trans);
/*
// Specialize the template for CopyFromSp for float, float.
template<>
template<>
@ -992,7 +986,7 @@ template
void MatrixBase<double>::CopyFromTp(const TpMatrix<double> & M,
MatrixTransposeType trans);
*/
template<typename Real>
void MatrixBase<Real>::CopyRowsFromVec(const VectorBase<Real> &rv) {
if (rv.Dim() == num_rows_*num_cols_) {
@ -1076,7 +1070,6 @@ void MatrixBase<Real>::CopyColsFromVec(const VectorBase<Real> &rv) {
}
}
template<typename Real>
void MatrixBase<Real>::CopyRowFromVec(const VectorBase<Real> &rv, const MatrixIndexT row) {
KALDI_ASSERT(rv.Dim() == num_cols_ &&
@ -1088,7 +1081,7 @@ void MatrixBase<Real>::CopyRowFromVec(const VectorBase<Real> &rv, const MatrixIn
std::memcpy(row_data, rv_data, num_cols_ * sizeof(Real));
}
/*
template<typename Real>
void MatrixBase<Real>::CopyDiagFromVec(const VectorBase<Real> &rv) {
KALDI_ASSERT(rv.Dim() == std::min(num_cols_, num_rows_));
@ -1096,7 +1089,7 @@ void MatrixBase<Real>::CopyDiagFromVec(const VectorBase<Real> &rv) {
Real *my_data = this->Data();
for (; rv_data != rv_end; rv_data++, my_data += (this->stride_+1))
*my_data = *rv_data;
}
}*/
template<typename Real>
void MatrixBase<Real>::CopyColFromVec(const VectorBase<Real> &rv,
@ -1135,7 +1128,7 @@ void Matrix<Real>::Destroy() {
}
/*
template<typename Real>
void MatrixBase<Real>::MulElements(const MatrixBase<Real> &a) {
KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_);
@ -1325,6 +1318,7 @@ void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) {
}
}
}
*/
template<typename Real>
void MatrixBase<Real>::SetZero() {
@ -1344,6 +1338,7 @@ void MatrixBase<Real>::Set(Real value) {
}
}
/*
template<typename Real>
void MatrixBase<Real>::SetUnit() {
SetZero();
@ -1374,6 +1369,7 @@ void MatrixBase<Real>::SetRandUniform() {
}
}
}
*/
template<typename Real>
void MatrixBase<Real>::Write(std::ostream &os, bool binary) const {
@ -1420,23 +1416,11 @@ void MatrixBase<Real>::Write(std::ostream &os, bool binary) const {
template<typename Real>
void MatrixBase<Real>::Read(std::istream & is, bool binary, bool add) {
if (add) {
Matrix<Real> tmp(num_rows_, num_cols_);
tmp.Read(is, binary, false); // read without adding.
if (tmp.num_rows_ != this->num_rows_ || tmp.num_cols_ != this->num_cols_)
KALDI_ERR << "MatrixBase::Read, size mismatch "
<< this->num_rows_ << ", " << this->num_cols_
<< " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_;
this->AddMat(1.0, tmp);
return;
}
// now assume add == false.
void MatrixBase<Real>::Read(std::istream & is, bool binary) {
// In order to avoid rewriting this, we just declare a Matrix and
// use it to read the data, then copy.
Matrix<Real> tmp;
tmp.Read(is, binary, false);
tmp.Read(is, binary);
if (tmp.NumRows() != NumRows() || tmp.NumCols() != NumCols()) {
KALDI_ERR << "MatrixBase<Real>::Read, size mismatch "
<< NumRows() << " x " << NumCols() << " versus "
@ -1447,23 +1431,7 @@ void MatrixBase<Real>::Read(std::istream & is, bool binary, bool add) {
template<typename Real>
void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
if (add) {
Matrix<Real> tmp;
tmp.Read(is, binary, false); // read without adding.
if (this->num_rows_ == 0) this->Resize(tmp.num_rows_, tmp.num_cols_);
else {
if (this->num_rows_ != tmp.num_rows_ || this->num_cols_ != tmp.num_cols_) {
if (tmp.num_rows_ == 0) return; // do nothing in this case.
else KALDI_ERR << "Matrix::Read, size mismatch "
<< this->num_rows_ << ", " << this->num_cols_
<< " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_;
}
}
this->AddMat(1.0, tmp);
return;
}
void Matrix<Real>::Read(std::istream & is, bool binary) {
// now assume add == false.
MatrixIndexT pos_at_start = is.tellg();
std::ostringstream specific_error;
@ -1472,10 +1440,10 @@ void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
int peekval = Peek(is, binary);
if (peekval == 'C') {
// This code enables us to read CompressedMatrix as a regular matrix.
CompressedMatrix compressed_mat;
compressed_mat.Read(is, binary); // at this point, add == false.
this->Resize(compressed_mat.NumRows(), compressed_mat.NumCols());
compressed_mat.CopyToMat(this);
//CompressedMatrix compressed_mat;
//compressed_mat.Read(is, binary); // at this point, add == false.
//this->Resize(compressed_mat.NumRows(), compressed_mat.NumCols());
//compressed_mat.CopyToMat(this);
return;
}
const char *my_token = (sizeof(Real) == 4 ? "FM" : "DM");
@ -1483,7 +1451,7 @@ void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
if (peekval == other_token_start) { // need to instantiate the other type to read it.
typedef typename OtherReal<Real>::Real OtherType; // if Real == float, OtherType == double, and vice versa.
Matrix<OtherType> other(this->num_rows_, this->num_cols_);
other.Read(is, binary, false); // add is false at this point anyway.
other.Read(is, binary); // add is false at this point anyway.
this->Resize(other.NumRows(), other.NumCols());
this->CopyFromMat(other);
return;
@ -1672,7 +1640,7 @@ SubMatrix<Real>::SubMatrix(Real *data,
}
}
/*
template<typename Real>
void MatrixBase<Real>::Add(const Real alpha) {
Real *data = data_;
@ -1812,15 +1780,15 @@ void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
for(int32 i = 0; i < NumRows(); i++)
(*this)(i, i) *= 1.00001;
}*/
bool ans = JamaSvd(s, U, Vt);
if (Vt != NULL) Vt->Transpose(); // possibly to do: change this and also the transpose inside the JamaSvd routine. note, Vt is square.
if (!ans) {
KALDI_ERR << "Error doing Svd"; // This one will be caught.
}
#endif
if (prescale != 1.0) s->Scale(1.0/prescale);
}
// bool ans = JamaSvd(s, U, Vt);
//if (Vt != NULL) Vt->Transpose(); // possibly to do: change this and also the transpose inside the JamaSvd routine. note, Vt is square.
//if (!ans) {
//KALDI_ERR << "Error doing Svd"; // This one will be caught.
//}
//#endif
//if (prescale != 1.0) s->Scale(1.0/prescale);
//}
/*
template<typename Real>
void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) const {
try {
@ -2052,17 +2020,18 @@ void MatrixBase<Real>::InvertDouble(Real *log_det, Real *det_sign,
if (log_det) *log_det = log_det_tmp;
if (det_sign) *det_sign = det_sign_tmp;
}
*/
template<class Real>
void MatrixBase<Real>::CopyFromMat(const CompressedMatrix &mat) {
mat.CopyToMat(this);
}
//template<class Real>
//void MatrixBase<Real>::CopyFromMat(const CompressedMatrix &mat) {
//mat.CopyToMat(this);
//}
template<class Real>
Matrix<Real>::Matrix(const CompressedMatrix &M): MatrixBase<Real>() {
Resize(M.NumRows(), M.NumCols(), kUndefined);
M.CopyToMat(this);
}
//template<class Real>
//Matrix<Real>::Matrix(const CompressedMatrix &M): MatrixBase<Real>() {
//Resize(M.NumRows(), M.NumCols(), kUndefined);
//M.CopyToMat(this);
//}
@ -2074,7 +2043,7 @@ void MatrixBase<Real>::InvertElements() {
}
}
}
/*
template<typename Real>
void MatrixBase<Real>::Transpose() {
KALDI_ASSERT(num_rows_ == num_cols_);
@ -2250,7 +2219,7 @@ bool MatrixBase<Real>::Power(Real power) {
(*this).AddMatMat(1.0, tmp, kNoTrans, P, kNoTrans, 0.0);
return true;
}
*/
template<typename Real>
void Matrix<Real>::Swap(Matrix<Real> *other) {
std::swap(this->data_, other->data_);
@ -2258,7 +2227,7 @@ void Matrix<Real>::Swap(Matrix<Real> *other) {
std::swap(this->num_rows_, other->num_rows_);
std::swap(this->stride_, other->stride_);
}
/*
// Repeating this comment that appeared in the header:
// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
// P^{-1}. Be careful: the relationship of D to the eigenvalues we output is
@ -2298,7 +2267,7 @@ void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
// INT_32 mVersion;
// INT_32 mSampSize;
// };
/*
template<typename Real>
bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
{
@ -2821,7 +2790,7 @@ void MatrixBase<Real>::GroupMax(const MatrixBase<Real> &src) {
}
}
}
*/
template<typename Real>
void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
const MatrixIndexT *indices) {
@ -2847,7 +2816,7 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
}
}
/*
template<typename Real>
void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
const MatrixIndexT *indices) {
@ -2871,8 +2840,9 @@ void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
this_data[c] += src_data[*index_ptr];
}
}
}
}*/
/*
template<typename Real>
void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
const MatrixIndexT *indices) {
@ -3022,9 +2992,9 @@ void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
value_data += value_stride;
diff_data += diff_stride;
}
}
}*/
/*
template<typename Real>
template<typename OtherReal>
void MatrixBase<Real>::AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v) {
@ -3087,7 +3057,7 @@ template void MatrixBase<double>::AddVecToCols(const double alpha,
const VectorBase<float> &v);
template void MatrixBase<double>::AddVecToCols(const double alpha,
const VectorBase<double> &v);
*/
//Explicit instantiation of the classes
//Apparently, it seems to be necessary that the instantiation
//happens at the end of the file. Otherwise, not all the member

@ -32,13 +32,6 @@ namespace kaldi {
/// @{ \addtogroup matrix_funcs_scalar
/// We need to declare this here as it will be a friend function.
/// tr(A B), or tr(A B^T).
template<typename Real>
Real TraceMatMat(const MatrixBase<Real> &A, const MatrixBase<Real> &B,
MatrixTransposeType trans = kNoTrans);
/// @}
/// \addtogroup matrix_group
/// @{
@ -50,15 +43,8 @@ class MatrixBase {
public:
// so this child can access protected members of other instances.
friend class Matrix<Real>;
friend class SubMatrix<Real>;
// friend declarations for CUDA matrices (see ../cudamatrix/)
friend class CuMatrixBase<Real>;
friend class CuMatrix<Real>;
friend class CuSubMatrix<Real>;
friend class CuPackedMatrix<Real>;
friend class PackedMatrix<Real>;
friend class SparseMatrix<Real>;
friend class SparseMatrix<float>;
friend class SparseMatrix<double>;
/// Returns number of rows (or zero for empty matrix).
inline MatrixIndexT NumRows() const { return num_rows_; }
@ -127,14 +113,6 @@ class MatrixBase {
/// Sets all elements to a specific value.
void Set(Real);
/// Sets to zero, except ones along diagonal [for non-square matrices too]
void SetUnit();
/// Sets to random values of a normal distribution
void SetRandn();
/// Sets to numbers uniformly distributed on (0, 1)
void SetRandUniform();
/* Copying functions. These do not resize the matrix! */
/// Copy given matrix. (no resize is done).
template<typename OtherReal>
@ -142,21 +120,17 @@ class MatrixBase {
MatrixTransposeType trans = kNoTrans);
/// Copy from compressed matrix.
void CopyFromMat(const CompressedMatrix &M);
/// Copy given spmatrix. (no resize is done).
template<typename OtherReal>
void CopyFromSp(const SpMatrix<OtherReal> &M);
//void CopyFromMat(const CompressedMatrix &M);
/// Copy given tpmatrix. (no resize is done).
template<typename OtherReal>
void CopyFromTp(const TpMatrix<OtherReal> &M,
MatrixTransposeType trans = kNoTrans);
//template<typename OtherReal>
//void CopyFromTp(const TpMatrix<OtherReal> &M,
//MatrixTransposeType trans = kNoTrans);
/// Copy from CUDA matrix. Implemented in ../cudamatrix/cu-matrix.h
template<typename OtherReal>
void CopyFromMat(const CuMatrixBase<OtherReal> &M,
MatrixTransposeType trans = kNoTrans);
//template<typename OtherReal>
//void CopyFromMat(const CuMatrixBase<OtherReal> &M,
//MatrixTransposeType trans = kNoTrans);
/// This function has two modes of operation. If v.Dim() == NumRows() *
/// NumCols(), then treats the vector as a row-by-row concatenation of a
@ -165,7 +139,7 @@ class MatrixBase {
void CopyRowsFromVec(const VectorBase<Real> &v);
/// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc
void CopyRowsFromVec(const CuVectorBase<Real> &v);
//void CopyRowsFromVec(const CuVectorBase<Real> &v);
template<typename OtherReal>
void CopyRowsFromVec(const VectorBase<OtherReal> &v);
@ -215,7 +189,7 @@ class MatrixBase {
return SubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
}
/* Various special functions. */
/*
/// Returns sum of all elements in matrix.
Real Sum() const;
/// Returns trace of matrix.
@ -268,15 +242,16 @@ class MatrixBase {
/// Does inversion in double precision even if matrix was not double.
void InvertDouble(Real *LogDet = NULL, Real *det_sign = NULL,
bool inverse_needed = true);
*/
/// Inverts all the elements of the matrix
void InvertElements();
/*
/// Transpose the matrix. This one is only
/// applicable to square matrices (the one in the
/// Matrix child class works also for non-square.
void Transpose();
*/
/// Copies column r from column indices[r] of src.
/// As a special case, if indexes[i] == -1, sets column i to zero.
/// all elements of "indices" must be in [-1, src.NumCols()-1],
@ -296,8 +271,8 @@ class MatrixBase {
/// indices.size() must equal this->NumCols(),
/// all elements of "reorder" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this.NumRows()
void AddCols(const MatrixBase<Real> &src,
const MatrixIndexT *indices);
//void AddCols(const MatrixBase<Real> &src,
// const MatrixIndexT *indices);
/// Copies row r of this matrix from an array of floats at the location given
/// by src[r]. If any src[r] is NULL then this.Row(r) will be set to zero.
@ -314,30 +289,30 @@ class MatrixBase {
/// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]).
/// If indexes[r] < 0, does not add anything. all elements of "indexes" must
/// be in [-1, src.NumRows()-1], and src.NumCols() must equal this.NumCols().
void AddRows(Real alpha,
const MatrixBase<Real> &src,
const MatrixIndexT *indexes);
// void AddRows(Real alpha,
// const MatrixBase<Real> &src,
// const MatrixIndexT *indexes);
/// Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as the
/// beginning of a region of memory representing a vector of floats, of the
/// same length as this.NumCols(). If src[r] is NULL, does not add anything.
void AddRows(Real alpha, const Real *const *src);
//void AddRows(Real alpha, const Real *const *src);
/// For each row r of this matrix, adds it (times alpha) to the array of
/// floats at the location given by dst[r]. If dst[r] is NULL, does not do
/// anything for that row. Requires that none of the memory regions pointed
/// to by the pointers in "dst" overlap (e.g. none of the pointers should be
/// the same).
void AddToRows(Real alpha, Real *const *dst) const;
//void AddToRows(Real alpha, Real *const *dst) const;
/// For each row i of *this, adds this->Row(i) to
/// dst->Row(indexes(i)) if indexes(i) >= 0, else do nothing.
/// Requires that all the indexes[i] that are >= 0
/// be distinct, otherwise the behavior is undefined.
void AddToRows(Real alpha,
const MatrixIndexT *indexes,
MatrixBase<Real> *dst) const;
//void AddToRows(Real alpha,
// const MatrixIndexT *indexes,
// MatrixBase<Real> *dst) const;
/*
inline void ApplyPow(Real power) {
this -> Pow(*this, power);
}
@ -374,7 +349,7 @@ class MatrixBase {
inline void ApplyLog() {
this -> Log(*this);
}
*/
/// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
/// P^{-1}. Be careful: the relationship of D to the eigenvalues we output is
/// slightly complicated, due to the need for P to be real. In the symmetric
@ -389,9 +364,9 @@ class MatrixBase {
/// instead (*this) P = P D.
///
/// The non-member function CreateEigenvalueMatrix creates D from eigs_real and eigs_imag.
void Eig(MatrixBase<Real> *P,
VectorBase<Real> *eigs_real,
VectorBase<Real> *eigs_imag) const;
//void Eig(MatrixBase<Real> *P,
// VectorBase<Real> *eigs_real,
// VectorBase<Real> *eigs_imag) const;
/// The Power method attempts to take the matrix to a power using a method that
/// works in general for fractional and negative powers. The input matrix must
@ -400,7 +375,7 @@ class MatrixBase {
/// return false and leave the matrix unchanged, if at entry the matrix had
/// real negative eigenvalues (or if it had zero eigenvalues and the power was
/// negative).
bool Power(Real pow);
// bool Power(Real pow);
/** Singular value decomposition
Major limitations:
@ -413,31 +388,32 @@ class MatrixBase {
expect that S.Dim() == m, U is either NULL or m by n,
and v is either NULL or n by n.
The singular values are not sorted (use SortSvd for that). */
void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real> *Vt); // Destroys calling matrix.
//void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
// MatrixBase<Real> *Vt); // Destroys calling matrix.
/// Compute SVD (*this) = U diag(s) Vt. Note that the V in the call is already
/// transposed; the normal formulation is U diag(s) V^T.
/// Null pointers for U or V mean we don't want that output (this saves
/// compute). The singular values are not sorted (use SortSvd for that).
void Svd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real> *Vt) const;
//void Svd(VectorBase<Real> *s, MatrixBase<Real> *U,
// MatrixBase<Real> *Vt) const;
/// Compute SVD but only retain the singular values.
void Svd(VectorBase<Real> *s) const { Svd(s, NULL, NULL); }
//void Svd(VectorBase<Real> *s) const { Svd(s, NULL, NULL); }
/// Returns smallest singular value.
Real MinSingularValue() const {
Vector<Real> tmp(std::min(NumRows(), NumCols()));
Svd(&tmp);
return tmp.Min();
}
//Real MinSingularValue() const {
// Vector<Real> tmp(std::min(NumRows(), NumCols()));
//Svd(&tmp);
//return tmp.Min();
//}
void TestUninitialized() const; // This function is designed so that if any element
//void TestUninitialized() const; // This function is designed so that if any element
// of the matrix is uninitialized memory, valgrind will complain.
/// Returns condition number by computing Svd. Works even if cols > rows.
/// Returns infinity if all singular values are zero.
/*
Real Cond() const;
/// Returns true if matrix is Symmetric.
@ -559,7 +535,7 @@ class MatrixBase {
// element-by-element, set *this = diff * (1.0 - value^2).
void DiffTanh(const MatrixBase<Real> &value,
const MatrixBase<Real> &diff);
*/
/** Uses Svd to compute the eigenvalue decomposition of a symmetric positive
* semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an
* orthogonal matrix so rP^{-1} = rP^T. Throws exception if input was not
@ -571,208 +547,15 @@ class MatrixBase {
* SpMatrix and use Eig() function there, which uses eigenvalue decomposition
* directly rather than SVD.
*/
void SymPosSemiDefEig(VectorBase<Real> *s, MatrixBase<Real> *P,
Real check_thresh = 0.001);
friend Real kaldi::TraceMatMat<Real>(const MatrixBase<Real> &A,
const MatrixBase<Real> &B, MatrixTransposeType trans); // tr (A B)
// so it can get around const restrictions on the pointer to data_.
friend class SubMatrix<Real>;
/// Add a scalar to each element
void Add(const Real alpha);
/// Add a scalar to each diagonal element.
void AddToDiag(const Real alpha);
/// *this += alpha * a * b^T
template<typename OtherReal>
void AddVecVec(const Real alpha, const VectorBase<OtherReal> &a,
const VectorBase<OtherReal> &b);
/// [each row of *this] += alpha * v
template<typename OtherReal>
void AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v);
/// [each col of *this] += alpha * v
template<typename OtherReal>
void AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v);
/// *this += alpha * M [or M^T]
void AddMat(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transA = kNoTrans);
/// *this += alpha * A [or A^T].
void AddSmat(Real alpha, const SparseMatrix<Real> &A,
MatrixTransposeType trans = kNoTrans);
/// (*this) = alpha * op(A) * B + beta * (*this), where A is sparse.
/// Multiplication of sparse with dense matrix. See also AddMatSmat.
void AddSmatMat(Real alpha, const SparseMatrix<Real> &A,
MatrixTransposeType transA, const MatrixBase<Real> &B,
Real beta);
/// (*this) = alpha * A * op(B) + beta * (*this), where B is sparse
/// and op(B) is either B or trans(B) depending on the 'transB' argument.
/// This is multiplication of a dense by a sparse matrix. See also
/// AddSmatMat.
void AddMatSmat(Real alpha, const MatrixBase<Real> &A,
const SparseMatrix<Real> &B, MatrixTransposeType transB,
Real beta);
/// *this = beta * *this + alpha * M M^T, for symmetric matrices. It only
/// updates the lower triangle of *this. It will leave the matrix asymmetric;
/// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
void SymAddMat2(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transA, Real beta);
/// *this = beta * *this + alpha * diag(v) * M [or M^T].
/// The same as adding M but scaling each row M_i by v(i).
void AddDiagVecMat(const Real alpha, const VectorBase<Real> &v,
const MatrixBase<Real> &M, MatrixTransposeType transM,
Real beta = 1.0);
/// *this = beta * *this + alpha * M [or M^T] * diag(v)
/// The same as adding M but scaling each column M_j by v(j).
void AddMatDiagVec(const Real alpha,
const MatrixBase<Real> &M, MatrixTransposeType transM,
VectorBase<Real> &v,
Real beta = 1.0);
/// *this = beta * *this + alpha * A .* B (.* element by element multiplication)
void AddMatMatElements(const Real alpha,
const MatrixBase<Real>& A,
const MatrixBase<Real>& B,
const Real beta);
/// *this += alpha * S
template<typename OtherReal>
void AddSp(const Real alpha, const SpMatrix<OtherReal> &S);
void AddMatMat(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta);
/// *this = a * b / c (by element; when c = 0, *this = a)
void SetMatMatDivMat(const MatrixBase<Real>& A,
const MatrixBase<Real>& B,
const MatrixBase<Real>& C);
/// A version of AddMatMat specialized for when the second argument
/// contains a lot of zeroes.
void AddMatSmat(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta);
/// A version of AddMatMat specialized for when the first argument
/// contains a lot of zeroes.
void AddSmatMat(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta);
/// this <-- beta*this + alpha*A*B*C.
void AddMatMatMat(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const MatrixBase<Real>& C, MatrixTransposeType transC,
const Real beta);
/// this <-- beta*this + alpha*SpA*B.
// This and the routines below are really
// stubs that need to be made more efficient.
void AddSpMat(const Real alpha,
const SpMatrix<Real>& A,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta) {
Matrix<Real> M(A);
return AddMatMat(alpha, M, kNoTrans, B, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddTpMat(const Real alpha,
const TpMatrix<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta) {
Matrix<Real> M(A);
return AddMatMat(alpha, M, transA, B, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddMatSp(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const SpMatrix<Real>& B,
const Real beta) {
Matrix<Real> M(B);
return AddMatMat(alpha, A, transA, M, kNoTrans, beta);
}
/// this <-- beta*this + alpha*A*B*C.
void AddSpMatSp(const Real alpha,
const SpMatrix<Real> &A,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const SpMatrix<Real>& C,
const Real beta) {
Matrix<Real> M(A), N(C);
return AddMatMatMat(alpha, M, kNoTrans, B, transB, N, kNoTrans, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddMatTp(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const TpMatrix<Real>& B, MatrixTransposeType transB,
const Real beta) {
Matrix<Real> M(B);
return AddMatMat(alpha, A, transA, M, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddTpTp(const Real alpha,
const TpMatrix<Real>& A, MatrixTransposeType transA,
const TpMatrix<Real>& B, MatrixTransposeType transB,
const Real beta) {
Matrix<Real> M(A), N(B);
return AddMatMat(alpha, M, transA, N, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
// This one is more efficient, not like the others above.
void AddSpSp(const Real alpha,
const SpMatrix<Real>& A, const SpMatrix<Real>& B,
const Real beta);
/// Copy lower triangle to upper triangle (symmetrize)
void CopyLowerToUpper();
/// Copy upper triangle to lower triangle (symmetrize)
void CopyUpperToLower();
/// This function orthogonalizes the rows of a matrix using the Gram-Schmidt
/// process. It is only applicable if NumRows() <= NumCols(). It will use
/// random number generation to fill in rows with something nonzero, in cases
/// where the original matrix was of deficient row rank.
void OrthogonalizeRows();
/// stream read.
/// Use instead of stream<<*this, if you want to add to existing contents.
// Will throw exception on failure.
void Read(std::istream & in, bool binary, bool add = false);
void Read(std::istream & in, bool binary);
/// write to stream.
void Write(std::ostream & out, bool binary) const;
// Below is internal methods for Svd, user does not have to know about this.
#if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD)
// protected:
// Should be protected but used directly in testing routine.
// destroys *this!
void LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real> *Vt);
#else
protected:
// destroys *this!
bool JamaSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real> *V);
#endif
protected:
/// Initializer, callable only from child.
@ -827,19 +610,9 @@ class Matrix : public MatrixBase<Real> {
MatrixStrideType stride_type = kDefaultStride):
MatrixBase<Real>() { Resize(r, c, resize_type, stride_type); }
/// Copy constructor from CUDA matrix
/// This is defined in ../cudamatrix/cu-matrix.h
template<typename OtherReal>
explicit Matrix(const CuMatrixBase<OtherReal> &cu,
MatrixTransposeType trans = kNoTrans);
/// Swaps the contents of *this and *other. Shallow swap.
void Swap(Matrix<Real> *other);
/// Defined in ../cudamatrix/cu-matrix.cc
void Swap(CuMatrix<Real> *mat);
/// Constructor from any MatrixBase. Can also copy with transpose.
/// Allocates new memory.
explicit Matrix(const MatrixBase<Real> & M,
@ -853,40 +626,29 @@ class Matrix : public MatrixBase<Real> {
explicit Matrix(const MatrixBase<OtherReal> & M,
MatrixTransposeType trans = kNoTrans);
/// Constructs a full square matrix from an SpMatrix, converting the element
/// type if needed. There is no transpose option, and the result has
/// M.NumRows() rows and columns.
template<typename OtherReal>
explicit Matrix(const SpMatrix<OtherReal> & M) : MatrixBase<Real>() {
  const MatrixIndexT dim = M.NumRows();
  // kUndefined: CopyFromSp is called immediately to fill the new storage.
  this->Resize(dim, dim, kUndefined);
  this->CopyFromSp(M);
}
/// Constructor from CompressedMatrix
explicit Matrix(const CompressedMatrix &C);
/// Constructs a full matrix from a TpMatrix, optionally transposed, and
/// converting the element type if needed.
template <typename OtherReal>
explicit Matrix(const TpMatrix<OtherReal> & M,
                MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() {
  const bool transposed = (trans != kNoTrans);
  // kUndefined in both branches: CopyFromTp is called right after Resize to
  // fill the new storage.
  if (transposed) {
    this->Resize(M.NumCols(), M.NumRows(), kUndefined);
    this->CopyFromTp(M, kTrans);
  } else {
    this->Resize(M.NumRows(), M.NumCols(), kUndefined);
    this->CopyFromTp(M);
  }
}
//template <typename OtherReal>
//explicit Matrix(const TpMatrix<OtherReal> & M,
//MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() {
//if (trans == kNoTrans) {
//Resize(M.NumRows(), M.NumCols(), kUndefined);
//this->CopyFromTp(M);
//} else {
//Resize(M.NumCols(), M.NumRows(), kUndefined);
//this->CopyFromTp(M, kTrans);
//}
//}
/// read from stream.
// Unlike one in base, allows resizing.
void Read(std::istream & in, bool binary, bool add = false);
void Read(std::istream & in, bool binary);
/// Remove a specified row.
void RemoveRow(MatrixIndexT i);
/// Transpose the matrix. Works for non-square
/// matrices as well as square ones.
void Transpose();
//void Transpose();
/// Destructor to free matrices.
~Matrix() { Destroy(); }
@ -947,37 +709,6 @@ class Matrix : public MatrixBase<Real> {
/// A structure containing the HTK header.
/// [TODO: change the style of the variables to Kaldi-compliant]
/// NOTE(review): the fixed-width field types and their order presumably mirror
/// the on-disk HTK header record, so fields should not be reordered or
/// resized -- confirm against the HTK read/write code.
struct HtkHeader {
/// Number of samples.
int32 mNSamples;
/// Sample period.
/// NOTE(review): units are not stated here -- confirm against the HTK format.
int32 mSamplePeriod;
/// Sample size
/// NOTE(review): presumably in bytes per sample -- confirm.
int16 mSampleSize;
/// Sample kind.
/// NOTE(review): presumably the HTK parameter-kind code -- confirm.
uint16 mSampleKind;
};
// Read HTK formatted features from file into matrix.
template<typename Real>
bool ReadHtk(std::istream &is, Matrix<Real> *M, HtkHeader *header_ptr);
// Write (HTK format) features to file from matrix.
template<typename Real>
bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr);
// Write (CMUSphinx format) features to file from matrix.
template<typename Real>
bool WriteSphinx(std::ostream &os, const MatrixBase<Real> &M);
/// @} end of "addtogroup matrix_funcs_io"
/**
Sub-matrix representation.
Can work with sub-parts of a matrix using this class.
Note that SubMatrix is not very const-correct-- it allows you to
change the contents of a const Matrix. Be careful!
*/
template<typename Real>
class SubMatrix : public MatrixBase<Real> {
@ -1012,6 +743,7 @@ class SubMatrix : public MatrixBase<Real> {
/// Disallow assignment.
SubMatrix<Real> &operator = (const SubMatrix<Real> &other);
};
/// @} End of "addtogroup matrix_funcs_io".
/// \addtogroup matrix_funcs_scalar
@ -1019,7 +751,7 @@ class SubMatrix : public MatrixBase<Real> {
// Some declarations. These are traces of products.
/************************
template<typename Real>
bool ApproxEqual(const MatrixBase<Real> &A,
const MatrixBase<Real> &B, Real tol = 0.01) {
@ -1085,7 +817,7 @@ void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real>
template<typename Real>
bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);
**********/
/// @} end of addtogroup matrix_funcs_misc
@ -1101,7 +833,6 @@ std::istream & operator >> (std::istream & In, MatrixBase<Real> & M);
template<typename Real>
std::istream & operator >> (std::istream & In, Matrix<Real> & M);
template<typename Real>
bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());

@ -44,14 +44,14 @@ std::istream &operator >> (std::istream &is, Vector<Real> &rv) {
return is;
}
template<>
template<>
void VectorBase<float>::AddVec(const float alpha, const VectorBase<float> &rv);
template<>
template<>
void VectorBase<double>::AddVec<double>(const double alpha,
const VectorBase<double> &rv);
//template<>
//template<>
//void VectorBase<float>::AddVec(const float alpha, const VectorBase<float> &rv);
//template<>
//template<>
//void VectorBase<double>::AddVec<double>(const double alpha,
//const VectorBase<double> &rv);
} // namespace kaldi

@ -0,0 +1,345 @@
// matrix/kaldi-vector.h
// Copyright 2009-2012 Ondrej Glembek; Microsoft Corporation; Lukas Burget;
// Saarland University (Author: Arnab Ghoshal);
// Ariya Rastrow; Petr Schwarz; Yanmin Qian;
// Karel Vesely; Go Vivace Inc.; Arnab Ghoshal
// Wei Shi;
// 2015 Guoguo Chen
// 2017 Daniel Galvez
// 2019 Yiwen Shao
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_KALDI_VECTOR_H_
#define KALDI_MATRIX_KALDI_VECTOR_H_ 1
#include "matrix/matrix-common.h"
namespace kaldi {
/// \addtogroup matrix_group
/// @{
/// Provides a vector abstraction class.
/// This class provides a way to work with vectors in kaldi.
/// It encapsulates basic operations and memory optimizations.
/// VectorBase itself only holds a data pointer and a dimension; its
/// constructor and destructor are protected and do not allocate or free
/// anything. Ownership is decided by the derived classes (Vector has
/// Init/Destroy; SubVector only aliases existing memory).
template<typename Real>
class VectorBase {
public:
/// Set vector to all zeros.
void SetZero();
/// Returns true if the vector is all zeros.
/// NOTE(review): presumably an element counts as zero when its magnitude is
/// within `cutoff` -- confirm against the implementation.
bool IsZero(Real cutoff = 1.0e-06) const; // replace magic number
/// Set all members of a vector to a specified value.
void Set(Real f);
/// Returns the dimension of the vector.
inline MatrixIndexT Dim() const { return dim_; }
/// Returns the size in memory of the vector, in bytes.
/// (dim_ * sizeof(Real), converted to MatrixIndexT on return.)
inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }
/// Returns a pointer to the start of the vector's data.
inline Real* Data() { return data_; }
/// Returns a pointer to the start of the vector's data (const).
inline const Real* Data() const { return data_; }
/// Indexing operator (const).
/// The index is range-checked only through KALDI_PARANOID_ASSERT; the cast to
/// the unsigned index type makes a negative i fail the same comparison.
inline Real operator() (MatrixIndexT i) const {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(dim_));
return *(data_ + i);
}
/// Indexing operator (non-const).
/// Returns a reference, so the element can be assigned to. Same range check
/// as the const version.
inline Real & operator() (MatrixIndexT i) {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(dim_));
return *(data_ + i);
}
/** @brief Returns a sub-vector of a vector (a range of elements).
* @param o [in] Origin, 0 <= o <= Dim()
* @param l [in] Length, 0 <= l <= Dim()-o
* (the SubVector constructor asserts o + l <= Dim())
* @return A SubVector object that aliases the data of the Vector object.
* See @c SubVector class for details */
SubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
return SubVector<Real>(*this, o, l);
}
/** @brief Returns a const sub-vector of a vector (a range of elements).
* @param o [in] Origin, 0 <= o <= Dim()
* @param l [in] Length, 0 <= l <= Dim()-o
* (the SubVector constructor asserts o + l <= Dim())
* @return A SubVector object that aliases the data of the Vector object.
* Note that the SubVector constructor uses const_cast, so the returned
* object still points at this vector's (const) data.
* See @c SubVector class for details */
const SubVector<Real> Range(const MatrixIndexT o,
const MatrixIndexT l) const {
return SubVector<Real>(*this, o, l);
}
/// Copy data from another vector (must match own size).
void CopyFromVec(const VectorBase<Real> &v);
/// Copy data from another vector of different type (double vs. float)
template<typename OtherReal>
void CopyFromVec(const VectorBase<OtherReal> &v);
/// Performs a row stack of the matrix M
void CopyRowsFromMat(const MatrixBase<Real> &M);
/// Performs a row stack of the matrix M, with type conversion.
template<typename OtherReal>
void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
/// Performs a column stack of the matrix M
void CopyColsFromMat(const MatrixBase<Real> &M);
/// Extracts a row of the matrix M. Could also do this with
/// this->Copy(M[row]).
void CopyRowFromMat(const MatrixBase<Real> &M, MatrixIndexT row);
/// Extracts a row of the matrix M with type conversion.
template<typename OtherReal>
void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);
/// Extracts a column of the matrix M.
template<typename OtherReal>
void CopyColFromMat(const MatrixBase<OtherReal> &M , MatrixIndexT col);
/// Reads from C++ stream, in binary or text mode as given by `binary`.
/// Throws exception on failure
void Read(std::istream &in, bool binary);
/// Writes to C++ stream (option to write in binary).
void Write(std::ostream &Out, bool binary) const;
// The float and double instantiations are friends of each other.
// NOTE(review): presumably so the type-converting copy routines can reach the
// other instantiation's protected members -- confirm.
friend class VectorBase<double>;
friend class VectorBase<float>;
protected:
/// Destructor; does not deallocate memory, this is handled by child classes.
/// This destructor is protected so this object can only be
/// deleted via a child.
~VectorBase() {}
/// Empty initializer, corresponds to vector of zero size.
explicit VectorBase(): data_(NULL), dim_(0) {
KALDI_ASSERT_IS_FLOATING_TYPE(Real);
}
/// data memory area
Real* data_;
/// dimension of vector
MatrixIndexT dim_;
// NOTE(review): macro defined elsewhere; by its name it disables copy
// construction and assignment of VectorBase -- confirm.
KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
}; // class VectorBase
/** @brief A class representing a vector.
*
* This class provides a way to work with vectors in kaldi.
* It encapsulates basic operations and memory optimizations.
* Unlike VectorBase and SubVector, a Vector manages its own storage: it can
* be resized, and its destructor calls Destroy(). */
template<typename Real>
class Vector: public VectorBase<Real> {
public:
/// Constructor that takes no arguments. Initializes to empty.
Vector(): VectorBase<Real>() {}
/// Constructor with specific size. Sets to all-zero by default
/// (resize_type == kSetZero); with kUndefined the memory contents are
/// undefined. See Resize() for the meaning of resize_type.
explicit Vector(const MatrixIndexT s,
MatrixResizeType resize_type = kSetZero)
: VectorBase<Real>() { Resize(s, resize_type); }
/// Copy constructor from CUDA vector
/// This is defined in ../cudamatrix/cu-vector.h
/// (Currently commented out, so not available in this header.)
//template<typename OtherReal>
//explicit Vector(const CuVectorBase<OtherReal> &cu);
/// Copy constructor. The need for this is controversial.
/// Resizes to v.Dim() and copies the elements of v.
Vector(const Vector<Real> &v) : VectorBase<Real>() { // (cannot be explicit)
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
/// Copy-constructor from base-class, needed to copy from SubVector.
explicit Vector(const VectorBase<Real> &v) : VectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
/// Type conversion constructor.
/// Copies from a vector with a different element type (double vs. float).
template<typename OtherReal>
explicit Vector(const VectorBase<OtherReal> &v): VectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
// Took this out since it is unsafe : Arnab
// /// Constructor from a pointer and a size; copies the data to a location
// /// it owns.
// Vector(const Real* Data, const MatrixIndexT s): VectorBase<Real>() {
// Resize(s);
// CopyFromPtr(Data, s);
// }
/// Swaps the contents of *this and *other. Shallow swap.
void Swap(Vector<Real> *other);
/// Destructor. Deallocates memory.
~Vector() { Destroy(); }
/// Read function using C++ streams, in binary or text mode as given by
/// `binary`.
/// NOTE(review): presumably resizes the vector to fit the data read, unlike
/// the base-class Read -- confirm against the implementation.
void Read(std::istream &in, bool binary);
/// Set vector to a specified size (can be zero).
/// The value of the new data depends on resize_type:
/// -if kSetZero, the new data will be zero
/// -if kUndefined, the new data will be undefined
/// -if kCopyData, the new data will be the same as the old data in any
/// shared positions, and zero elsewhere.
/// This function takes time proportional to the number of data elements.
void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero);
/// Remove one element and shifts later elements down.
void RemoveElement(MatrixIndexT i);
/// Assignment operator.
/// Resizes to other.Dim() and then copies other's elements.
Vector<Real> &operator = (const Vector<Real> &other) {
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
/// Assignment operator that takes VectorBase.
/// Resizes to other.Dim() and then copies other's elements.
Vector<Real> &operator = (const VectorBase<Real> &other) {
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
private:
/// Init assumes the current contents of the class are invalid (i.e. junk or
/// has already been freed), and it sets the vector to newly allocated memory
/// with the specified dimension. dim == 0 is acceptable. The memory contents
/// pointed to by data_ will be undefined.
void Init(const MatrixIndexT dim);
/// Destroy function, called internally.
void Destroy();
};
/// Represents a non-allocating general vector which can be defined
/// as a sub-vector of higher-level vector [or as the row of a matrix].
/// A SubVector never owns memory: every constructor just stores a pointer
/// and a length, and the destructor does nothing, so the aliased storage must
/// outlive the SubVector.
template<typename Real>
class SubVector : public VectorBase<Real> {
 public:
  /// Constructor from a Vector or SubVector.
  /// SubVectors are not const-safe and it's very hard to make them
  /// so for now we just give up. This function contains const_cast.
  /// @param t       Vector whose data is aliased.
  /// @param origin  Index of the first aliased element.
  /// @param length  Number of aliased elements; origin + length <= t.Dim().
  SubVector(const VectorBase<Real> &t, const MatrixIndexT origin,
            const MatrixIndexT length) : VectorBase<Real>() {
    // The following assert is equivalent to origin >= 0 && length >= 0 &&
    // origin + length <= t.Dim(): the unsigned casts turn negative values
    // into very large ones, which then fail the comparison.
    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
                 static_cast<UnsignedMatrixIndexT>(length) <=
                 static_cast<UnsignedMatrixIndexT>(t.Dim()));
    VectorBase<Real>::data_ = const_cast<Real*> (t.Data()+origin);
    VectorBase<Real>::dim_ = length;
  }

  /// This constructor initializes the vector to point at the contents
  /// of this packed matrix (SpMatrix or TpMatrix).
  // SubVector(const PackedMatrix<Real> &M) {
  //VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
  //VectorBase<Real>::dim_ = (M.NumRows()*(M.NumRows()+1))/2;
  //}

  /// Copy constructor. Shallow: the copy aliases the same memory as `other`.
  SubVector(const SubVector &other) : VectorBase<Real> () {
    // this copy constructor needed for Range() to work in base class.
    VectorBase<Real>::data_ = other.data_;
    VectorBase<Real>::dim_ = other.dim_;
  }

  /// Constructor from a pointer to memory and a length. Keeps a pointer
  /// to the data but does not take ownership (will never delete).
  /// Caution: this constructor enables you to evade const constraints.
  SubVector(const Real *data, MatrixIndexT length) : VectorBase<Real> () {
    VectorBase<Real>::data_ = const_cast<Real*>(data);
    VectorBase<Real>::dim_ = length;
  }

  /// Aliases row `row` of `matrix`; the resulting dimension is
  /// matrix.NumCols().
  /// This operation does not preserve const-ness, so be careful.
  SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row)
      : VectorBase<Real>() {
    VectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
    VectorBase<Real>::dim_ = matrix.NumCols();
  }

  ~SubVector() {}  ///< Destructor (does nothing; no pointers are owned here).

 private:
  /// Disallow assignment operator.
  // Declared but deliberately not defined, matching SubMatrix::operator=.
  // The previous empty body `{}` flowed off the end of a function that
  // returns a reference, which is undefined behaviour if ever executed and
  // triggers -Wreturn-type. With no definition, misuse fails at compile time
  // (private) or at link time (from inside the class).
  SubVector & operator = (const SubVector &other);
};
/// @} end of "addtogroup matrix_group"
/// \addtogroup matrix_funcs_io
/// @{
/// Output to a C++ stream. Non-binary by default (use Write for
/// binary output).
template<typename Real>
std::ostream & operator << (std::ostream & out, const VectorBase<Real> & v);
/// Input from a C++ stream. Will automatically read text or
/// binary data from the stream.
template<typename Real>
std::istream & operator >> (std::istream & in, VectorBase<Real> & v);
/// Input from a C++ stream. Will automatically read text or
/// binary data from the stream.
template<typename Real>
std::istream & operator >> (std::istream & in, Vector<Real> & v);
/// @} end of \addtogroup matrix_funcs_io
/// \addtogroup matrix_funcs_scalar
/// @{
//template<typename Real>
//bool ApproxEqual(const VectorBase<Real> &a,
//const VectorBase<Real> &b, Real tol = 0.01) {
//return a.ApproxEqual(b, tol);
//}
//template<typename Real>
//inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
//float tol = 0.01) {
//KALDI_ASSERT(a.ApproxEqual(b, tol));
//}
} // namespace kaldi
// we need to include the implementation
#include "matrix/kaldi-vector-inl.h"
#endif // KALDI_MATRIX_KALDI_VECTOR_H_

@ -59,26 +59,7 @@ template<typename Real> class SubVector;
template<typename Real> class MatrixBase;
template<typename Real> class SubMatrix;
template<typename Real> class Matrix;
template<typename Real> class SpMatrix;
template<typename Real> class TpMatrix;
template<typename Real> class PackedMatrix;
template<typename Real> class SparseMatrix;
// these are classes that won't be defined in this
// directory; they're mostly needed for friend declarations.
template<typename Real> class CuMatrixBase;
template<typename Real> class CuSubMatrix;
template<typename Real> class CuMatrix;
template<typename Real> class CuVectorBase;
template<typename Real> class CuSubVector;
template<typename Real> class CuVector;
template<typename Real> class CuPackedMatrix;
template<typename Real> class CuSpMatrix;
template<typename Real> class CuTpMatrix;
template<typename Real> class CuSparseMatrix;
class CompressedMatrix;
class GeneralMatrix;
/// This class provides a way for switching between double and float types.
template<typename T> class OtherReal { }; // useful in reading+writing routines

@ -5,8 +5,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}
add_subdirectory(base)
add_subdirectory(util)
add_subdirectory(feat)
add_subdirectory(matrix)
add_subdirectory(lat)
add_subdirectory(fstext)
add_subdirectory(decoder)

@ -1,20 +0,0 @@
# Sources shared by every feature front end (wave I/O, windowing, resampling,
# mel filterbank computation, CMVN).
add_library(kaldi-feat-common
    wave-reader.cc
    signal.cc
    feature-functions.cc
    feature-window.cc
    resample.cc
    mel-computations.cc
    cmvn.cc
)
target_link_libraries(kaldi-feat-common
    PUBLIC
        kaldi-base
        kaldi-matrix
        kaldi-util
)

# MFCC front end, built on top of the shared feature code.
add_library(kaldi-mfcc feature-mfcc.cc)
target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)

# Filterbank (fbank) front end, built on top of the shared feature code.
add_library(kaldi-fbank feature-fbank.cc)
target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)

@ -1,183 +0,0 @@
// transform/cmvn.cc
// Copyright 2009-2013 Microsoft Corporation
// Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/cmvn.h"
namespace kaldi {
// Sizes the CMVN accumulator for `dim`-dimensional features: two rows (mean
// stats and variance stats) and dim + 1 columns, where the extra column of
// the first row holds the count. `dim` must be positive.
void InitCmvnStats(int32 dim, Matrix<double> *stats) {
  KALDI_ASSERT(dim > 0);
  const int32 num_cols = dim + 1;  // feature columns plus the count column
  stats->Resize(2, num_cols);
}
void AccCmvnStats(const VectorBase<BaseFloat> &feats, BaseFloat weight, MatrixBase<double> *stats) {
int32 dim = feats.Dim();
KALDI_ASSERT(stats != NULL);
KALDI_ASSERT(stats->NumRows() == 2 && stats->NumCols() == dim + 1);
// Remove these __restrict__ modifiers if they cause compilation problems.
// It's just an optimization.
double *__restrict__ mean_ptr = stats->RowData(0),
*__restrict__ var_ptr = stats->RowData(1),
*__restrict__ count_ptr = mean_ptr + dim;
const BaseFloat * __restrict__ feats_ptr = feats.Data();
*count_ptr += weight;
// Careful-- if we change the format of the matrix, the "mean_ptr < count_ptr"
// statement below might become wrong.
for (; mean_ptr < count_ptr; mean_ptr++, var_ptr++, feats_ptr++) {
*mean_ptr += *feats_ptr * weight;
*var_ptr += *feats_ptr * *feats_ptr * weight;
}
}
void AccCmvnStats(const MatrixBase<BaseFloat> &feats,
const VectorBase<BaseFloat> *weights,
MatrixBase<double> *stats) {
int32 num_frames = feats.NumRows();
if (weights != NULL) {
KALDI_ASSERT(weights->Dim() == num_frames);
}
for (int32 i = 0; i < num_frames; i++) {
SubVector<BaseFloat> this_frame = feats.Row(i);
BaseFloat weight = (weights == NULL ? 1.0 : (*weights)(i));
if (weight != 0.0)
AccCmvnStats(this_frame, weight, stats);
}
}
void ApplyCmvn(const MatrixBase<double> &stats,
bool var_norm,
MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(feats != NULL);
int32 dim = stats.NumCols() - 1;
if (stats.NumRows() > 2 || stats.NumRows() < 1 || feats->NumCols() != dim) {
KALDI_ERR << "Dim mismatch: cmvn "
<< stats.NumRows() << 'x' << stats.NumCols()
<< ", feats " << feats->NumRows() << 'x' << feats->NumCols();
}
if (stats.NumRows() == 1 && var_norm)
KALDI_ERR << "You requested variance normalization but no variance stats "
<< "are supplied.";
double count = stats(0, dim);
// Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
// computing an offset and representing it as stats, we use a count of one.
if (count < 1.0)
KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: "
<< "count = " << count;
if (!var_norm) {
Vector<BaseFloat> offset(dim);
SubVector<double> mean_stats(stats.RowData(0), dim);
offset.AddVec(-1.0 / count, mean_stats);
feats->AddVecToRows(1.0, offset);
return;
}
// norm(0, d) = mean offset;
// norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
Matrix<BaseFloat> norm(2, dim);
for (int32 d = 0; d < dim; d++) {
double mean, offset, scale;
mean = stats(0, d)/count;
double var = (stats(1, d)/count) - mean*mean,
floor = 1.0e-20;
if (var < floor) {
KALDI_WARN << "Flooring cepstral variance from " << var << " to "
<< floor;
var = floor;
}
scale = 1.0 / sqrt(var);
if (scale != scale || 1/scale == 0.0)
KALDI_ERR << "NaN or infinity in cepstral mean/variance computation";
offset = -(mean*scale);
norm(0, d) = offset;
norm(1, d) = scale;
}
// Apply the normalization.
feats->MulColsVec(norm.Row(1));
feats->AddVecToRows(1.0, norm.Row(0));
}
void ApplyCmvnReverse(const MatrixBase<double> &stats,
bool var_norm,
MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(feats != NULL);
int32 dim = stats.NumCols() - 1;
if (stats.NumRows() > 2 || stats.NumRows() < 1 || feats->NumCols() != dim) {
KALDI_ERR << "Dim mismatch: cmvn "
<< stats.NumRows() << 'x' << stats.NumCols()
<< ", feats " << feats->NumRows() << 'x' << feats->NumCols();
}
if (stats.NumRows() == 1 && var_norm)
KALDI_ERR << "You requested variance normalization but no variance stats "
<< "are supplied.";
double count = stats(0, dim);
// Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
// computing an offset and representing it as stats, we use a count of one.
if (count < 1.0)
KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: "
<< "count = " << count;
Matrix<BaseFloat> norm(2, dim); // norm(0, d) = mean offset
// norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
for (int32 d = 0; d < dim; d++) {
double mean, offset, scale;
mean = stats(0, d) / count;
if (!var_norm) {
scale = 1.0;
offset = mean;
} else {
double var = (stats(1, d)/count) - mean*mean,
floor = 1.0e-20;
if (var < floor) {
KALDI_WARN << "Flooring cepstral variance from " << var << " to "
<< floor;
var = floor;
}
// we aim to transform zero-mean, unit-variance input into data
// with the given mean and variance.
scale = sqrt(var);
offset = mean;
}
norm(0, d) = offset;
norm(1, d) = scale;
}
if (var_norm)
feats->MulColsVec(norm.Row(1));
feats->AddVecToRows(1.0, norm.Row(0));
}
void FakeStatsForSomeDims(const std::vector<int32> &dims,
MatrixBase<double> *stats) {
KALDI_ASSERT(stats->NumRows() == 2 && stats->NumCols() > 1);
int32 dim = stats->NumCols() - 1;
double count = (*stats)(0, dim);
for (size_t i = 0; i < dims.size(); i++) {
int32 d = dims[i];
KALDI_ASSERT(d >= 0 && d < dim);
(*stats)(0, d) = 0.0;
(*stats)(1, d) = count;
}
}
} // namespace kaldi

@ -1,75 +0,0 @@
// transform/cmvn.h
// Copyright 2009-2013 Microsoft Corporation
// Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_TRANSFORM_CMVN_H_
#define KALDI_TRANSFORM_CMVN_H_
#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
namespace kaldi {
// Routines for accumulating and applying cepstral mean and variance
// normalization (CMVN) statistics.  The statistics are stored in a matrix with
// (dim+1) columns: the first "dim" columns hold per-dimension stats and the
// last element of the first row holds the count.
/// This function initializes the matrix to dimension 2 by (dim+1);
/// 1st "dim" elements of 1st row are mean stats, 1st "dim" elements
/// of 2nd row are var stats, last element of 1st row is count,
/// last element of 2nd row is zero.
void InitCmvnStats(int32 dim, Matrix<double> *stats);
/// Accumulation from a single frame (weighted).
void AccCmvnStats(const VectorBase<BaseFloat> &feat,
BaseFloat weight,
MatrixBase<double> *stats);
/// Accumulation from a whole matrix of features (possibly weighted -- useful
/// in excluding silence).  "weights" may be NULL; presumably every frame then
/// counts equally -- confirm against the implementation.
void AccCmvnStats(const MatrixBase<BaseFloat> &feats,
const VectorBase<BaseFloat> *weights, // or NULL
MatrixBase<double> *stats);
/// Apply cepstral mean and variance normalization to a matrix of features.
/// If norm_vars == true, expects stats to be of dimension 2 by (dim+1), but
/// if norm_vars == false, will accept stats of dimension 1 by (dim+1); these
/// are produced by the balanced-cmvn code when it computes an offset and
/// represents it as "fake stats".
void ApplyCmvn(const MatrixBase<double> &stats,
bool norm_vars,
MatrixBase<BaseFloat> *feats);
/// This is as ApplyCmvn, but does so in the reverse sense, i.e. applies a transform
/// that would take zero-mean, unit-variance input and turn it into output with the
/// stats of "stats". This can be useful if you trained without CMVN but later want
/// to correct a mismatch, so you would first apply CMVN and then do the "reverse"
/// CMVN with the summed stats of your training data.
void ApplyCmvnReverse(const MatrixBase<double> &stats,
bool norm_vars,
MatrixBase<BaseFloat> *feats);
/// Modify the stats so that for some dimensions (specified in "dims"), we
/// replace them with "fake" stats that have zero mean and unit variance; this
/// is done to disable CMVN for those dimensions.
/// Concretely, for each listed dimension d the first-row entry is set to zero
/// and the second-row entry is set to the count, so the implied mean is 0 and
/// the implied variance is count/count - 0 = 1.  "stats" must have exactly two
/// rows and more than one column, and every entry of "dims" must lie in
/// [0, dim), where dim is the number of columns minus one.
void FakeStatsForSomeDims(const std::vector<int32> &dims,
MatrixBase<double> *stats);
} // namespace kaldi
#endif // KALDI_TRANSFORM_CMVN_H_

@ -1,99 +0,0 @@
// feat/feature-common-inl.h
// Copyright 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_COMMON_INL_H_
#define KALDI_FEAT_FEATURE_COMMON_INL_H_
#include "feat/resample.h"
// Do not include this file directly. It is included by feat/feature-common.h
namespace kaldi {
// Computes features for a waveform sampled at "sample_freq".  When that rate
// matches the rate in the frame options the waveform is used directly;
// otherwise it is resampled first, which is an error unless the matching
// allow_downsample / allow_upsample option is enabled.
template <class F>
void OfflineFeatureTpl<F>::ComputeFeatures(
    const VectorBase<BaseFloat> &wave,
    BaseFloat sample_freq,
    BaseFloat vtln_warp,
    Matrix<BaseFloat> *output) {
  KALDI_ASSERT(output != NULL);
  const BaseFloat target_freq = computer_.GetFrameOptions().samp_freq;

  // Fast path: rates already agree, no resampling required.
  if (sample_freq == target_freq) {
    Compute(wave, vtln_warp, output);
    return;
  }

  const bool needs_downsample = target_freq < sample_freq;
  const bool needs_upsample = target_freq > sample_freq;
  if (needs_downsample && !computer_.GetFrameOptions().allow_downsample) {
    KALDI_ERR << "Waveform and config sample Frequency mismatch: "
              << sample_freq << " .vs " << target_freq
              << " (use --allow-downsample=true to allow "
              << " downsampling the waveform).";
  } else if (needs_upsample && !computer_.GetFrameOptions().allow_upsample) {
    KALDI_ERR << "Waveform and config sample Frequency mismatch: "
              << sample_freq << " .vs " << target_freq
              << " (use --allow-upsample=true option to allow "
              << " upsampling the waveform).";
  }

  // Bring the waveform to the configured rate, then extract features from
  // the resampled copy.
  Vector<BaseFloat> converted_wave(wave);
  ResampleWaveform(sample_freq, wave, target_freq, &converted_wave);
  Compute(converted_wave, vtln_warp, output);
}
// Computes one feature row per frame of "wave".  The caller is responsible
// for the waveform already being at the sampling rate given in the frame
// options.  "output" is resized to (number of frames) x (feature dimension),
// or to 0 x 0 when the waveform yields no frames.
template <class F>
void OfflineFeatureTpl<F>::Compute(
    const VectorBase<BaseFloat> &wave,
    BaseFloat vtln_warp,
    Matrix<BaseFloat> *output) {
  KALDI_ASSERT(output != NULL);
  const int32 num_frames = NumFrames(wave.Dim(), computer_.GetFrameOptions());
  const int32 feat_dim = computer_.Dim();
  if (num_frames == 0) {
    output->Resize(0, 0);
    return;
  }
  output->Resize(num_frames, feat_dim);

  // Scratch buffer that receives the windowed samples of each frame.
  Vector<BaseFloat> frame_window;
  const bool want_raw_energy = computer_.NeedRawLogEnergy();
  for (int32 frame = 0; frame < num_frames; frame++) {
    BaseFloat raw_log_energy = 0.0;
    // Only request the raw log-energy when the computer will look at it.
    BaseFloat *raw_log_energy_out = (want_raw_energy ? &raw_log_energy : NULL);
    ExtractWindow(0, wave, frame, computer_.GetFrameOptions(),
                  feature_window_function_, &frame_window,
                  raw_log_energy_out);
    SubVector<BaseFloat> feature_row(*output, frame);
    computer_.Compute(raw_log_energy, vtln_warp, &frame_window, &feature_row);
  }
}
// Const overload of Compute().  The non-const overload needs a mutable
// object, so this one works on a throw-away copy of *this; that keeps the
// original untouched (handy in multi-threaded callers) at the price of the
// copy.
template <class F>
void OfflineFeatureTpl<F>::Compute(
    const VectorBase<BaseFloat> &wave,
    BaseFloat vtln_warp,
    Matrix<BaseFloat> *output) const {
  OfflineFeatureTpl<F> scratch_copy(*this);
  scratch_copy.Compute(wave, vtln_warp, output);
}
} // end namespace kaldi
#endif

@ -1,176 +0,0 @@
// feat/feature-common.h
// Copyright 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_COMMON_H_
#define KALDI_FEAT_FEATURE_COMMON_H_
#include <map>
#include <string>
#include "feat/feature-window.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// This class is only added for documentation, it is not intended to ever be
/// used.  It is the options type of ExampleFeatureComputer, whose
/// GetFrameOptions() returns the frame_opts member below.
struct ExampleFeatureComputerOptions {
FrameExtractionOptions frame_opts;
// .. more would go here.
};
/// This class is only added for documentation, it is not intended to ever be
/// used. It documents the interface of the *Computer classes which wrap the
/// low-level feature extraction. The template argument F of OfflineFeatureTpl must
/// follow this interface. This interface is intended for features such as
/// MFCCs and PLPs which can be computed frame by frame.
class ExampleFeatureComputer {
public:
/// OfflineFeatureTpl<F> looks up F::Options to find the options type.
typedef ExampleFeatureComputerOptions Options;
/// Returns a reference to the frame-extraction options class, which
/// will be part of our own options class.
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
/// Returns the feature dimension
int32 Dim() const;
/// Returns true if this function may inspect the raw log-energy of the signal
/// (before windowing and pre-emphasis); it's safe to always return true, but
/// setting it to false enables an optimization.
bool NeedRawLogEnergy() const { return true; }
/// constructor from options class; it should not store a reference or pointer
/// to the options class but should copy it.
explicit ExampleFeatureComputer(const ExampleFeatureComputerOptions &opts):
opts_(opts) { }
/// Copy constructor; all of these classes must have one.
ExampleFeatureComputer(const ExampleFeatureComputer &other);
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
private:
// disallow assignment.
ExampleFeatureComputer &operator = (const ExampleFeatureComputer &in);
// Private copy of the options passed to the constructor.
Options opts_;
};
/// This templated class is intended for offline feature extraction, i.e. where
/// you have access to the entire signal at the start. It exists mainly to be
/// drop-in replacement for the old (pre-2016) classes Mfcc, Plp and so on, for
/// use in the offline case. In April 2016 we reorganized the online
/// feature-computation code for greater modularity and to have correct support
/// for the snip-edges=false option.
template <class F>
class OfflineFeatureTpl {
public:
typedef typename F::Options Options;
// Note: feature_window_function_ is the windowing function, which is
// initialized using the options class, and which we cache at this level.
OfflineFeatureTpl(const Options &opts):
computer_(opts),
feature_window_function_(computer_.GetFrameOptions()) { }
// Internal (and back-compatibility) interface for computing features, which
// requires that the user has already checked that the sampling frequency
// of the waveform is equal to the sampling frequency specified in
// the frame-extraction options.
// "output" is resized to one row per frame; it becomes 0 x 0 if the
// waveform yields no frames.
void Compute(const VectorBase<BaseFloat> &wave,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output);
// This const version of Compute() is a wrapper that
// calls the non-const version on a temporary object.
// It's less efficient than the non-const version.
void Compute(const VectorBase<BaseFloat> &wave,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output) const;
/**
Computes the features for one file (one sequence of features).
This is the newer interface where you specify the sample frequency
of the input waveform.
@param [in] wave The input waveform
@param [in] sample_freq The sampling frequency with which
'wave' was sampled.
If sample_freq differs from the frequency
specified in the config, the waveform is
resampled to the configured frequency.
Downsampling is an error unless the frame
options set allow_downsample, and upsampling
is an error unless they set allow_upsample.
@param [in] vtln_warp The VTLN warping factor (will normally
be 1.0)
@param [out] output The matrix of features, where the row-index
is the frame index.
*/
void ComputeFeatures(const VectorBase<BaseFloat> &wave,
BaseFloat sample_freq,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output);
// Feature dimension, as reported by the wrapped computer.
int32 Dim() const { return computer_.Dim(); }
// Copy constructor.
OfflineFeatureTpl(const OfflineFeatureTpl<F> &other):
computer_(other.computer_),
feature_window_function_(other.feature_window_function_) { }
private:
// Disallow assignment.
OfflineFeatureTpl<F> &operator =(const OfflineFeatureTpl<F> &other);
// The per-frame feature computer (see ExampleFeatureComputer for the
// interface it has to provide).
F computer_;
// Window function built once from the computer's frame options.
FeatureWindowFunction feature_window_function_;
};
/// @} End of "addtogroup feat"
} // namespace kaldi
#include "feat/feature-common-inl.h"
#endif // KALDI_FEAT_FEATURE_COMMON_H_

@ -1,125 +0,0 @@
// feat/feature-fbank.cc
// Copyright 2009-2012 Karel Vesely
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-fbank.h"
namespace kaldi {
// Builds a filterbank computer from "opts", which is copied into opts_.
// Allocates a split-radix FFT object when the padded window size is a power
// of two (srfft_ stays NULL otherwise) and pre-creates the mel filterbank for
// VTLN warp factor 1.0.
FbankComputer::FbankComputer(const FbankOptions &opts):
    opts_(opts), log_energy_floor_(0.0), srfft_(NULL) {
  // log_energy_floor_ is only consulted by Compute() when energy_floor > 0,
  // but the copy constructor reads it unconditionally, so it always needs a
  // defined value; hence the explicit initialization above.
  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);
  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0);
}
// Copy constructor.  Performs a deep copy: every cached MelBanks object and
// the FFT object (if any) is cloned, so the two computers own disjoint
// resources.
FbankComputer::FbankComputer(const FbankComputer &other):
    opts_(other.opts_), log_energy_floor_(other.log_energy_floor_),
    mel_banks_(other.mel_banks_), srfft_(NULL) {
  // At this point mel_banks_ still points at other's MelBanks objects;
  // replace each pointer with a privately owned clone.
  std::map<BaseFloat, MelBanks*>::iterator entry = mel_banks_.begin();
  while (entry != mel_banks_.end()) {
    MelBanks *shared_banks = entry->second;
    entry->second = new MelBanks(*shared_banks);
    ++entry;
  }
  if (other.srfft_ != NULL)
    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
}
// Releases every cached MelBanks object and the FFT object.
FbankComputer::~FbankComputer() {
  std::map<BaseFloat, MelBanks*>::iterator entry = mel_banks_.begin();
  while (entry != mel_banks_.end()) {
    delete entry->second;
    ++entry;
  }
  delete srfft_;  // deleting NULL is a no-op
}
// Returns the mel filterbank for the given VTLN warp factor, creating and
// caching it on first request.  The returned object stays owned by this
// computer.
const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) {
  std::map<BaseFloat, MelBanks*>::iterator cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end())
    return cached->second;

  // First request for this warp factor: build the filterbank and remember it.
  MelBanks *fresh_banks = new MelBanks(opts_.mel_opts,
                                       opts_.frame_opts,
                                       vtln_warp);
  mel_banks_[vtln_warp] = fresh_banks;
  return fresh_banks;
}
// Computes one frame of mel-filterbank features into "feature".
// "signal_frame" doubles as scratch space: it is overwritten first by the FFT
// and then by the power spectrum, so its contents are not preserved.
// "signal_raw_log_energy" is only used when opts_.use_energy is set.
void FbankComputer::Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature) {
// Fetches the filterbank for this warp factor (created on first use).
const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
feature->Dim() == this->Dim());
// Compute energy after window function (not the raw one).
// This has to happen before the FFT below overwrites the windowed samples.
if (opts_.use_energy && !opts_.raw_energy)
signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
std::numeric_limits<float>::epsilon()));
if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
srfft_->Compute(signal_frame->Data(), true);
else // An alternative algorithm that works for non-powers-of-two.
RealFft(signal_frame, true);
// Convert the FFT into a power spectrum.
ComputePowerSpectrum(signal_frame);
// Only the first Dim()/2 + 1 elements hold the power spectrum.
SubVector<BaseFloat> power_spectrum(*signal_frame, 0,
signal_frame->Dim() / 2 + 1);
// Use magnitude instead of power if requested.
if (!opts_.use_power)
power_spectrum.ApplyPow(0.5);
// When the energy goes first (use_energy without htk_compat) the mel
// energies start at index 1; otherwise they start at index 0.
int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
SubVector<BaseFloat> mel_energies(*feature,
mel_offset,
opts_.mel_opts.num_bins);
// Sum with mel filterbanks over the power spectrum
mel_banks.Compute(power_spectrum, &mel_energies);
if (opts_.use_log_fbank) {
// Avoid log of zero (which should be prevented anyway by dithering).
mel_energies.ApplyFloor(std::numeric_limits<float>::epsilon());
mel_energies.ApplyLog(); // take the log.
}
// Copy energy as first value (or the last, if htk_compat == true).
if (opts_.use_energy) {
// The floor is applied only when a positive energy_floor was configured.
if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
signal_raw_log_energy = log_energy_floor_;
}
int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
(*feature)(energy_index) = signal_raw_log_energy;
}
}
} // namespace kaldi

@ -1,149 +0,0 @@
// feat/feature-fbank.h
// Copyright 2009-2012 Karel Vesely
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_FBANK_H_
#define KALDI_FEAT_FEATURE_FBANK_H_
#include <map>
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// FbankOptions contains basic options for computing filterbank features.
/// It only includes things that can be done in a "stateless" way, i.e.
/// it does not include energy max-normalization.
/// It does not include delta computation.
struct FbankOptions {
FrameExtractionOptions frame_opts; // framing / windowing options
MelBanksOptions mel_opts; // mel filterbank options (number of bins etc.)
bool use_energy; // append an extra dimension with energy to the filter banks
BaseFloat energy_floor; // absolute floor on the energy; only applied when > 0
bool raw_energy; // If true, compute energy before preemphasis and windowing
bool htk_compat; // If true, put energy last (if using energy)
bool use_log_fbank; // if true (default), produce log-filterbank, else linear
bool use_power; // if true (default), use power in filterbank analysis, else magnitude.
// Default constructor; frame_opts keeps its own defaults.
FbankOptions(): mel_opts(23),
// defaults the #mel-banks to 23 for the FBANK computations.
// this seems to be common for 16khz-sampled data,
// but for 8khz-sampled data, 15 may be better.
use_energy(false),
energy_floor(0.0),
raw_energy(true),
htk_compat(false),
use_log_fbank(true),
use_power(true) {}
// Registers the frame and mel options plus every option of this struct
// with "opts", under the command-line names given below.
void Register(OptionsItf *opts) {
frame_opts.Register(opts);
mel_opts.Register(opts);
opts->Register("use-energy", &use_energy,
"Add an extra dimension with energy to the FBANK output.");
opts->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in FBANK computation. "
"Only makes a difference if --use-energy=true; only necessary if "
"--dither=0.0. Suggested values: 0.1 or 1.0");
opts->Register("raw-energy", &raw_energy,
"If true, compute energy before preemphasis and windowing");
opts->Register("htk-compat", &htk_compat, "If true, put energy last. "
"Warning: not sufficient to get HTK compatible features (need "
"to change other parameters).");
opts->Register("use-log-fbank", &use_log_fbank,
"If true, produce log-filterbank, else produce linear.");
opts->Register("use-power", &use_power,
"If true, use power, else use magnitude.");
}
};
/// Class for computing mel-filterbank features; see \ref feat_mfcc for more
/// information.
class FbankComputer {
public:
typedef FbankOptions Options;
/// Copies "opts"; see the note on ExampleFeatureComputer's constructor.
explicit FbankComputer(const FbankOptions &opts);
/// Deep copy: the cached MelBanks objects and the FFT object are cloned.
FbankComputer(const FbankComputer &other);
/// Feature dimension: the number of mel bins, plus one if use_energy is set.
int32 Dim() const {
return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
}
/// The raw log-energy is only needed when the energy is both appended to
/// the features and taken before pre-emphasis and windowing.
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
/// Frees the cached MelBanks objects and the FFT object.
~FbankComputer();
/// Returns the filterbank for the given warp factor, creating and caching
/// it on first use.  The returned object remains owned by this computer.
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
private:
FbankOptions opts_;
// Log of opts_.energy_floor; only meaningful when energy_floor > 0.
BaseFloat log_energy_floor_;
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
// Split-radix FFT, allocated only when the padded window size is a power
// of two; NULL otherwise.
SplitRadixRealFft<BaseFloat> *srfft_;
// Disallow assignment.
FbankComputer &operator =(const FbankComputer &other);
};
typedef OfflineFeatureTpl<FbankComputer> Fbank;
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_FBANK_H_

@ -1,362 +0,0 @@
// feat/feature-functions.cc
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
// 2013 Johns Hopkins University (author: Daniel Povey)
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-functions.h"
#include "matrix/matrix-functions.h"
namespace kaldi {
// Converts, in place, the packed output of a real FFT into a power spectrum.
// On entry "waveform" holds the first half of the complex spectrum stored as
// [real0, realN/2, real1, im1, real2, im2, ...].  On exit elements
// 0 .. dim/2 hold the squared magnitude of the corresponding bins; the
// elements after that are left with whatever they contained.
// The caller must already have applied the FFT; the length does not have to
// be a power of two.
void ComputePowerSpectrum(VectorBase<BaseFloat> *waveform) {
  int32 dim = waveform->Dim();
  // Elements 0 and 1 are read unconditionally below, so a shorter vector
  // cannot hold a packed spectrum and would be accessed out of range.
  KALDI_ASSERT(dim >= 2);
  int32 half_dim = dim/2;
  // The first two slots are purely real (bin 0 and bin N/2); save their
  // energies before the loop starts overwriting the low indices.
  BaseFloat first_energy = (*waveform)(0) * (*waveform)(0),
      last_energy = (*waveform)(1) * (*waveform)(1);  // handle this special case
  // Writing slot i while reading slots 2i and 2i+1 is safe in place because
  // the reads always run ahead of the writes for i >= 1.
  for (int32 i = 1; i < half_dim; i++) {
    BaseFloat real = (*waveform)(i*2), im = (*waveform)(i*2 + 1);
    (*waveform)(i) = real*real + im*im;
  }
  (*waveform)(0) = first_energy;
  // Energy of the N/2 bin; if the signal has been bandlimited sensibly this
  // should be close to zero.
  (*waveform)(half_dim) = last_energy;
}
// Precomputes one filter (vector of scales) per delta order, from order 0 up
// to opts.order.  The order-n filter is the order-(n-1) filter convolved with
// the ramp [-window .. window], divided by the sum of squares of that ramp.
DeltaFeatures::DeltaFeatures(const DeltaFeaturesOptions &opts): opts_(opts) {
  // Loose sanity bounds that mainly catch uninitialized options; the order is
  // normally 2 or 3 and the window normally 2.
  KALDI_ASSERT(opts.order >= 0 && opts.order < 1000);
  KALDI_ASSERT(opts.window > 0 && opts.window < 1000);

  scales_.resize(opts.order+1);
  // Order 0 is the identity filter (the baseline features themselves).
  scales_[0].Resize(1);
  scales_[0](0) = 1.0;

  for (int32 order = 1; order <= opts.order; order++) {
    Vector<BaseFloat> &lower = scales_[order-1];
    Vector<BaseFloat> &current = scales_[order];
    // Read per order so that a per-order window could be introduced later.
    // The delta window actually spans 2*half_width + 1 frames.
    const int32 half_width = opts.window;
    KALDI_ASSERT(half_width != 0);
    const int32 lower_half = (static_cast<int32>(lower.Dim()-1))/2;
    current.Resize(lower.Dim() + 2*half_width);  // also zeros it.

    BaseFloat ramp_sum_sq = 0.0;
    for (int32 j = -half_width; j <= half_width; j++) {
      ramp_sum_sq += j*j;
      // "tap" indexes the lower-order filter from its first element; the
      // matching output position is shifted by the ramp offset j and by
      // half_width so that it starts at zero.
      for (int32 tap = 0; tap <= 2*lower_half; tap++) {
        current(j + tap + half_width) +=
            static_cast<BaseFloat>(j) * lower(tap);
      }
    }
    current.Scale(1.0 / ramp_sum_sq);
  }
}
void DeltaFeatures::Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
VectorBase<BaseFloat> *output_frame) const {
KALDI_ASSERT(frame < input_feats.NumRows());
int32 num_frames = input_feats.NumRows(),
feat_dim = input_feats.NumCols();
KALDI_ASSERT(static_cast<int32>(output_frame->Dim()) == feat_dim * (opts_.order+1));
output_frame->SetZero();
for (int32 i = 0; i <= opts_.order; i++) {
const Vector<BaseFloat> &scales = scales_[i];
int32 max_offset = (scales.Dim() - 1) / 2;
SubVector<BaseFloat> output(*output_frame, i*feat_dim, feat_dim);
for (int32 j = -max_offset; j <= max_offset; j++) {
// if asked to read
int32 offset_frame = frame + j;
if (offset_frame < 0) offset_frame = 0;
else if (offset_frame >= num_frames)
offset_frame = num_frames - 1;
BaseFloat scale = scales(j + max_offset);
if (scale != 0.0)
output.AddVec(scale, input_feats.Row(offset_frame));
}
}
}
// Precomputes the first-order delta filter: the ramp [-window .. window]
// divided by the sum of its squared entries.
ShiftedDeltaFeatures::ShiftedDeltaFeatures(
    const ShiftedDeltaFeaturesOptions &opts): opts_(opts) {
  KALDI_ASSERT(opts.window > 0 && opts.window < 1000);
  // Default window is 1.
  const int32 half_width = opts.window;
  KALDI_ASSERT(half_width != 0);
  scales_.Resize(1 + 2*half_width);  // also zeros it.

  BaseFloat ramp_sum_sq = 0.0;
  for (int32 tap = 0; tap <= 2*half_width; tap++) {
    const int32 j = tap - half_width;  // signed offset from the centre
    ramp_sum_sq += j*j;
    scales_(tap) += static_cast<BaseFloat>(j);
  }
  scales_.Scale(1.0 / ramp_sum_sq);
}
void ShiftedDeltaFeatures::Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
SubVector<BaseFloat> *output_frame) const {
KALDI_ASSERT(frame < input_feats.NumRows());
int32 num_frames = input_feats.NumRows(),
feat_dim = input_feats.NumCols();
KALDI_ASSERT(static_cast<int32>(output_frame->Dim())
== feat_dim * (opts_.num_blocks + 1));
output_frame->SetZero();
// The original features
SubVector<BaseFloat> output(*output_frame, 0, feat_dim);
output.AddVec(1.0, input_feats.Row(frame));
// Concatenate the delta-blocks. Each block is block_shift
// (usually 3) frames apart.
for (int32 i = 0; i < opts_.num_blocks; i++) {
int32 max_offset = (scales_.Dim() - 1) / 2;
SubVector<BaseFloat> output(*output_frame, (i + 1) * feat_dim, feat_dim);
for (int32 j = -max_offset; j <= max_offset; j++) {
int32 offset_frame = frame + j + i * opts_.block_shift;
if (offset_frame < 0) offset_frame = 0;
else if (offset_frame >= num_frames)
offset_frame = num_frames - 1;
BaseFloat scale = scales_(j + max_offset);
if (scale != 0.0)
output.AddVec(scale, input_feats.Row(offset_frame));
}
}
}
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features) {
output_features->Resize(input_features.NumRows(),
input_features.NumCols()
*(delta_opts.order + 1));
DeltaFeatures delta(delta_opts);
for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
SubVector<BaseFloat> row(*output_features, r);
delta.Process(input_features, r, &row);
}
}
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features) {
output_features->Resize(input_features.NumRows(),
input_features.NumCols()
* (delta_opts.num_blocks + 1));
ShiftedDeltaFeatures delta(delta_opts);
for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
SubVector<BaseFloat> row(*output_features, r);
delta.Process(input_features, r, &row);
}
}
// Fills "mat_out" (resized to n_bases x dimension) with cosine bases:
// entry (i, j) is a scaled cos(pi * i * j / (dimension - 1)), where the first
// and last columns carry the weight "scale" and the interior columns carry
// twice that weight.
void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out) {
  const BaseFloat last_index = static_cast<BaseFloat>(dimension - 1);
  BaseFloat angle = M_PI / last_index;
  BaseFloat scale = 1.0f / (2.0 * last_index);
  mat_out->Resize(n_bases, dimension);
  for (int32 row = 0; row < n_bases; row++) {
    const BaseFloat row_fl = static_cast<BaseFloat>(row);
    // First column.
    (*mat_out)(row, 0) = 1.0 * scale;
    // Interior columns.
    for (int32 col = 1; col < dimension - 1; col++) {
      const BaseFloat col_fl = static_cast<BaseFloat>(col);
      (*mat_out)(row, col) = 2.0 * scale * cos(angle * row_fl * col_fl);
    }
    // Last column.
    (*mat_out)(row, dimension - 1)
        = scale * cos(angle * row_fl * last_index);
  }
}
void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
int32 left_context,
int32 right_context,
Matrix<BaseFloat> *output_features) {
int32 T = input_features.NumRows(), D = input_features.NumCols();
if (T == 0 || D == 0)
KALDI_ERR << "SpliceFrames: empty input";
KALDI_ASSERT(left_context >= 0 && right_context >= 0);
int32 N = 1 + left_context + right_context;
output_features->Resize(T, D*N);
for (int32 t = 0; t < T; t++) {
SubVector<BaseFloat> dst_row(*output_features, t);
for (int32 j = 0; j < N; j++) {
int32 t2 = t + j - left_context;
if (t2 < 0) t2 = 0;
if (t2 >= T) t2 = T-1;
SubVector<BaseFloat> dst(dst_row, j*D, D),
src(input_features, t2);
dst.CopyFromVec(src);
}
}
}
void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features) {
int32 T = input_features.NumRows(), D = input_features.NumCols();
if (T == 0 || D == 0)
KALDI_ERR << "ReverseFrames: empty input";
output_features->Resize(T, D);
for (int32 t = 0; t < T; t++) {
SubVector<BaseFloat> dst_row(*output_features, t);
SubVector<BaseFloat> src_row(input_features, T-1-t);
dst_row.CopyFromVec(src_row);
}
}
// Validates the option values: cmn_window must be positive, and when
// "center" is set min_window must lie in (0, cmn_window].
// NOTE(review): min_window is only range-checked when center is true, yet
// SlidingWindowCmnInternal reads min_window in its non-centered branch --
// confirm which case this check is meant to guard.
void SlidingWindowCmnOptions::Check() const {
  KALDI_ASSERT(cmn_window > 0);
  if (!center)
    return;  // min_window is not validated in this case.
  KALDI_ASSERT(min_window > 0 && min_window <= cmn_window);
}
// Internal version of SlidingWindowCmn with double-precision arguments.
// For every frame t it chooses a window of frames [window_start, window_end),
// maintains running sums over that window, and writes input(t) minus the
// window mean to output(t).  When opts.normalize_variance is set the result
// is additionally multiplied by the inverse standard deviation of the window.
// NOTE(review): "output" is indexed by row here but never resized, so it is
// presumably expected to have the same dimensions as "input" already.
void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts,
const MatrixBase<double> &input,
MatrixBase<double> *output) {
opts.Check();
int32 num_frames = input.NumRows(), dim = input.NumCols(),
last_window_start = -1, last_window_end = -1,
warning_count = 0;
// Running sum and running sum of squares over the current window.
Vector<double> cur_sum(dim), cur_sumsq(dim);
for (int32 t = 0; t < num_frames; t++) {
int32 window_start, window_end; // note: window_end will be one
// past the end of the window we use for normalization.
if (opts.center) {
// Window of cmn_window frames placed around t.
window_start = t - (opts.cmn_window / 2);
window_end = window_start + opts.cmn_window;
} else {
// Window that ends at (and includes) frame t.
window_start = t - opts.cmn_window;
window_end = t + 1;
}
if (window_start < 0) { // shift window right if starts <0.
window_end -= window_start;
window_start = 0; // or: window_start -= window_start
}
if (!opts.center) {
// In the non-centered case the end is reset to cover frame t and at
// least the first min_window frames.
if (window_end > t)
window_end = std::max(t + 1, opts.min_window);
}
if (window_end > num_frames) {
// Shift the window left so that it ends at the last frame, without
// letting it start before frame 0.
window_start -= (window_end - num_frames);
window_end = num_frames;
if (window_start < 0) window_start = 0;
}
if (last_window_start == -1) {
// First frame: accumulate the sums over the whole window from scratch.
SubMatrix<double> input_part(input,
window_start, window_end - window_start,
0, dim);
cur_sum.AddRowSumMat(1.0, input_part , 0.0);
// Presumably the per-dimension sum of squares over the window's rows.
if (opts.normalize_variance)
cur_sumsq.AddDiagMat2(1.0, input_part, kTrans, 0.0);
} else {
// Later frames: update the sums incrementally.  The asserts check that
// each window boundary advanced by at most one frame.
if (window_start > last_window_start) {
KALDI_ASSERT(window_start == last_window_start + 1);
SubVector<double> frame_to_remove(input, last_window_start);
cur_sum.AddVec(-1.0, frame_to_remove);
if (opts.normalize_variance)
cur_sumsq.AddVec2(-1.0, frame_to_remove);
}
if (window_end > last_window_end) {
KALDI_ASSERT(window_end == last_window_end + 1);
SubVector<double> frame_to_add(input, last_window_end);
cur_sum.AddVec(1.0, frame_to_add);
if (opts.normalize_variance)
cur_sumsq.AddVec2(1.0, frame_to_add);
}
}
int32 window_frames = window_end - window_start;
last_window_start = window_start;
last_window_end = window_end;
KALDI_ASSERT(window_frames > 0);
SubVector<double> input_frame(input, t),
output_frame(*output, t);
// Mean normalization: output(t) = input(t) - cur_sum / window_frames.
output_frame.CopyFromVec(input_frame);
output_frame.AddVec(-1.0 / window_frames, cur_sum);
if (opts.normalize_variance) {
if (window_frames == 1) {
// With a single frame in the window the output frame is set to zero.
output_frame.Set(0.0);
} else {
// variance = E[x^2] - (E[x])^2, computed per dimension.
Vector<double> variance(cur_sumsq);
variance.Scale(1.0 / window_frames);
variance.AddVec2(-1.0 / (window_frames * window_frames), cur_sum);
// now "variance" is the variance of the features in the window,
// around their own mean.
int32 num_floored;
variance.ApplyFloor(1.0e-10, &num_floored);
if (num_floored > 0 && num_frames > 1) {
// Exactly one "suppressing" message is printed, at the moment the
// warning budget is used up.
if (opts.max_warnings == warning_count) {
KALDI_WARN << "Suppressing the remaining variance flooring "
<< "warnings. Run program with --max-warnings=-1 to "
<< "see all warnings.";
}
// If opts.max_warnings is a negative number, we won't restrict the
// number of times that the warning is printed out.
else if (opts.max_warnings < 0
|| opts.max_warnings > warning_count) {
KALDI_WARN << "Flooring when normalizing variance, floored "
<< num_floored << " elements; num-frames was "
<< window_frames;
}
warning_count++;
}
variance.ApplyPow(-0.5); // get inverse standard deviation.
output_frame.MulElements(variance);
}
}
}
}
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts,
const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output) {
KALDI_ASSERT(SameDim(input, *output) && input.NumRows() > 0);
Matrix<double> input_dbl(input), output_dbl(input.NumRows(), input.NumCols());
// call double-precision version
SlidingWindowCmnInternal(opts, input_dbl, &output_dbl);
output->CopyFromMat(output_dbl);
}
} // namespace kaldi

@ -1,204 +0,0 @@
// feat/feature-functions.h
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_FUNCTIONS_H_
#define KALDI_FEAT_FEATURE_FUNCTIONS_H_
#include <string>
#include <vector>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
// ComputePowerSpectrum takes a complex FFT (as produced by the FFT
// functions in matrix/matrix-functions.h) and converts it, in place, into
// a power spectrum. If the complex FFT is a vector of size n (representing
// half the complex FFT of a real signal of size n, as described there),
// this function computes in the first (n/2) + 1 elements of it, the
// energies of the fft bins from zero to the Nyquist frequency. Contents of the
// remaining (n/2) - 1 elements are undefined at output, so callers should
// only read the first (n/2) + 1 elements afterwards.
void ComputePowerSpectrum(VectorBase<BaseFloat> *complex_fft);
// Options for delta-feature computation (see class DeltaFeatures).
// Both fields default to 2 and can be set from the command line through
// Register() as --delta-order and --delta-window.
struct DeltaFeaturesOptions {
int32 order;  // highest delta order computed; default 2.
int32 window; // e.g. 2; controls window size (window size is 2*window + 1)
// the behavior at the edges is to replicate the first or last frame.
// this is not configurable.
DeltaFeaturesOptions(int32 order = 2, int32 window = 2):
order(order), window(window) { }
// Registers both fields with the option parser 'opts'.
void Register(OptionsItf *opts) {
opts->Register("delta-order", &order, "Order of delta computation");
opts->Register("delta-window", &window,
"Parameter controlling window for delta computation (actual window"
" size for each delta order is 1 + 2*delta-window-size)");
}
};
// Stateless, frame-at-a-time delta-feature computation.
class DeltaFeatures {
public:
// This class provides a low-level function to compute delta features.
// The function takes as input a matrix of features and a frame index
// that it should compute the deltas on. It puts its output in an object
// of type VectorBase, of size (original-feature-dimension) * (opts.order+1).
// This is not the most efficient way to do the computation, but it's
// state-free and thus easier to understand
explicit DeltaFeatures(const DeltaFeaturesOptions &opts);
// Computes the output for row 'frame' of 'input_feats' and writes it to
// '*output_frame', which the caller must have sized as described above.
void Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
VectorBase<BaseFloat> *output_frame) const;
private:
DeltaFeaturesOptions opts_;  // options passed to the constructor.
std::vector<Vector<BaseFloat> > scales_; // a scaling window for each
// of the orders, including zero: multiply the features for each
// dimension by this window.
};
// Options for shifted-delta feature computation (see class
// ShiftedDeltaFeatures).  Defaults: window = 1, num_blocks = 7,
// block_shift = 3.
struct ShiftedDeltaFeaturesOptions {
int32 window, // The time delay and advance
num_blocks,   // Number of delta blocks concatenated for each frame
block_shift; // Distance between consecutive blocks
ShiftedDeltaFeaturesOptions():
window(1), num_blocks(7), block_shift(3) { }
// Registers the three fields with the option parser 'opts'.  Note that the
// window option is registered under the name "delta-window", the same name
// DeltaFeaturesOptions uses for its own window.
void Register(OptionsItf *opts) {
opts->Register("delta-window", &window, "Size of delta advance and delay.");
opts->Register("num-blocks", &num_blocks, "Number of delta blocks in advance"
" of each frame to be concatenated");
opts->Register("block-shift", &block_shift, "Distance between each block");
}
};
// Frame-at-a-time computation of shifted delta cepstra (SDC).
class ShiftedDeltaFeatures {
public:
// This class provides a low-level function to compute shifted
// delta cepstra (SDC).
// The function takes as input a matrix of features and a frame index
// that it should compute the deltas on. It puts its output in an object
// of type VectorBase, of size original-feature-dimension + (1 * num_blocks).
// NOTE(review): the output-size formula above is inherited from the original
// comment; confirm it against the implementation of Process().
explicit ShiftedDeltaFeatures(const ShiftedDeltaFeaturesOptions &opts);
// Computes the SDC output for row 'frame' of 'input_feats' into
// '*output_frame'.  Unlike DeltaFeatures::Process(), the output parameter is
// a SubVector rather than a VectorBase.
void Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
SubVector<BaseFloat> *output_frame) const;
private:
ShiftedDeltaFeaturesOptions opts_;  // options passed to the constructor.
Vector<BaseFloat> scales_; // a single scaling window (the original comment
// here was truncated; presumably it is applied across the delta window --
// confirm against the constructor).
};
// ComputeDeltas is a convenience function that computes deltas on a whole
// matrix of features. If you want to deal with features coming in bit by bit
// you would have to use the DeltaFeatures class directly, and do the
// computation frame by frame. Later we will have to come up with a nice
// mechanism to do this for features coming in.
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// ComputeShiftedDeltas computes deltas on a whole matrix of features by
// applying ShiftedDeltaFeatures over the frames. This function is provided
// for convenience, however, ShiftedDeltaFeatures can be used directly.
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// SpliceFrames will normally be used together with LDA.
// It splices frames together to make a window. At the
// start and end of an utterance, it duplicates the first
// and last frames.
// Will throw if input features are empty.
// left_context and right_context must be nonnegative.
// these both represent a number of frames (e.g. 4, 4 is
// a good choice).
void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
int32 left_context,
int32 right_context,
Matrix<BaseFloat> *output_features);
// ReverseFrames reverses the frames in time (used for backwards decoding)
void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// Fills '*mat_out' with the bases used for an inverse DFT.
// NOTE(review): presumably the result has n_bases rows and 'dimension'
// columns -- confirm against the definition, which is not in this header.
void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out);
// This is used for speaker-id. Also see OnlineCmnOptions in ../online2/, which
// is online CMN with no latency, for online speech recognition.
// Options for SlidingWindowCmn(); every field can be set from the command
// line through Register().
struct SlidingWindowCmnOptions {
int32 cmn_window;         // window length in frames; default 600.
int32 min_window;         // minimum window at the start of the utterance when
                          // center == false; default 100.
int32 max_warnings;       // cap on variance-flooring warnings; 0 disables
                          // them, a negative value removes the cap; default 5.
bool normalize_variance;  // also normalize variance to one; default false.
bool center;              // centre the window on the current frame instead
                          // of placing it to the left; default false.
SlidingWindowCmnOptions():
cmn_window(600),
min_window(100),
max_warnings(5),
normalize_variance(false),
center(false) { }
// Registers all fields with the option parser 'opts'.  Note that the option
// names differ from the field names for min_window ("min-cmn-window") and
// normalize_variance ("norm-vars").
void Register(OptionsItf *opts) {
opts->Register("cmn-window", &cmn_window, "Window in frames for running "
"average CMN computation");
opts->Register("min-cmn-window", &min_window, "Minimum CMN window "
"used at start of decoding (adds latency only at start). "
"Only applicable if center == false, ignored if center==true");
opts->Register("max-warnings", &max_warnings, "Maximum warnings to report "
"per utterance. 0 to disable, -1 to show all.");
opts->Register("norm-vars", &normalize_variance, "If true, normalize "
"variance to one."); // naming this as in apply-cmvn.cc
opts->Register("center", &center, "If true, use a window centered on the "
"current frame (to the extent possible, modulo end effects). "
"If false, window is to the left.");
}
// Presumably validates the option values; the definition is not in this
// header -- see the corresponding .cc file.
void Check() const;
};
/// Applies sliding-window cepstral mean and/or variance normalization. See the
/// strings registering the options in the options class for information on how
/// this works and what the options are. input and output must have the same
/// dimension, and input must have at least one row. The caller allocates
/// *output; this function only overwrites its contents.
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts,
const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output);
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_FUNCTIONS_H_

@ -1,157 +0,0 @@
// feat/feature-mfcc.cc
// Copyright 2009-2011 Karel Vesely; Petr Motlicek
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-mfcc.h"
namespace kaldi {
// Computes one frame of MFCC features.
//
//  signal_raw_log_energy  log-energy supplied by the caller; it is recomputed
//                         here from the windowed frame when use_energy is set
//                         and raw_energy is not.
//  vtln_warp              VTLN warp factor selecting the mel filterbank.
//  signal_frame           one windowed frame of PaddedWindowSize() samples;
//                         overwritten, since it is used as FFT workspace.
//  feature                output vector of size Dim().
void MfccComputer::Compute(BaseFloat signal_raw_log_energy,
                           BaseFloat vtln_warp,
                           VectorBase<BaseFloat> *signal_frame,
                           VectorBase<BaseFloat> *feature) {
  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
               feature->Dim() == this->Dim());

  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));

  // Energy of the windowed frame, needed before the FFT destroys the samples.
  if (opts_.use_energy && !opts_.raw_energy) {
    const BaseFloat windowed_energy = VecVec(*signal_frame, *signal_frame);
    signal_raw_log_energy = Log(std::max<BaseFloat>(
        windowed_energy, std::numeric_limits<float>::epsilon()));
  }

  // In-place FFT: split-radix when available (power-of-two window), otherwise
  // the general routine that also handles non-powers-of-two.
  if (srfft_ == NULL) {
    RealFft(signal_frame, true);
  } else {
    srfft_->Compute(signal_frame->Data(), true);
  }

  // Power spectrum occupies the first Dim()/2 + 1 elements of the frame.
  ComputePowerSpectrum(signal_frame);
  SubVector<BaseFloat> power_spectrum(*signal_frame, 0,
                                      signal_frame->Dim() / 2 + 1);

  // Log mel energies, floored to avoid log(0) (dithering should normally
  // prevent that anyway).
  mel_banks.Compute(power_spectrum, &mel_energies_);
  mel_energies_.ApplyFloor(std::numeric_limits<float>::epsilon());
  mel_energies_.ApplyLog();

  // feature = dct_matrix_ * log-mel-energies.  Zeroed first in case the
  // output buffer held NaNs.
  feature->SetZero();
  feature->AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0);

  if (opts_.cepstral_lifter != 0.0) {
    feature->MulElements(lifter_coeffs_);
  }

  // Optionally replace C0 by the (floored) log-energy.
  if (opts_.use_energy) {
    if (opts_.energy_floor > 0.0 &&
        signal_raw_log_energy < log_energy_floor_) {
      signal_raw_log_energy = log_energy_floor_;
    }
    (*feature)(0) = signal_raw_log_energy;
  }

  // HTK ordering: move element 0 (energy or C0) to the end.
  if (opts_.htk_compat) {
    BaseFloat first_coeff = (*feature)(0);
    // Scale on C0 (actually removing a scale we previously added that's part
    // of one common definition of the cosine transform).
    if (!opts_.use_energy) {
      first_coeff *= M_SQRT2;
    }
    for (int32 i = 1; i < opts_.num_ceps; i++) {
      (*feature)(i - 1) = (*feature)(i);
    }
    (*feature)(opts_.num_ceps - 1) = first_coeff;
  }
}
// Builds an MFCC computer from 'opts': precomputes the truncated DCT matrix,
// the lifter coefficients (if liftering is enabled), the log energy floor,
// the split-radix FFT (if the padded window is a power of two) and the mel
// filterbank for VTLN warp factor 1.0.
//
// Raises an error via KALDI_ERR if opts.num_ceps exceeds the number of mel
// bins.
MfccComputer::MfccComputer(const MfccOptions &opts):
    opts_(opts),
    // Always give the floor a defined value.  It is only consulted when
    // opts_.energy_floor > 0.0, but the copy constructor reads it
    // unconditionally, which previously meant copying an indeterminate value.
    log_energy_floor_(0.0),
    srfft_(NULL),
    mel_energies_(opts.mel_opts.num_bins) {
  int32 num_bins = opts.mel_opts.num_bins;
  if (opts.num_ceps > num_bins)
    KALDI_ERR << "num-ceps cannot be larger than num-mel-bins."
              << " It should be smaller or equal. You provided num-ceps: "
              << opts.num_ceps << "  and num-mel-bins: "
              << num_bins;

  Matrix<BaseFloat> dct_matrix(num_bins, num_bins);
  ComputeDctMatrix(&dct_matrix);
  // Note that we include zeroth dct in either case.  If using the
  // energy we replace this with the energy.  This means a different
  // ordering of features than HTK.
  SubMatrix<BaseFloat> dct_rows(dct_matrix, 0, opts.num_ceps, 0, num_bins);
  dct_matrix_.Resize(opts.num_ceps, num_bins);
  dct_matrix_.CopyFromMat(dct_rows);  // subset of rows.

  if (opts.cepstral_lifter != 0.0) {
    lifter_coeffs_.Resize(opts.num_ceps);
    ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
  }

  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);

  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);

  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0);
}
// Copy constructor.  Options and precomputed tables are copied; every cached
// MelBanks object and the split-radix FFT are deep-copied so that this
// object owns its own heap allocations.  The mel_energies_ workspace is only
// sized, not copied.
MfccComputer::MfccComputer(const MfccComputer &other)
    : opts_(other.opts_),
      lifter_coeffs_(other.lifter_coeffs_),
      dct_matrix_(other.dct_matrix_),
      log_energy_floor_(other.log_energy_floor_),
      mel_banks_(other.mel_banks_),
      srfft_(NULL),
      mel_energies_(other.mel_energies_.Dim(), kUndefined) {
  // After the member-wise map copy, mel_banks_ still points at other's
  // MelBanks objects; swap each pointer for a freshly allocated clone.
  typedef std::map<BaseFloat, MelBanks*>::iterator BankIter;
  for (BankIter it = mel_banks_.begin(); it != mel_banks_.end(); ++it) {
    MelBanks *shared = it->second;
    it->second = new MelBanks(*shared);
  }

  if (other.srfft_ != NULL) {
    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
  }
}
// Frees every cached MelBanks object and the split-radix FFT (which may be
// NULL).
MfccComputer::~MfccComputer() {
  typedef std::map<BaseFloat, MelBanks*>::iterator BankIter;
  BankIter it = mel_banks_.begin();
  while (it != mel_banks_.end()) {
    delete it->second;
    ++it;
  }
  delete srfft_;
}
// Returns the mel filterbank for the given VTLN warp factor, creating and
// caching it in mel_banks_ on first use.  The returned object stays owned by
// this MfccComputer.
const MelBanks *MfccComputer::GetMelBanks(BaseFloat vtln_warp) {
  std::map<BaseFloat, MelBanks*>::iterator cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end()) {
    return cached->second;
  }
  // Not cached yet: build it from the stored options and remember it.
  MelBanks *fresh_banks = new MelBanks(opts_.mel_opts,
                                       opts_.frame_opts,
                                       vtln_warp);
  mel_banks_[vtln_warp] = fresh_banks;
  return fresh_banks;
}
} // namespace kaldi

@ -1,154 +0,0 @@
// feat/feature-mfcc.h
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University
// 2014-2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_MFCC_H_
#define KALDI_FEAT_FEATURE_MFCC_H_
#include <map>
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// MfccOptions contains basic options for computing MFCC features.
/// All fields, plus the nested frame and mel options, can be set from the
/// command line through Register().
struct MfccOptions {
FrameExtractionOptions frame_opts;  // framing/windowing options.
MelBanksOptions mel_opts;  // mel filterbank options; 23 bins by default here.
int32 num_ceps; // e.g. 13: num cepstral coeffs, counting zero.
bool use_energy; // use energy; else C0
BaseFloat energy_floor; // 0 by default; set to a value like 1.0 or 0.1 if
// you disable dithering.
bool raw_energy; // If true, compute energy before preemphasis and windowing
BaseFloat cepstral_lifter; // Scaling factor on cepstra for HTK compatibility.
// if 0.0, no liftering is done.
bool htk_compat; // if true, put energy/C0 last and introduce a factor of
// sqrt(2) on C0 to be the same as HTK.
MfccOptions() : mel_opts(23),
// defaults the #mel-banks to 23 for the MFCC computations.
// this seems to be common for 16khz-sampled data,
// but for 8khz-sampled data, 15 may be better.
num_ceps(13),
use_energy(true),
energy_floor(0.0),
raw_energy(true),
cepstral_lifter(22.0),
htk_compat(false) {}
// Registers the nested frame and mel options first, then the MFCC-specific
// fields, with the option parser 'opts'.
void Register(OptionsItf *opts) {
frame_opts.Register(opts);
mel_opts.Register(opts);
opts->Register("num-ceps", &num_ceps,
"Number of cepstra in MFCC computation (including C0)");
opts->Register("use-energy", &use_energy,
"Use energy (not C0) in MFCC computation");
opts->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in MFCC computation. "
"Only makes a difference if --use-energy=true; only necessary if "
"--dither=0.0. Suggested values: 0.1 or 1.0");
opts->Register("raw-energy", &raw_energy,
"If true, compute energy before preemphasis and windowing");
opts->Register("cepstral-lifter", &cepstral_lifter,
"Constant that controls scaling of MFCCs");
opts->Register("htk-compat", &htk_compat,
"If true, put energy or C0 last and use a factor of sqrt(2) on "
"C0. Warning: not sufficient to get HTK compatible features "
"(need to change other parameters).");
}
};
// This is the new-style interface to the MFCC computation.
// An MfccComputer owns heap-allocated mel filterbanks (one per VTLN warp
// factor seen so far) and, when available, a split-radix FFT object; it is
// copy-constructible but not assignable.
class MfccComputer {
public:
typedef MfccOptions Options;
explicit MfccComputer(const MfccOptions &opts);
MfccComputer(const MfccComputer &other);
// Frame-extraction options the caller must use to produce signal frames.
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
// Output feature dimension (number of cepstral coefficients).
int32 Dim() const { return opts_.num_ceps; }
// True if Compute() needs the log-energy of the raw (pre-window) signal.
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
~MfccComputer();
private:
// disallow assignment.
MfccComputer &operator = (const MfccComputer &in);
protected:
// Returns the (cached) mel filterbank for 'vtln_warp'; the object remains
// owned by this class.
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
MfccOptions opts_;
Vector<BaseFloat> lifter_coeffs_;  // filled only if opts_.cepstral_lifter != 0.
Matrix<BaseFloat> dct_matrix_; // matrix we left-multiply by to perform DCT.
BaseFloat log_energy_floor_;  // only meaningful when opts_.energy_floor > 0.
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
SplitRadixRealFft<BaseFloat> *srfft_;  // NULL unless the padded window size
// is a power of two.
// note: mel_energies_ is specific to the frame we're processing, it's
// just a temporary workspace.
Vector<BaseFloat> mel_energies_;
};
typedef OfflineFeatureTpl<MfccComputer> Mfcc;
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_MFCC_H_

@ -1,191 +0,0 @@
// feat/feature-plp.cc
// Copyright 2009-2011 Petr Motlicek; Karel Vesely
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-plp.h"
namespace kaldi {
// Builds a PLP computer from 'opts': sizes the per-frame workspaces,
// precomputes the lifter coefficients (if liftering is enabled), the IDFT
// bases and the log energy floor, creates the split-radix FFT (if the padded
// window is a power of two) and caches the mel filterbank for VTLN warp
// factor 1.0.
PlpComputer::PlpComputer(const PlpOptions &opts):
    opts_(opts),
    // Always give the floor a defined value.  It is only consulted when
    // opts_.energy_floor > 0.0, but the copy constructor reads it
    // unconditionally, which previously meant copying an indeterminate value.
    log_energy_floor_(0.0),
    srfft_(NULL),
    mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
    autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
    lpc_coeffs_(opts_.lpc_order, kUndefined),
    raw_cepstrum_(opts_.lpc_order, kUndefined) {

  if (opts.cepstral_lifter != 0.0) {
    lifter_coeffs_.Resize(opts.num_ceps);
    ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
  }

  InitIdftBases(opts_.lpc_order + 1, opts_.mel_opts.num_bins + 2,
                &idft_bases_);

  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);

  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);

  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0);
}
// Copy constructor.  Options and precomputed tables are copied; every cached
// MelBanks object, every cached equal-loudness vector and the split-radix
// FFT are deep-copied so that this object owns its own heap allocations.
// The per-frame workspaces are only sized, not copied.
PlpComputer::PlpComputer(const PlpComputer &other)
    : opts_(other.opts_),
      lifter_coeffs_(other.lifter_coeffs_),
      idft_bases_(other.idft_bases_),
      log_energy_floor_(other.log_energy_floor_),
      mel_banks_(other.mel_banks_),
      equal_loudness_(other.equal_loudness_),
      srfft_(NULL),
      mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
      autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
      lpc_coeffs_(opts_.lpc_order, kUndefined),
      raw_cepstrum_(opts_.lpc_order, kUndefined) {
  // Both maps were copied member-wise and still point at other's objects;
  // replace each pointer with a freshly allocated clone.
  typedef std::map<BaseFloat, MelBanks*>::iterator BankIter;
  for (BankIter bank = mel_banks_.begin(); bank != mel_banks_.end(); ++bank) {
    MelBanks *shared_banks = bank->second;
    bank->second = new MelBanks(*shared_banks);
  }

  typedef std::map<BaseFloat, Vector<BaseFloat>*>::iterator LoudnessIter;
  for (LoudnessIter loud = equal_loudness_.begin();
       loud != equal_loudness_.end(); ++loud) {
    Vector<BaseFloat> *shared_vec = loud->second;
    loud->second = new Vector<BaseFloat>(*shared_vec);
  }

  if (other.srfft_ != NULL) {
    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
  }
}
// Frees every cached MelBanks object, every cached equal-loudness vector and
// the split-radix FFT (which may be NULL).
PlpComputer::~PlpComputer() {
  typedef std::map<BaseFloat, MelBanks*>::iterator BankIter;
  BankIter bank = mel_banks_.begin();
  while (bank != mel_banks_.end()) {
    delete bank->second;
    ++bank;
  }

  typedef std::map<BaseFloat, Vector<BaseFloat>* >::iterator LoudnessIter;
  LoudnessIter loud = equal_loudness_.begin();
  while (loud != equal_loudness_.end()) {
    delete loud->second;
    ++loud;
  }

  delete srfft_;
}
// Returns the mel filterbank for the given VTLN warp factor, creating and
// caching it in mel_banks_ on first use.  The returned object stays owned by
// this PlpComputer.
const MelBanks *PlpComputer::GetMelBanks(BaseFloat vtln_warp) {
  std::map<BaseFloat, MelBanks*>::iterator cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end()) {
    return cached->second;
  }
  // Not cached yet: build it from the stored options and remember it.
  MelBanks *fresh_banks = new MelBanks(opts_.mel_opts,
                                       opts_.frame_opts,
                                       vtln_warp);
  mel_banks_[vtln_warp] = fresh_banks;
  return fresh_banks;
}
// Returns the equal-loudness vector for the given VTLN warp factor, creating
// and caching it in equal_loudness_ on first use.  The mel filterbank for the
// same warp factor is always fetched first (and thereby cached as well).
// The returned vector stays owned by this PlpComputer.
const Vector<BaseFloat> *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) {
  const MelBanks *banks_for_warp = GetMelBanks(vtln_warp);

  std::map<BaseFloat, Vector<BaseFloat>*>::iterator cached =
      equal_loudness_.find(vtln_warp);
  if (cached != equal_loudness_.end()) {
    return cached->second;
  }

  // Not cached yet: derive it from the filterbank and remember it.
  Vector<BaseFloat> *loudness = new Vector<BaseFloat>;
  GetEqualLoudnessVector(*banks_for_warp, loudness);
  equal_loudness_[vtln_warp] = loudness;
  return loudness;
}
// Computes one frame of PLP features.
//
//  signal_raw_log_energy  log-energy supplied by the caller; it is recomputed
//                         here from the windowed frame when use_energy is set
//                         and raw_energy is not.
//  vtln_warp              VTLN warp factor selecting the mel filterbank and
//                         the equal-loudness vector.
//  signal_frame           one windowed frame of PaddedWindowSize() samples;
//                         overwritten, since it is used as FFT workspace.
//  feature                output vector of size Dim().
void PlpComputer::Compute(BaseFloat signal_raw_log_energy,
                          BaseFloat vtln_warp,
                          VectorBase<BaseFloat> *signal_frame,
                          VectorBase<BaseFloat> *feature) {
  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
               feature->Dim() == this->Dim());

  const MelBanks &mel_banks = *GetMelBanks(vtln_warp);
  const Vector<BaseFloat> &equal_loudness = *GetEqualLoudness(vtln_warp);

  KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1);  // our num-ceps includes C0.

  // Energy of the windowed frame, needed before the FFT destroys the samples.
  if (opts_.use_energy && !opts_.raw_energy) {
    const BaseFloat windowed_energy = VecVec(*signal_frame, *signal_frame);
    signal_raw_log_energy = Log(std::max<BaseFloat>(
        windowed_energy, std::numeric_limits<float>::min()));
  }

  // In-place FFT: split-radix when available (power-of-two window), otherwise
  // the general routine that also handles non-powers-of-two.
  if (srfft_ == NULL) {
    RealFft(signal_frame, true);
  } else {
    srfft_->Compute(signal_frame->Data(), true);
  }

  // Power spectrum occupies elements 0 ... signal_frame->Dim()/2.
  ComputePowerSpectrum(signal_frame);
  const int32 num_spectrum_bins = signal_frame->Dim() / 2 + 1;
  SubVector<BaseFloat> power_spectrum(*signal_frame, 0, num_spectrum_bins);

  // The mel energies are written into the middle of the padded workspace,
  // leaving one slot free on each side.
  const int32 num_mel_bins = opts_.mel_opts.num_bins;
  SubVector<BaseFloat> mel_energies(mel_energies_duplicated_, 1, num_mel_bins);

  mel_banks.Compute(power_spectrum, &mel_energies);
  mel_energies.MulElements(equal_loudness);
  mel_energies.ApplyPow(opts_.compress_factor);

  // Duplicate the first and last mel energies into the two padding slots.
  const int32 last_slot = num_mel_bins + 1;
  mel_energies_duplicated_(0) = mel_energies_duplicated_(1);
  mel_energies_duplicated_(last_slot) = mel_energies_duplicated_(last_slot - 1);

  // Autocorrelation via the IDFT bases; zeroed first in case of NaNs or infs.
  autocorr_coeffs_.SetZero();
  autocorr_coeffs_.AddMatVec(1.0, idft_bases_, kNoTrans,
                             mel_energies_duplicated_, 0.0);

  // LPC analysis; the returned value is floored exactly as before.
  BaseFloat residual_log_energy = std::max<BaseFloat>(
      ComputeLpc(autocorr_coeffs_, &lpc_coeffs_),
      std::numeric_limits<float>::min());

  Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs_.Data(), raw_cepstrum_.Data());

  // Element 0 holds the residual log energy, elements 1.. the cepstrum.
  const int32 num_cepstra_copied = opts_.num_ceps - 1;
  feature->Range(1, num_cepstra_copied).CopyFromVec(
      raw_cepstrum_.Range(0, num_cepstra_copied));
  (*feature)(0) = residual_log_energy;

  if (opts_.cepstral_lifter != 0.0) {
    feature->MulElements(lifter_coeffs_);
  }

  if (opts_.cepstral_scale != 1.0) {
    feature->Scale(opts_.cepstral_scale);
  }

  // Optionally replace element 0 by the (floored) signal log-energy.
  if (opts_.use_energy) {
    if (opts_.energy_floor > 0.0 &&
        signal_raw_log_energy < log_energy_floor_) {
      signal_raw_log_energy = log_energy_floor_;
    }
    (*feature)(0) = signal_raw_log_energy;
  }

  // HTK ordering: move element 0 to the end, shifting the rest down by one.
  if (opts_.htk_compat) {
    const BaseFloat first_coeff = (*feature)(0);
    for (int32 i = 1; i < opts_.num_ceps; i++) {
      (*feature)(i - 1) = (*feature)(i);
    }
    (*feature)(opts_.num_ceps - 1) = first_coeff;
  }
}
} // namespace kaldi

@ -1,176 +0,0 @@
// feat/feature-plp.h
// Copyright 2009-2011 Petr Motlicek; Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_PLP_H_
#define KALDI_FEAT_FEATURE_PLP_H_
#include <map>
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
#include "util/options-itf.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// PlpOptions contains basic options for computing PLP features.
/// It only includes things that can be done in a "stateless" way, i.e.
/// it does not include energy max-normalization.
/// It does not include delta computation.
/// All fields, plus the nested frame and mel options, can be set from the
/// command line through Register().
struct PlpOptions {
FrameExtractionOptions frame_opts;  // framing/windowing options.
MelBanksOptions mel_opts;  // mel filterbank options; 23 bins by default here.
int32 lpc_order;  // order of the LPC analysis; default 12.
int32 num_ceps; // num cepstra including zero
bool use_energy; // use energy; else C0
BaseFloat energy_floor;  // absolute floor on energy; 0.0 (default) disables it.
bool raw_energy; // If true, compute energy before preemphasis and windowing
BaseFloat compress_factor;  // compression factor in PLP computation;
// default 0.33333.
int32 cepstral_lifter;  // liftering constant; 0 disables liftering. Note that
// this is declared (and registered) as an integer.
BaseFloat cepstral_scale;  // scaling constant in PLP computation; default 1.0.
bool htk_compat; // if true, put energy/C0 last.
// NOTE(review): the previous comment also promised a factor of sqrt(2) on C0
// "to be the same as HTK", but the help string registered below only
// mentions reordering -- confirm against PlpComputer::Compute().
PlpOptions() : mel_opts(23),
// default number of mel-banks for the PLP computation; this
// seems to be common for 16kHz-sampled data. For 8kHz-sampled
// data, 15 may be better.
lpc_order(12),
num_ceps(13),
use_energy(true),
energy_floor(0.0),
raw_energy(true),
compress_factor(0.33333),
cepstral_lifter(22),
cepstral_scale(1.0),
htk_compat(false) {}
// Registers the nested frame and mel options first, then the PLP-specific
// fields, with the option parser 'opts'.
void Register(OptionsItf *opts) {
frame_opts.Register(opts);
mel_opts.Register(opts);
opts->Register("lpc-order", &lpc_order,
"Order of LPC analysis in PLP computation");
opts->Register("num-ceps", &num_ceps,
"Number of cepstra in PLP computation (including C0)");
opts->Register("use-energy", &use_energy,
"Use energy (not C0) for zeroth PLP feature");
opts->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in PLP computation. "
"Only makes a difference if --use-energy=true; only necessary if "
"--dither=0.0. Suggested values: 0.1 or 1.0");
opts->Register("raw-energy", &raw_energy,
"If true, compute energy before preemphasis and windowing");
opts->Register("compress-factor", &compress_factor,
"Compression factor in PLP computation");
opts->Register("cepstral-lifter", &cepstral_lifter,
"Constant that controls scaling of PLPs");
opts->Register("cepstral-scale", &cepstral_scale,
"Scaling constant in PLP computation");
opts->Register("htk-compat", &htk_compat,
"If true, put energy or C0 last. Warning: not sufficient "
"to get HTK compatible features (need to change other "
"parameters).");
}
};
/// This is the new-style interface to the PLP computation.
/// A PlpComputer owns heap-allocated mel filterbanks and equal-loudness
/// vectors (one of each per VTLN warp factor seen so far) and, when
/// available, a split-radix FFT object; it is copy-constructible but not
/// assignable.
class PlpComputer {
public:
typedef PlpOptions Options;
explicit PlpComputer(const PlpOptions &opts);
PlpComputer(const PlpComputer &other);
// Frame-extraction options the caller must use to produce signal frames.
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
// Output feature dimension (number of cepstral coefficients).
int32 Dim() const { return opts_.num_ceps; }
// True if Compute() needs the log-energy of the raw (pre-window) signal.
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
~PlpComputer();
private:
// Return the (cached) mel filterbank / equal-loudness vector for
// 'vtln_warp'; the returned objects remain owned by this class.
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
const Vector<BaseFloat> *GetEqualLoudness(BaseFloat vtln_warp);
PlpOptions opts_;
Vector<BaseFloat> lifter_coeffs_;  // filled only if opts_.cepstral_lifter != 0.
Matrix<BaseFloat> idft_bases_;  // filled by InitIdftBases() in the constructor.
BaseFloat log_energy_floor_;  // only meaningful when opts_.energy_floor > 0.
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
std::map<BaseFloat, Vector<BaseFloat>* > equal_loudness_;  // same key as
// mel_banks_.
SplitRadixRealFft<BaseFloat> *srfft_;  // NULL unless the padded window size
// is a power of two.
// temporary vector used inside Compute; size is opts_.mel_opts.num_bins + 2
Vector<BaseFloat> mel_energies_duplicated_;
// temporary vector used inside Compute; size is opts_.lpc_order + 1
Vector<BaseFloat> autocorr_coeffs_;
// temporary vector used inside Compute; size is opts_.lpc_order
Vector<BaseFloat> lpc_coeffs_;
// temporary vector used inside Compute; size is opts_.lpc_order
Vector<BaseFloat> raw_cepstrum_;
// Disallow assignment.
PlpComputer &operator =(const PlpComputer &other);
};
typedef OfflineFeatureTpl<PlpComputer> Plp;
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_PLP_H_

@ -1,82 +0,0 @@
// feat/feature-spectrogram.cc
// Copyright 2009-2012 Karel Vesely
// Copyright 2012 Navdeep Jaitly
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-spectrogram.h"
namespace kaldi {
// Builds a spectrogram computer from 'opts'.
//
// The log-energy floor is always given a defined value: when
// opts.energy_floor is not positive the floor is disabled, and it is set to
// -infinity so that (a) the copy constructor never reads an uninitialized
// member and (b) no log-energy can ever compare below it.
//
// A split-radix FFT helper is allocated only when the padded window size is
// a power of two; otherwise srfft_ stays NULL and Compute() uses the generic
// RealFft() routine.
SpectrogramComputer::SpectrogramComputer(const SpectrogramOptions &opts)
    : opts_(opts),
      log_energy_floor_(-std::numeric_limits<BaseFloat>::infinity()),
      srfft_(NULL) {
  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);

  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  // x & (x - 1) clears the lowest set bit, so the result is zero exactly
  // when at most one bit is set, i.e. the size is a power of two.
  if ((padded_window_size & (padded_window_size - 1)) == 0)
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
}
// Copy constructor. Options and the log-energy floor are copied by value;
// the FFT helper is owned per object, so a fresh one is cloned from
// 'other' when it has one, and left NULL otherwise.
SpectrogramComputer::SpectrogramComputer(const SpectrogramComputer &other)
    : opts_(other.opts_),
      log_energy_floor_(other.log_energy_floor_),
      srfft_(other.srfft_ == NULL
                 ? static_cast<SplitRadixRealFft<BaseFloat>*>(NULL)
                 : new SplitRadixRealFft<BaseFloat>(*other.srfft_)) {}
// Releases the split-radix FFT helper allocated by the constructors, if any
// (srfft_ is NULL when no helper was created; deleting NULL is a no-op).
SpectrogramComputer::~SpectrogramComputer() {
delete srfft_;
}
// Computes one frame of log power-spectrum features.
//
//  signal_raw_log_energy  log-energy supplied by the caller; it is replaced
//                         by the energy of the windowed frame when
//                         opts_.raw_energy is false.
//  vtln_warp              not used by this function.
//  signal_frame           windowed frame of size PaddedWindowSize(); it is
//                         overwritten (used as FFT workspace).
//  feature                output vector of size Dim().
void SpectrogramComputer::Compute(BaseFloat signal_raw_log_energy,
                                  BaseFloat vtln_warp,
                                  VectorBase<BaseFloat> *signal_frame,
                                  VectorBase<BaseFloat> *feature) {
  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
               feature->Dim() == this->Dim());

  // When raw energy is not requested, measure it on the windowed frame,
  // floored so that the logarithm stays finite.
  if (!opts_.raw_energy) {
    const BaseFloat windowed_energy = VecVec(*signal_frame, *signal_frame);
    signal_raw_log_energy = Log(std::max<BaseFloat>(
        windowed_energy, std::numeric_limits<float>::epsilon()));
  }

  // In-place FFT: generic routine when no split-radix helper exists
  // (non-power-of-two sizes), split-radix otherwise.
  if (srfft_ == NULL) {
    RealFft(signal_frame, true);
  } else {
    srfft_->Compute(signal_frame->Data(), true);
  }

  // Turn the FFT output into a power spectrum; the first Dim()/2 + 1
  // entries of the frame are then floored and log-compressed.
  ComputePowerSpectrum(signal_frame);
  const int32 num_spectrum_bins = signal_frame->Dim() / 2 + 1;
  SubVector<BaseFloat> log_spectrum(*signal_frame, 0, num_spectrum_bins);
  log_spectrum.ApplyFloor(std::numeric_limits<float>::epsilon());
  log_spectrum.ApplyLog();
  feature->CopyFromVec(log_spectrum);

  // Apply the optional floor on the energy term.
  if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
    signal_raw_log_energy = log_energy_floor_;
  }

  // Component zero carries the signal log-energy rather than the log of
  // the squared DC component.
  (*feature)(0) = signal_raw_log_energy;
}
} // namespace kaldi

@ -1,117 +0,0 @@
// feat/feature-spectrogram.h
// Copyright 2009-2012 Karel Vesely
// Copyright 2012 Navdeep Jaitly
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_SPECTROGRAM_H_
#define KALDI_FEAT_FEATURE_SPECTROGRAM_H_
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// SpectrogramOptions contains basic options for computing spectrogram
/// features.
struct SpectrogramOptions {
  FrameExtractionOptions frame_opts;  // framing / windowing configuration
  BaseFloat energy_floor;             // floor on the energy term; default 0.0
  bool raw_energy;  // If true, compute energy before preemphasis and windowing

  SpectrogramOptions() : energy_floor(0.0), raw_energy(true) {}

  // Registers the frame-extraction options followed by the two
  // spectrogram-specific options on 'opts'.
  void Register(OptionsItf *opts) {
    static const char kEnergyFloorDoc[] =
        "Floor on energy (absolute, not relative) in Spectrogram "
        "computation. Caution: this floor is applied to the zeroth "
        "component, representing the total signal energy. The "
        "floor on the individual spectrogram elements is fixed at "
        "std::numeric_limits<float>::epsilon().";
    static const char kRawEnergyDoc[] =
        "If true, compute energy before preemphasis and windowing";

    frame_opts.Register(opts);
    opts->Register("energy-floor", &energy_floor, kEnergyFloorDoc);
    opts->Register("raw-energy", &raw_energy, kRawEnergyDoc);
  }
};
/// Class for computing spectrogram features.
/// It is wrapped by OfflineFeatureTpl (see the Spectrogram typedef that
/// follows the class) and owns an optional split-radix FFT helper.
class SpectrogramComputer {
public:
typedef SpectrogramOptions Options;
explicit SpectrogramComputer(const SpectrogramOptions &opts);
SpectrogramComputer(const SpectrogramComputer &other);
/// Returns the frame-extraction options this computer was configured with.
const FrameExtractionOptions& GetFrameOptions() const {
return opts_.frame_opts;
}
/// Feature dimension: one value per bin from 0 up to and including
/// PaddedWindowSize() / 2.
int32 Dim() const { return opts_.frame_opts.PaddedWindowSize() / 2 + 1; }
/// True if Compute() uses the raw (pre-windowing) log-energy passed to it.
bool NeedRawLogEnergy() const { return opts_.raw_energy; }
/**
Function that computes one frame of spectrogram features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp This is ignored by this function, it's only
needed for interface compatibility.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
~SpectrogramComputer();
private:
SpectrogramOptions opts_;
// Log of opts_.energy_floor; only consulted when opts_.energy_floor > 0.
BaseFloat log_energy_floor_;
// Owned FFT helper; NULL when the padded window size is not a power of two.
SplitRadixRealFft<BaseFloat> *srfft_;
// Disallow assignment.
SpectrogramComputer &operator=(const SpectrogramComputer &other);
};
typedef OfflineFeatureTpl<SpectrogramComputer> Spectrogram;
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_SPECTROGRAM_H_

@ -1,222 +0,0 @@
// feat/feature-window.cc
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
// 2013-2016 Johns Hopkins University (author: Daniel Povey)
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-window.h"
#include "matrix/matrix-functions.h"
namespace kaldi {
// Returns the index of the first sample of frame number 'frame'.
// With snip_edges the frames start at multiples of the frame shift;
// otherwise each frame is centred on the middle of its shift interval, so
// the result can be negative for the first frames.
int64 FirstSampleOfFrame(int32 frame,
                         const FrameExtractionOptions &opts) {
  const int64 shift = opts.WindowShift();
  if (opts.snip_edges)
    return frame * shift;

  const int64 frame_midpoint = shift * frame + shift / 2;
  return frame_midpoint - opts.WindowSize() / 2;
}
// Returns how many frames can be extracted from 'num_samples' samples.
//
// snip_edges == true : HTK-like; every frame must fit entirely inside the
//                      waveform and the first frame starts at sample zero.
// snip_edges == false: the count is (num_samples / frame_shift) rounded to
//                      the nearest integer; when 'flush' is false, trailing
//                      frames that would extend past the end of the
//                      available samples are not counted.
int32 NumFrames(int64 num_samples,
                const FrameExtractionOptions &opts,
                bool flush) {
  const int64 shift = opts.WindowShift();
  const int64 length = opts.WindowSize();

  if (opts.snip_edges) {
    if (num_samples < length)
      return 0;
    // (num_samples - length) is the room available for sliding the frame;
    // integer division by the shift counts the whole shifts that fit.
    return 1 + (num_samples - length) / shift;
  }

  // Integer division truncates, so add half a shift first to round to the
  // nearest integer.
  int32 frame_count = (num_samples + (shift / 2)) / shift;
  if (flush)
    return frame_count;

  // 'end' is one past the last sample of the last frame. Drop frames from
  // the back until the last one ends inside the available samples.
  int64 last_frame_end = FirstSampleOfFrame(frame_count - 1, opts) + length;
  for (; frame_count > 0 && last_frame_end > num_samples;
       last_frame_end -= shift) {
    frame_count--;
  }
  return frame_count;
}
// Adds Gaussian noise scaled by 'dither_value' to every sample of
// 'waveform'. A dither value of zero leaves the waveform untouched.
void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value) {
  if (dither_value == 0.0)
    return;

  const int32 sample_count = waveform->Dim();
  BaseFloat *samples = waveform->Data();
  RandomState rstate;
  int32 k = 0;
  while (k < sample_count) {
    samples[k] += RandGauss(&rstate) * dither_value;
    ++k;
  }
}
// In-place pre-emphasis: x[i] -= preemph_coeff * x[i-1].
// The samples are walked from the back so that each update still sees the
// unmodified predecessor; sample 0 has no predecessor and uses itself.
// A coefficient of zero is a no-op; otherwise it must lie in [0, 1].
void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff) {
  if (preemph_coeff == 0.0)
    return;
  KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);

  VectorBase<BaseFloat> &samples = *waveform;
  int32 pos = samples.Dim() - 1;
  while (pos > 0) {
    samples(pos) -= preemph_coeff * samples(pos - 1);
    --pos;
  }
  samples(0) -= preemph_coeff * samples(0);
}
// Precomputes the window coefficients for the window type named in
// 'opts' ("hanning", "hamming", "povey", "rectangular" or "blackman").
// Any other window type is reported through KALDI_ERR.
FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) {
  int32 frame_length = opts.WindowSize();
  KALDI_ASSERT(frame_length > 0);
  window.Resize(frame_length);

  const std::string &type = opts.window_type;
  const double a = M_2PI / (frame_length-1);
  int32 i = 0;
  while (i < frame_length) {
    const double i_fl = static_cast<double>(i);
    if (type == "hanning") {
      window(i) = 0.5 - 0.5*cos(a * i_fl);
    } else if (type == "hamming") {
      window(i) = 0.54 - 0.46*cos(a * i_fl);
    } else if (type == "povey") {
      // Same cosine term as the "hanning" branch, raised to the power 0.85.
      window(i) = pow(0.5 - 0.5*cos(a * i_fl), 0.85);
    } else if (type == "rectangular") {
      window(i) = 1.0;
    } else if (type == "blackman") {
      window(i) = opts.blackman_coeff - 0.5*cos(a * i_fl) +
          (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
    } else {
      KALDI_ERR << "Invalid window type " << opts.window_type;
    }
    ++i;
  }
}
void ProcessWindow(const FrameExtractionOptions &opts,
const FeatureWindowFunction &window_function,
VectorBase<BaseFloat> *window,
BaseFloat *log_energy_pre_window) {
int32 frame_length = opts.WindowSize();
KALDI_ASSERT(window->Dim() == frame_length);
if (opts.dither != 0.0)
Dither(window, opts.dither);
if (opts.remove_dc_offset)
window->Add(-window->Sum() / frame_length);
if (log_energy_pre_window != NULL) {
BaseFloat energy = std::max<BaseFloat>(VecVec(*window, *window),
std::numeric_limits<float>::epsilon());
*log_energy_pre_window = Log(energy);
}
if (opts.preemph_coeff != 0.0)
Preemphasize(window, opts.preemph_coeff);
window->MulElements(window_function.window);
}
// ExtractWindow copies frame 'f' out of 'wave' into 'window' (resized to the
// padded window size, zero-filled past the frame length) and then runs
// ProcessWindow() on the un-padded part, which performs the requested
// dithering, mean subtraction, pre-emphasis and windowing.
// 'sample_offset' is the number of samples that precede 'wave' in the full
// signal. Frames that stick out of 'wave' are completed by reflecting the
// signal at its edges.
void ExtractWindow(int64 sample_offset,
                   const VectorBase<BaseFloat> &wave,
                   int32 f,  // with 0 <= f < NumFrames(feats, opts)
                   const FrameExtractionOptions &opts,
                   const FeatureWindowFunction &window_function,
                   Vector<BaseFloat> *window,
                   BaseFloat *log_energy_pre_window) {
  KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0);

  const int32 frame_length = opts.WindowSize();
  const int32 padded_length = opts.PaddedWindowSize();
  int64 num_samples = sample_offset + wave.Dim();
  int64 start_sample = FirstSampleOfFrame(f, opts);
  int64 end_sample = start_sample + frame_length;

  if (opts.snip_edges) {
    KALDI_ASSERT(start_sample >= sample_offset &&
                 end_sample <= num_samples);
  } else {
    KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset);
  }

  if (window->Dim() != padded_length)
    window->Resize(padded_length, kUndefined);

  // Position of the frame expressed as indexes into 'wave' itself.
  const int32 begin_in_wave = int32(start_sample - sample_offset);
  const int32 end_in_wave = begin_in_wave + frame_length;
  const bool fully_inside = begin_in_wave >= 0 && end_in_wave <= wave.Dim();

  if (fully_inside) {
    // Common case: the whole frame lies inside 'wave'.
    window->Range(0, frame_length).CopyFromVec(
        wave.Range(begin_in_wave, frame_length));
  } else {
    // Edge case: copy sample by sample, mirroring out-of-range indexes back
    // into the waveform (-1 -> 0, -2 -> 1, dim -> dim - 1,
    // dim + 1 -> dim - 2). The inner loop repeats the reflection until the
    // index is valid.
    const int32 wave_dim = wave.Dim();
    for (int32 s = 0; s < frame_length; s++) {
      int32 src = begin_in_wave + s;
      while (src < 0 || src >= wave_dim) {
        src = (src < 0) ? (- src - 1) : (2 * wave_dim - 1 - src);
      }
      (*window)(s) = wave(src);
    }
  }

  // Zero the padding that follows the real samples.
  const int32 padding = padded_length - frame_length;
  if (padding > 0)
    window->Range(frame_length, padding).SetZero();

  SubVector<BaseFloat> frame(*window, 0, frame_length);
  ProcessWindow(opts, window_function, &frame, log_energy_pre_window);
}
} // namespace kaldi

@ -1,223 +0,0 @@
// feat/feature-window.h
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University
// 2014-2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_WINDOW_H_
#define KALDI_FEAT_FEATURE_WINDOW_H_
#include <map>
#include <string>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
// Options controlling how a waveform is cut into frames and windowed
// before feature computation.
struct FrameExtractionOptions {
  BaseFloat samp_freq;
  BaseFloat frame_shift_ms;  // in milliseconds.
  BaseFloat frame_length_ms;  // in milliseconds.
  BaseFloat dither;  // Amount of dithering, 0.0 means no dither.
  BaseFloat preemph_coeff;  // Preemphasis coefficient.
  bool remove_dc_offset;  // Subtract mean of wave before FFT.
  std::string window_type;  // e.g. Hamming window
  // May be "hamming", "rectangular", "povey", "hanning", "blackman"
  // "povey" is a window I made to be similar to Hamming but to go to zero at the
  // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85)
  // I just don't think the Hamming window makes sense as a windowing function.
  bool round_to_power_of_two;
  BaseFloat blackman_coeff;
  bool snip_edges;
  bool allow_downsample;
  bool allow_upsample;
  int max_feature_vectors;

  FrameExtractionOptions():
      samp_freq(16000),
      frame_shift_ms(10.0),
      frame_length_ms(25.0),
      dither(1.0),
      preemph_coeff(0.97),
      remove_dc_offset(true),
      window_type("povey"),
      round_to_power_of_two(true),
      blackman_coeff(0.42),
      snip_edges(true),
      allow_downsample(false),
      allow_upsample(false),
      max_feature_vectors(-1)
  { }

  // Registers every option of this struct on 'opts'.
  void Register(OptionsItf *opts) {
    opts->Register("sample-frequency", &samp_freq,
                   "Waveform data sample frequency (must match the waveform file, "
                   "if specified there)");
    opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds");
    opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds");
    opts->Register("preemphasis-coefficient", &preemph_coeff,
                   "Coefficient for use in signal preemphasis");
    opts->Register("remove-dc-offset", &remove_dc_offset,
                   "Subtract mean from waveform on each frame");
    opts->Register("dither", &dither, "Dithering constant (0.0 means no dither). "
                   "If you turn this off, you should set the --energy-floor "
                   "option, e.g. to 1.0 or 0.1");
    // The names listed here must match the strings that the
    // FeatureWindowFunction constructor compares window_type against; it
    // accepts "blackman" (single 'n').
    opts->Register("window-type", &window_type, "Type of window "
                   "(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\""
                   "|\"blackman\")");
    opts->Register("blackman-coeff", &blackman_coeff,
                   "Constant coefficient for generalized Blackman window.");
    opts->Register("round-to-power-of-two", &round_to_power_of_two,
                   "If true, round window size to power of two by zero-padding "
                   "input to FFT.");
    opts->Register("snip-edges", &snip_edges,
                   "If true, end effects will be handled by outputting only frames that "
                   "completely fit in the file, and the number of frames depends on the "
                   "frame-length. If false, the number of frames depends only on the "
                   "frame-shift, and we reflect the data at the ends.");
    opts->Register("allow-downsample", &allow_downsample,
                   "If true, allow the input waveform to have a higher frequency than "
                   "the specified --sample-frequency (and we'll downsample).");
    opts->Register("max-feature-vectors", &max_feature_vectors,
                   "Memory optimization. If larger than 0, periodically remove feature "
                   "vectors so that only this number of the latest feature vectors is "
                   "retained.");
    opts->Register("allow-upsample", &allow_upsample,
                   "If true, allow the input waveform to have a lower frequency than "
                   "the specified --sample-frequency (and we'll upsample).");
  }

  // Frame shift in samples (truncated toward zero).
  int32 WindowShift() const {
    return static_cast<int32>(samp_freq * 0.001 * frame_shift_ms);
  }

  // Frame length in samples (truncated toward zero).
  int32 WindowSize() const {
    return static_cast<int32>(samp_freq * 0.001 * frame_length_ms);
  }

  // Frame length after optional rounding up to a power of two.
  int32 PaddedWindowSize() const {
    return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize()) :
            WindowSize());
  }
};
// Holds the per-sample window coefficients that ProcessWindow() multiplies
// into each frame.
struct FeatureWindowFunction {
// Leaves 'window' empty.
FeatureWindowFunction() {}
// Fills 'window' with opts.WindowSize() coefficients of the window type
// selected by opts.window_type.
explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
// Copies the coefficients of 'other'.
FeatureWindowFunction(const FeatureWindowFunction &other):
window(other.window) { }
Vector<BaseFloat> window;
};
/**
This function returns the number of frames that we can extract from a wave
file with the given number of samples in it (assumed to have the same
sampling rate as specified in 'opts').
@param [in] num_samples The number of samples in the wave file.
@param [in] opts The frame-extraction options class
@param [in] flush True if we are asserting that this number of samples is
'all there is', false if we are expecting more data to possibly come
in. This only makes a difference to the answer if opts.snip_edges
== false. For offline feature extraction you always want flush ==
true. In an online-decoding context, once you know (or decide) that
no more data is coming in, you'd call it with flush == true at the
end to flush out any remaining data.
*/
int32 NumFrames(int64 num_samples,
const FrameExtractionOptions &opts,
bool flush = true);
/*
This function returns the index of the first sample of the frame indexed
'frame'. If snip-edges=true, it just returns frame * opts.WindowShift(); if
snip-edges=false, the formula is a little more complicated and the result may
be negative.
*/
int64 FirstSampleOfFrame(int32 frame,
const FrameExtractionOptions &opts);
// Adds Gaussian noise scaled by 'dither_value' to each sample of 'waveform',
// in place; a value of 0.0 leaves the waveform unchanged.
void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value);
// Applies in-place pre-emphasis, subtracting preemph_coeff times the
// previous sample from each sample (sample 0 uses itself as predecessor);
// a coefficient of 0.0 leaves the waveform unchanged.
void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff);
/**
This function does all the windowing steps after actually
extracting the windowed signal: depending on the
configuration, it does dithering, dc offset removal,
preemphasis, and multiplication by the windowing function.
@param [in] opts The options class to be used
@param [in] window_function The windowing function-- should have
been initialized using 'opts'.
@param [in,out] window A vector of size opts.WindowSize(). Note:
it will typically be a sub-vector of a larger vector of size
opts.PaddedWindowSize(), with the remaining samples zero,
as the FFT code is more efficient if it operates on data with
power-of-two size.
@param [out] log_energy_pre_window If non-NULL, then after dithering and
DC offset removal, this function will write to this pointer the log of
the total energy (i.e. sum-squared) of the frame, with the energy
floored at std::numeric_limits<float>::epsilon() before taking the log.
*/
void ProcessWindow(const FrameExtractionOptions &opts,
const FeatureWindowFunction &window_function,
VectorBase<BaseFloat> *window,
BaseFloat *log_energy_pre_window = NULL);
/*
ExtractWindow() extracts a windowed frame of waveform (possibly with a
power-of-two, padded size, depending on the config), including all the
processing done by ProcessWindow().
@param [in] sample_offset If 'wave' is not the entire waveform, but
part of it to the left has been discarded, then the
number of samples prior to 'wave' that we have
already discarded. Set this to zero if you are
processing the entire waveform in one piece, or
if you get 'no matching function' compilation
errors when updating the code.
@param [in] wave The waveform
@param [in] f The frame index to be extracted, with
0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true)
@param [in] opts The options class to be used
@param [in] window_function The windowing function, as derived from the
options class.
@param [out] window The windowed, possibly-padded waveform to be
extracted. Will be resized as needed.
@param [out] log_energy_pre_window If non-NULL, the log-energy of
the signal prior to pre-emphasis and multiplying by
the windowing function will be written to here.
*/
void ExtractWindow(int64 sample_offset,
const VectorBase<BaseFloat> &wave,
int32 f,
const FrameExtractionOptions &opts,
const FeatureWindowFunction &window_function,
Vector<BaseFloat> *window,
BaseFloat *log_energy_pre_window = NULL);
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_WINDOW_H_

@ -1,340 +0,0 @@
// feat/mel-computations.cc
// Copyright 2009-2011 Phonexia s.r.o.; Karel Vesely; Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <algorithm>
#include <iostream>
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
namespace kaldi {
// Builds the triangular mel filterbank.
//
//  opts              mel options (number of bins, low/high frequency,
//                    VTLN cutoffs, HTK mode, debug flag).
//  frame_opts        supplies the sample frequency and padded window size.
//  vtln_warp_factor  VTLN warp factor; 1.0 means no warping.
//
// Each bin is stored as (index of its first FFT bin, vector of weights).
// Invalid configurations are reported through KALDI_ERR, including a mel
// bin that receives no FFT bin at all.
MelBanks::MelBanks(const MelBanksOptions &opts,
                   const FrameExtractionOptions &frame_opts,
                   BaseFloat vtln_warp_factor):
    htk_mode_(opts.htk_mode) {
  int32 num_bins = opts.num_bins;
  if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins";
  BaseFloat sample_freq = frame_opts.samp_freq;
  int32 window_length_padded = frame_opts.PaddedWindowSize();
  KALDI_ASSERT(window_length_padded % 2 == 0);
  int32 num_fft_bins = window_length_padded / 2;
  BaseFloat nyquist = 0.5 * sample_freq;

  // A non-positive high_freq option is interpreted as an offset from the
  // Nyquist frequency.
  BaseFloat low_freq = opts.low_freq, high_freq;
  if (opts.high_freq > 0.0)
    high_freq = opts.high_freq;
  else
    high_freq = nyquist + opts.high_freq;

  if (low_freq < 0.0 || low_freq >= nyquist
      || high_freq <= 0.0 || high_freq > nyquist
      || high_freq <= low_freq)
    KALDI_ERR << "Bad values in options: low-freq " << low_freq
              << " and high-freq " << high_freq << " vs. nyquist "
              << nyquist;

  // fft-bin width [think of it as Nyquist-freq / half-window-length]
  BaseFloat fft_bin_width = sample_freq / window_length_padded;

  BaseFloat mel_low_freq = MelScale(low_freq);
  BaseFloat mel_high_freq = MelScale(high_freq);

  debug_ = opts.debug_mel;

  // divide by num_bins+1 in next line because of end-effects where the bins
  // spread out to the sides.
  BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1);

  // A negative vtln_high option is interpreted as an offset from Nyquist.
  BaseFloat vtln_low = opts.vtln_low,
      vtln_high = opts.vtln_high;
  if (vtln_high < 0.0) {
    vtln_high += nyquist;
  }

  // The VTLN cutoffs are only validated when warping is actually requested.
  if (vtln_warp_factor != 1.0 &&
      (vtln_low < 0.0 || vtln_low <= low_freq
       || vtln_low >= high_freq
       || vtln_high <= 0.0 || vtln_high >= high_freq
       || vtln_high <= vtln_low))
    KALDI_ERR << "Bad values in options: vtln-low " << vtln_low
              << " and vtln-high " << vtln_high << ", versus "
              << "low-freq " << low_freq << " and high-freq "
              << high_freq;

  bins_.resize(num_bins);
  center_freqs_.Resize(num_bins);

  for (int32 bin = 0; bin < num_bins; bin++) {
    // Corners of this triangle on the mel scale.
    BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta,
        center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
        right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;

    if (vtln_warp_factor != 1.0) {
      left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                 vtln_warp_factor, left_mel);
      center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                   vtln_warp_factor, center_mel);
      right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                  vtln_warp_factor, right_mel);
    }
    center_freqs_(bin) = InverseMelScale(center_mel);

    // this_bin will be a vector of coefficients that is only
    // nonzero where this mel bin is active.
    Vector<BaseFloat> this_bin(num_fft_bins);
    int32 first_index = -1, last_index = -1;
    for (int32 i = 0; i < num_fft_bins; i++) {
      BaseFloat freq = (fft_bin_width * i);  // Center frequency of this fft
                                             // bin.
      BaseFloat mel = MelScale(freq);
      if (mel > left_mel && mel < right_mel) {
        BaseFloat weight;
        if (mel <= center_mel)
          weight = (mel - left_mel) / (center_mel - left_mel);
        else
          weight = (right_mel-mel) / (right_mel-center_mel);
        this_bin(i) = weight;
        if (first_index == -1)
          first_index = i;
        last_index = i;
      }
    }

    // If no FFT bin fell strictly inside the triangle, first_index is still
    // -1 and the Range() call below would start one element before the
    // beginning of this_bin. Report the configuration error instead.
    if (first_index == -1)
      KALDI_ERR << "Mel bin " << bin << " contains no FFT bins. "
                << "You may have set --num-mel-bins too large.";

    bins_[bin].first = first_index;
    int32 size = last_index + 1 - first_index;
    bins_[bin].second.Resize(size);
    bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size));

    // Replicate a bug in HTK, for testing purposes.
    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0)
      bins_[bin].second(0) = 0.0;
  }

  if (debug_) {
    for (size_t i = 0; i < bins_.size(); i++) {
      KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first
                << ", vec = " << bins_[i].second;
    }
  }
}
// Copy constructor: copies the centre frequencies, the per-bin
// (first FFT index, weights) pairs, and the debug / HTK-mode flags.
MelBanks::MelBanks(const MelBanks &other):
center_freqs_(other.center_freqs_),
bins_(other.bins_),
debug_(other.debug_),
htk_mode_(other.htk_mode_) { }
BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
                                 BaseFloat vtln_high_cutoff,
                                 BaseFloat low_freq,  // upper+lower frequency cutoffs in mel computation
                                 BaseFloat high_freq,
                                 BaseFloat vtln_warp_factor,
                                 BaseFloat freq) {
  /// VTLN warping function F(freq). It is not the same as HTK's, but takes
  /// similar inputs. On [low_freq, high_freq] it is continuous and
  /// piecewise linear with three segments, and it keeps the end points
  /// fixed:
  ///   F(low_freq) == low_freq,  F(high_freq) == high_freq.
  /// The two break points, in unwarped frequency, are
  ///   l = vtln_low_cutoff  * max(1, vtln_warp_factor)
  ///   h = vtln_high_cutoff * min(1, vtln_warp_factor)
  /// and between them F(f) = f / vtln_warp_factor. These choices give
  ///   min(l, F(l)) == vtln_low_cutoff  and  max(h, F(h)) == vtln_high_cutoff.
  /// Frequencies outside [low_freq, high_freq] are returned unchanged.
  if (freq < low_freq || freq > high_freq)
    return freq;

  KALDI_ASSERT(vtln_low_cutoff > low_freq &&
               "be sure to set the --vtln-low option higher than --low-freq");
  KALDI_ASSERT(vtln_high_cutoff < high_freq &&
               "be sure to set the --vtln-high option lower than --high-freq [or negative]");

  const BaseFloat unity = 1.0;
  BaseFloat l = vtln_low_cutoff * std::max(unity, vtln_warp_factor);
  BaseFloat h = vtln_high_cutoff * std::min(unity, vtln_warp_factor);
  const BaseFloat center_slope = 1.0 / vtln_warp_factor;
  const BaseFloat warped_l = center_slope * l;  // F(l)
  const BaseFloat warped_h = center_slope * h;  // F(h)
  KALDI_ASSERT(l > low_freq && h < high_freq);

  if (freq < l) {
    // Left segment: joins (low_freq, low_freq) to (l, F(l)).
    const BaseFloat left_slope = (warped_l - low_freq) / (l - low_freq);
    return low_freq + left_slope * (freq - low_freq);
  }
  if (freq < h) {
    // Centre segment: plain scaling.
    return center_slope * freq;
  }
  // Right segment (freq >= h): joins (h, F(h)) to (high_freq, high_freq).
  const BaseFloat right_slope = (high_freq - warped_h) / (high_freq - h);
  return high_freq + right_slope * (freq - high_freq);
}
// Mel-domain wrapper around VtlnWarpFreq(): converts 'mel_freq' to a linear
// frequency, warps it, and converts the result back to the mel scale.
BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
                                    BaseFloat vtln_high_cutoff,
                                    BaseFloat low_freq,  // upper+lower frequency cutoffs in mel computation
                                    BaseFloat high_freq,
                                    BaseFloat vtln_warp_factor,
                                    BaseFloat mel_freq) {
  const BaseFloat linear_freq = InverseMelScale(mel_freq);
  const BaseFloat warped_freq = VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
                                             low_freq, high_freq,
                                             vtln_warp_factor, linear_freq);
  return MelScale(warped_freq);
}
// "power_spectrum" contains fft energies.
void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
VectorBase<BaseFloat> *mel_energies_out) const {
int32 num_bins = bins_.size();
KALDI_ASSERT(mel_energies_out->Dim() == num_bins);
for (int32 i = 0; i < num_bins; i++) {
int32 offset = bins_[i].first;
const Vector<BaseFloat> &v(bins_[i].second);
BaseFloat energy = VecVec(v, power_spectrum.Range(offset, v.Dim()));
// HTK-like flooring- for testing purposes (we prefer dither)
if (htk_mode_ && energy < 1.0) energy = 1.0;
(*mel_energies_out)(i) = energy;
// The following assert was added due to a problem with OpenBlas that
// we had at one point (it was a bug in that library). Just to detect
// it early.
KALDI_ASSERT(!KALDI_ISNAN((*mel_energies_out)(i)));
}
if (debug_) {
fprintf(stderr, "MEL BANKS:\n");
for (int32 i = 0; i < num_bins; i++)
fprintf(stderr, " %f", (*mel_energies_out)(i));
fprintf(stderr, "\n");
}
}
// Fills 'coeffs' with liftering coefficients (scaling applied to cepstral
// coefficients): coeffs[i] = 1 + 0.5 * Q * sin(pi * i / Q).
// The numbering differs slightly from HTK: index zero is C0, whose
// coefficient is 1 (sin(0) == 0), so C0 is not affected.
void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs) {
  VectorBase<BaseFloat> &lifter = *coeffs;
  const int32 count = lifter.Dim();
  int32 i = 0;
  while (i < count) {
    lifter(i) = 1.0 + 0.5 * Q * sin (M_PI * i / Q);
    ++i;
  }
}
// Durbin's recursion - converts autocorrelation coefficients to LPC
// coefficients.
//   n    - LPC order
//   pAC  - autocorrelation coefficients [n + 1]
//   pLP  - output linear prediction coefficients [n]
//          (predicted_sn = sum_1^P{a[i-1] * s[n-i]})
//   pTmp - temporary workspace [n]
// F(z) = 1 / (1 - A(z)); the leading 1 of the denominator is not stored.
// Returns the final prediction error.
BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp) {
  BaseFloat pred_error = pAC[0];

  for (int order = 0; order < n; order++) {
    // Next reflection coefficient.
    BaseFloat refl = pAC[order + 1];
    for (int k = 0; k < order; k++)
      refl += pLP[k] * pAC[order - k];
    refl = refl / pred_error;

    // Update the error; the factor is floored at 1.0e-5 to avoid NaNs on a
    // constant signal.
    BaseFloat shrink = 1 - refl * refl;
    if (shrink < 1.0e-5)
      shrink = 1.0e-5;
    pred_error *= shrink;

    // New LP coefficients are built in the workspace, then copied back.
    pTmp[order] = -refl;
    for (int k = 0; k < order; k++)
      pTmp[k] = pLP[k] - refl * pLP[order - k - 1];
    for (int k = 0; k <= order; k++)
      pLP[k] = pTmp[k];
  }

  return pred_error;
}
void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst) {
for (int32 i = 0; i < n; i++) {
double sum = 0.0;
int j;
for (j = 0; j < i; j++) {
sum += static_cast<BaseFloat>(i - j) * pLPC[j] * pCepst[i - j - 1];
}
pCepst[i] = -pLPC[i] - sum / static_cast<BaseFloat>(i + 1);
}
}
void GetEqualLoudnessVector(const MelBanks &mel_banks,
Vector<BaseFloat> *ans) {
int32 n = mel_banks.NumBins();
// Central frequency of each mel bin.
const Vector<BaseFloat> &f0 = mel_banks.GetCenterFreqs();
ans->Resize(n);
for (int32 i = 0; i < n; i++) {
BaseFloat fsq = f0(i) * f0(i);
BaseFloat fsub = fsq / (fsq + 1.6e5);
(*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6));
}
}
// Computes LP coefficients from autocorrelation coefficients.
// autocorr_in holds n + 1 values and lpc_out must already have dimension n.
// Returns -Log(1 / E), where E is the value returned by Durbin(); this is
// used as the C0 value.
BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
                     Vector<BaseFloat> *lpc_out) {
  int32 n = autocorr_in.Dim() - 1;
  KALDI_ASSERT(lpc_out->Dim() == n);

  Vector<BaseFloat> scratch(n);  // temporary space required by Durbin()
  const BaseFloat residual =
      Durbin(n, autocorr_in.Data(), lpc_out->Data(), scratch.Data());
  if (residual <= 0.0) {
    KALDI_WARN << "Zero energy in LPC computation";
  }
  return -Log(1.0 / residual);  // forms the C0 value
}
} // namespace kaldi

@ -1,171 +0,0 @@
// feat/mel-computations.h
// Copyright 2009-2011 Phonexia s.r.o.; Microsoft Corporation
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_MEL_COMPUTATIONS_H_
#define KALDI_FEAT_MEL_COMPUTATIONS_H_
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <utility>
#include <vector>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "matrix/matrix-lib.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
struct FrameExtractionOptions; // defined in feature-window.h
/// Configuration for the triangular mel filter bank.
struct MelBanksOptions {
  int32 num_bins;  // e.g. 25; number of triangular bins
  BaseFloat low_freq;  // e.g. 20; lower frequency cutoff
  BaseFloat high_freq;  // an upper frequency cutoff; 0 -> no cutoff, negative
                        // -> added to the Nyquist frequency to get the cutoff.
  BaseFloat vtln_low;  // vtln lower cutoff of warping function.
  BaseFloat vtln_high;  // vtln upper cutoff of warping function: if negative,
                        // added to the Nyquist frequency to get the cutoff.
  bool debug_mel;
  // htk_mode is a "hidden" config, it does not show up on command line.
  // Enables more exact compatibility with HTK, for testing purposes.  Affects
  // mel-energy flooring and reproduces a bug in HTK.
  bool htk_mode;

  /// Builds the default configuration; only the number of bins is
  /// parameterized.
  explicit MelBanksOptions(int num_bins = 25)
      : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100),
        vtln_high(-500), debug_mel(false), htk_mode(false) {}

  /// Registers the user-visible options with `opts`.  htk_mode is
  /// deliberately not registered (see the comment on the member).
  void Register(OptionsItf *opts) {
    opts->Register("num-mel-bins", &num_bins,
                   "Number of triangular mel-frequency bins");
    opts->Register("low-freq", &low_freq,
                   "Low cutoff frequency for mel bins");
    opts->Register("high-freq", &high_freq,
                   "High cutoff frequency for mel bins (if <= 0, offset from Nyquist)");
    opts->Register("vtln-low", &vtln_low,
                   "Low inflection point in piecewise linear VTLN warping function");
    // The closing parenthesis was missing from this help text.
    opts->Register("vtln-high", &vtln_high,
                   "High inflection point in piecewise linear VTLN warping function"
                   " (if negative, offset from high-mel-freq)");
    opts->Register("debug-mel", &debug_mel,
                   "Print out debugging information for mel bin computation");
  }
};
/// Holds a set of mel filter-bank bins and applies them to FFT energies.
class MelBanks {
 public:
  /// Maps a mel-scale value back to a frequency.
  static inline BaseFloat InverseMelScale(BaseFloat mel_freq) {
    return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f);
  }

  /// Maps a frequency to the mel scale.
  static inline BaseFloat MelScale(BaseFloat freq) {
    return 1127.0f * logf(1.0f + freq / 700.0f);
  }

  /// VTLN warping of a frequency.  vtln_low_cutoff and vtln_high_cutoff are
  /// the discontinuities in the warp function; low_freq and high_freq are the
  /// lower and upper frequency cutoffs used in the mel computation.
  static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff,
                                BaseFloat vtln_high_cutoff,
                                BaseFloat low_freq,
                                BaseFloat high_freq,
                                BaseFloat vtln_warp_factor,
                                BaseFloat freq);

  /// Same parameters as VtlnWarpFreq, but takes a mel-scale frequency.
  static BaseFloat VtlnWarpMelFreq(BaseFloat vtln_low_cutoff,
                                   BaseFloat vtln_high_cutoff,
                                   BaseFloat low_freq,
                                   BaseFloat high_freq,
                                   BaseFloat vtln_warp_factor,
                                   BaseFloat mel_freq);

  MelBanks(const MelBanksOptions &opts,
           const FrameExtractionOptions &frame_opts,
           BaseFloat vtln_warp_factor);

  /// Compute mel energies (note: not log energies).
  /// At input, "fft_energies" contains the FFT energies (not log).
  void Compute(const VectorBase<BaseFloat> &fft_energies,
               VectorBase<BaseFloat> *mel_energies_out) const;

  /// Number of mel bins.
  int32 NumBins() const { return bins_.size(); }

  /// Central frequency of each bin; needed by the PLP code.
  const Vector<BaseFloat> &GetCenterFreqs() const { return center_freqs_; }

  /// Read-only access to the (first FFT bin, weights) pairs.
  const std::vector<std::pair<int32, Vector<BaseFloat> > > &GetBins() const {
    return bins_;
  }

  // Copy constructor.
  MelBanks(const MelBanks &other);

 private:
  // Assignment is disallowed: declared here and not meant to be used.
  MelBanks &operator=(const MelBanks &other);

  // Center frequencies of the bins, numbered 0 ... num_bins-1.
  // Returned by GetCenterFreqs().
  Vector<BaseFloat> center_freqs_;

  // One entry per bin, each a pair of
  // (the first nonzero fft-bin), (the vector of weights).
  std::vector<std::pair<int32, Vector<BaseFloat> > > bins_;

  bool debug_;
  bool htk_mode_;
};
// Compute liftering coefficients (scaling on cepstral coeffs).
// coeffs are numbered slightly differently from HTK: the zeroth
// index is C0, which is not affected.
// Q is the liftering parameter; *coeffs must already have the desired
// dimension, since every element up to coeffs->Dim() is overwritten.
void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs);
// Durbin's recursion - converts autocorrelation coefficients to the LPC
// pTmp - temporal place [n]
// pAC - autocorrelation coefficients [n + 1]
// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}})
// F(z) = 1 / (1 - A(z)), 1 is not stored in the denominator
// Returns the error term E of the recursion itself, not its logarithm
// (the definition starts from pAC[0] and returns the scaled value directly).
BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp);
// Compute LP coefficients from autocorrelation coefficients.
// lpc_out must already have dimension autocorr_in.Dim() - 1.
// Returns -Log(1 / E), where E is the value returned by Durbin().
BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
Vector<BaseFloat> *lpc_out);
// Converts n LPC coefficients in pLPC to n cepstral coefficients in pCepst.
void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst);
// Resizes *ans to mel_banks.NumBins() and fills it with one weight per bin,
// computed from the bin's center frequency.
void GetEqualLoudnessVector(const MelBanks &mel_banks,
Vector<BaseFloat> *ans);
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_MEL_COMPUTATIONS_H_

@ -1,125 +0,0 @@
// feat/online-feature-itf.h
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_ONLINE_FEATURE_ITF_H_
#define KALDI_FEAT_ONLINE_FEATURE_ITF_H_ 1
#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
namespace kaldi {
/// @ingroup Interfaces
/// @{
/**
OnlineFeatureInterface is an interface for online feature processing (it is
also usable in the offline setting, but currently we're not using it for
that). This is for use in the online2/ directory, and it supersedes the
interface in ../online/online-feat-input.h. We have a slightly different
model that puts more control in the hands of the calling thread, and won't
involve waiting on semaphores in the decoding thread.
This interface only specifies how the object *outputs* the features.
How it obtains the features, e.g. from a previous object or objects of type
OnlineFeatureInterface, is not specified in the interface and you will
likely define new constructors or methods in the derived type to do that.
You should appreciate that this interface is designed to allow random
access to features, as long as they are ready. That is, the user
can call GetFrame for any frame less than NumFramesReady(), and when
implementing a child class you must not make assumptions about the
order in which the user makes these calls.
*/
class OnlineFeatureInterface {
public:
virtual int32 Dim() const = 0; /// returns the feature dimension.
/// Returns the total number of frames, since the start of the utterance, that
/// are now available. In an online-decoding context, this will likely
/// increase with time as more data becomes available.
virtual int32 NumFramesReady() const = 0;
/// Returns true if this is the last frame. Frame indices are zero-based, so the
/// first frame is zero. IsLastFrame(-1) will return false, unless the file
/// is empty (which is a case that I'm not sure all the code will handle, so
/// be careful). This function may return false for some frame if
/// we haven't yet decided to terminate decoding, but later true if we decide
/// to terminate decoding. This function exists mainly to correctly handle
/// end effects in feature extraction, and is not a mechanism to determine how
/// many frames are in the decodable object (as it used to be, and for backward
/// compatibility, still is, in the Decodable interface).
virtual bool IsLastFrame(int32 frame) const = 0;
/// Gets the feature vector for this frame. Before calling this for a given
/// frame, it is assumed that you called NumFramesReady() and it returned a
/// number greater than "frame". Otherwise this call will likely crash with
/// an assert failure. This function is not declared const, in case there is
/// some kind of caching going on, but most of the time it shouldn't modify
/// the class.
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) = 0;
/// This is like GetFrame() but for a collection of frames. There is a
/// default implementation that just gets the frames one by one, but it
/// may be overridden for efficiency by child classes (since sometimes
/// it's more efficient to do things in a batch).
virtual void GetFrames(const std::vector<int32> &frames,
MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(static_cast<int32>(frames.size()) == feats->NumRows());
for (size_t i = 0; i < frames.size(); i++) {
SubVector<BaseFloat> feat(*feats, i);
GetFrame(frames[i], &feat);
}
}
// Returns frame shift in seconds. Helps to estimate duration from frame
// counts.
virtual BaseFloat FrameShiftInSeconds() const = 0;
/// Virtual destructor. Note: constructors that take another member of
/// type OnlineFeatureInterface are not expected to take ownership of
/// that pointer; the caller needs to keep track of that manually.
virtual ~OnlineFeatureInterface() { }
};
/// Abstract base class for "source" features such as MFCC, PLP or pitch
/// features, i.e. features computed directly from waveform data.  It adds
/// the waveform-input methods to OnlineFeatureInterface.
class OnlineBaseFeature: public OnlineFeatureInterface {
public:
/// This would be called from the application, when you get more wave data.
/// Note: the sampling_rate is typically only provided so the code can assert
/// that it matches the sampling rate expected in the options.
virtual void AcceptWaveform(BaseFloat sampling_rate,
const VectorBase<BaseFloat> &waveform) = 0;
/// InputFinished() tells the class you won't be providing any
/// more waveform. This will help flush out the last few frames
/// of delta or LDA features (it will typically affect the return value
/// of IsLastFrame).
virtual void InputFinished() = 0;
};
/// @}
} // namespace kaldi
#endif // KALDI_FEAT_ONLINE_FEATURE_ITF_H_

@ -1,679 +0,0 @@
// feat/online-feature.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// 2014 Yanqing Sun, Junjie Wang,
// Daniel Povey, Korbinian Riedhammer
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/online-feature.h"
#include "transform/cmvn.h"
namespace kaldi {
// A requested capacity of 0 is stored as -1; PushBack() compares the
// container size against this value before evicting.
RecyclingVector::RecyclingVector(int items_to_hold)
    : items_to_hold_(items_to_hold != 0 ? items_to_hold : -1),
      first_available_index_(0) {}
// Frees every vector still held; the container owns its pointers.
RecyclingVector::~RecyclingVector() {
  for (auto it = items_.begin(); it != items_.end(); ++it) {
    delete *it;
  }
}
// Returns the vector stored for the absolute frame `index`.  Raises an error
// if that frame has already been evicted.
Vector<BaseFloat> *RecyclingVector::At(int index) const {
  const bool already_recycled = index < first_available_index_;
  if (already_recycled) {
    KALDI_ERR << "Attempted to retrieve feature vector that was "
                 "already removed by the RecyclingVector (index = "
              << index << "; "
              << "first_available_index = " << first_available_index_ << "; "
              << "size = " << Size() << ")";
  }
  // Translate to a position inside items_; 'at' does size checking.
  return items_.at(index - first_available_index_);
}
// Appends `item`, taking ownership of it.  When the container already holds
// items_to_hold_ entries, the oldest one is deleted first and the index of
// the first available frame advances by one.
void RecyclingVector::PushBack(Vector<BaseFloat> *item) {
  const bool at_capacity = (items_.size() == items_to_hold_);
  if (at_capacity) {
    Vector<BaseFloat> *oldest = items_.front();
    delete oldest;
    items_.pop_front();
    ++first_available_index_;
  }
  items_.push_back(item);
}
// Total number of items ever pushed: evicted ones plus those still held.
int RecyclingVector::Size() const {
  const size_t held = items_.size();
  return first_available_index_ + held;
}
// Copies the stored feature vector for `frame` into *feat.
// features_.At() raises an error if that frame has already been recycled.
// (The stray ';' that followed the function body has been removed.)
template <class C>
void OnlineGenericBaseFeature<C>::GetFrame(int32 frame,
                                           VectorBase<BaseFloat> *feat) {
  feat->CopyFromVec(*(features_.At(frame)));
}
// Builds the feature computer and window function from `opts`, and sizes the
// feature store from opts.frame_opts.max_feature_vectors.
template <class C>
OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
    const typename C::Options &opts)
    : computer_(opts),
      window_function_(computer_.GetFrameOptions()),
      features_(opts.frame_opts.max_feature_vectors),
      input_finished_(false),
      waveform_offset_(0) {
  // RE the following assert: search for ONLINE_IVECTOR_LIMIT in
  // online-ivector-feature.cc.
  // The cast to uint32 (an unsigned type) makes -1 count as `very large`,
  // so -1 passes the check.
  KALDI_ASSERT(static_cast<uint32>(opts.frame_opts.max_feature_vectors) > 200);
}
// Ensures a resampler exists when the incoming sampling rate differs from the
// one configured in the frame options.
//  - If a resampler already exists, asserts that its input/output rates
//    still match.
//  - If the rates differ and the matching option permits it, creates a
//    LinearResample from `sampling_rate` to the expected rate.
//  - If the rates differ and resampling is not permitted, raises an error.
//
// Fix: the two options were swapped.  An input rate ABOVE the expected rate
// needs downsampling and is now gated by allow_downsample; an input rate
// BELOW it needs upsampling and is now gated by allow_upsample.
template <class C>
void OnlineGenericBaseFeature<C>::MaybeCreateResampler(
    BaseFloat sampling_rate) {
  const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions();
  BaseFloat expected_sampling_rate = frame_opts.samp_freq;
  if (resampler_ != nullptr) {
    KALDI_ASSERT(resampler_->GetInputSamplingRate() == sampling_rate);
    KALDI_ASSERT(resampler_->GetOutputSamplingRate() == expected_sampling_rate);
  } else if (((sampling_rate > expected_sampling_rate) &&
              frame_opts.allow_downsample) ||
             ((sampling_rate < expected_sampling_rate) &&
              frame_opts.allow_upsample)) {
    // The cutoff passed to the resampler is half of the lower of the two
    // rates.
    resampler_.reset(new LinearResample(
        sampling_rate, expected_sampling_rate,
        std::min(sampling_rate / 2, expected_sampling_rate / 2), 6));
  } else if (sampling_rate != expected_sampling_rate) {
    KALDI_ERR << "Sampling frequency mismatch, expected "
              << expected_sampling_rate << ", got " << sampling_rate
              << "\nPerhaps you want to use the options "
                 "--allow_{upsample,downsample}";
  }
}
// Marks the input as finished.  If a resampler is in use it is flushed
// first, and any samples it emits are appended to the buffered waveform;
// then the remaining features are computed.
template <class C>
void OnlineGenericBaseFeature<C>::InputFinished() {
  if (resampler_ != nullptr) {
    // There may be a few samples left once we flush the resampler_ object,
    // telling it that the file has finished.  This should rarely make any
    // difference.
    Vector<BaseFloat> combined;  // empty here: used as the flush input
    Vector<BaseFloat> flushed;
    resampler_->Resample(combined, true, &flushed);
    if (flushed.Dim() != 0) {
      // combined = [waveform_remainder_ | flushed]
      combined.Resize(waveform_remainder_.Dim() + flushed.Dim());
      if (waveform_remainder_.Dim() != 0) {
        combined.Range(0, waveform_remainder_.Dim())
            .CopyFromVec(waveform_remainder_);
      }
      combined.Range(waveform_remainder_.Dim(), flushed.Dim())
          .CopyFromVec(flushed);
      waveform_remainder_.Swap(&combined);
    }
  }
  input_finished_ = true;
  ComputeFeatures();
}
// Accepts a new chunk of waveform.  The chunk is resampled if a resampler is
// needed, appended to the buffered remainder, and then any newly available
// feature frames are computed.  Raises an error if called after
// InputFinished().
template <class C>
void OnlineGenericBaseFeature<C>::AcceptWaveform(
    BaseFloat sampling_rate, const VectorBase<BaseFloat> &original_waveform) {
  if (original_waveform.Dim() == 0) {
    return;  // Nothing to do.
  }
  if (input_finished_) {
    KALDI_ERR << "AcceptWaveform called after InputFinished() was called.";
  }

  Vector<BaseFloat> combined;
  Vector<BaseFloat> resampled;
  MaybeCreateResampler(sampling_rate);

  // Choose the samples to append: the input itself, or its resampled version.
  const VectorBase<BaseFloat> *to_append = &original_waveform;
  if (resampler_ != nullptr) {
    resampler_->Resample(original_waveform, false, &resampled);
    to_append = &resampled;
  }

  // combined = [waveform_remainder_ | *to_append]
  combined.Resize(waveform_remainder_.Dim() + to_append->Dim());
  if (waveform_remainder_.Dim() != 0) {
    combined.Range(0, waveform_remainder_.Dim())
        .CopyFromVec(waveform_remainder_);
  }
  combined.Range(waveform_remainder_.Dim(), to_append->Dim())
      .CopyFromVec(*to_append);
  waveform_remainder_.Swap(&combined);

  ComputeFeatures();
}
// Computes every feature frame that can now be produced from the buffered
// waveform, appends each one to features_, and then drops the leading samples
// of waveform_remainder_ that no future frame will need.
template <class C>
void OnlineGenericBaseFeature<C>::ComputeFeatures() {
const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions();
// Samples seen so far: those already discarded (waveform_offset_) plus those
// still buffered.
int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim();
int32 num_frames_old = features_.Size(),
num_frames_new = NumFrames(num_samples_total, frame_opts,
input_finished_);
KALDI_ASSERT(num_frames_new >= num_frames_old);
Vector<BaseFloat> window;
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
// Only frames [num_frames_old, num_frames_new) are new; earlier ones were
// computed by previous calls.
for (int32 frame = num_frames_old; frame < num_frames_new; frame++) {
BaseFloat raw_log_energy = 0.0;
ExtractWindow(waveform_offset_, waveform_remainder_, frame,
frame_opts, window_function_, &window,
need_raw_log_energy ? &raw_log_energy : NULL);
// NOTE(review): this_feature is a raw pointer until PushBack() takes it; if
// Compute() can throw, the vector would leak -- confirm.
Vector<BaseFloat> *this_feature = new Vector<BaseFloat>(computer_.Dim(),
kUndefined);
// note: this online feature-extraction code does not support VTLN.
BaseFloat vtln_warp = 1.0;
computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature);
features_.PushBack(this_feature);
}
// OK, we will now discard any portion of the signal that will not be
// necessary to compute frames in the future.
int64 first_sample_of_next_frame = FirstSampleOfFrame(num_frames_new,
frame_opts);
// NOTE(review): int64 difference narrowed to int32 here; presumably small in
// practice -- confirm.
int32 samples_to_discard = first_sample_of_next_frame - waveform_offset_;
if (samples_to_discard > 0) {
// discard the leftmost part of the waveform that we no longer need.
int32 new_num_samples = waveform_remainder_.Dim() - samples_to_discard;
if (new_num_samples <= 0) {
// odd, but we'll try to handle it.
waveform_offset_ += waveform_remainder_.Dim();
waveform_remainder_.Resize(0);
} else {
// Keep only the tail of the buffer and advance the offset by the number of
// samples dropped.
Vector<BaseFloat> new_remainder(new_num_samples);
new_remainder.CopyFromVec(waveform_remainder_.Range(samples_to_discard,
new_num_samples));
waveform_offset_ += samples_to_discard;
waveform_remainder_.Swap(&new_remainder);
}
}
}
// Explicitly instantiate the templates defined above for the MFCC, PLP and
// filterbank computer classes, so their member definitions are emitted in
// this translation unit.
template class OnlineGenericBaseFeature<MfccComputer>;
template class OnlineGenericBaseFeature<PlpComputer>;
template class OnlineGenericBaseFeature<FbankComputer>;
// Copy constructor: copies the speaker stats, the global stats and the
// frozen state from `other`.
OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other)
    : speaker_cmvn_stats(other.speaker_cmvn_stats),
      global_cmvn_stats(other.global_cmvn_stats),
      frozen_state(other.frozen_state) {}
// Serializes the state to `os` (binary or text, per `binary`).
// Layout: an opening <OnlineCmvnState> token, then each of the three members
// preceded by its own tag, then the closing </OnlineCmvnState> token.
// Read() expects exactly this order.
void OnlineCmvnState::Write(std::ostream &os, bool binary) const {
WriteToken(os, binary, "<OnlineCmvnState>"); // magic string.
WriteToken(os, binary, "<SpeakerCmvnStats>");
speaker_cmvn_stats.Write(os, binary);
WriteToken(os, binary, "<GlobalCmvnStats>");
global_cmvn_stats.Write(os, binary);
WriteToken(os, binary, "<FrozenState>");
frozen_state.Write(os, binary);
WriteToken(os, binary, "</OnlineCmvnState>");
}
// Deserializes the state from `is`, expecting the tokens and members in the
// same order in which Write() emits them.
void OnlineCmvnState::Read(std::istream &is, bool binary) {
ExpectToken(is, binary, "<OnlineCmvnState>"); // magic string.
ExpectToken(is, binary, "<SpeakerCmvnStats>");
speaker_cmvn_stats.Read(is, binary);
ExpectToken(is, binary, "<GlobalCmvnStats>");
global_cmvn_stats.Read(is, binary);
ExpectToken(is, binary, "<FrozenState>");
frozen_state.Read(is, binary);
ExpectToken(is, binary, "</OnlineCmvnState>");
}
// Constructs an online CMVN object on top of `src`, starting from a given
// previous state.  The temporaries are sized from src->Dim().
// Raises an error if opts.skip_dims is not a colon-separated integer list.
OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts,
                       const OnlineCmvnState &cmvn_state,
                       OnlineFeatureInterface *src)
    : opts_(opts),
      temp_stats_(2, src->Dim() + 1),
      temp_feats_(src->Dim()),
      temp_feats_dbl_(src->Dim()),
      src_(src) {
  SetState(cmvn_state);
  const bool skip_dims_ok =
      SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_);
  if (!skip_dims_ok) {
    KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of "
              << "integers)";
  }
}
// Constructs an online CMVN object on top of `src` without setting a
// previous state.  The temporaries are sized from src->Dim().
// Raises an error if opts.skip_dims is not a colon-separated integer list.
OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts,
                       OnlineFeatureInterface *src)
    : opts_(opts),
      temp_stats_(2, src->Dim() + 1),
      temp_feats_(src->Dim()),
      temp_feats_dbl_(src->Dim()),
      src_(src) {
  const bool skip_dims_ok =
      SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_);
  if (!skip_dims_ok) {
    KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of "
              << "integers)";
  }
}
// Finds the most recent frame <= `frame` whose stats are cached, copies those
// stats into *stats and writes the frame index to *cached_frame.  If nothing
// is cached at all, sets *cached_frame to -1 and zeroes *stats.
//
// Two caches are searched: first the ring buffer (recent frames), then
// cached_stats_modulo_ (every opts_.modulus-th frame).
//
// Fix: the ring-buffer search is skipped when the ring buffer is empty.
// InitRingBufferIfNeeded() leaves it empty when opts_.ring_buffer_size <= 0,
// and the old code then evaluated `t % opts_.ring_buffer_size`, i.e. a modulo
// by zero.  The ring's actual size is used for indexing, as CacheFrame()
// already does.
void OnlineCmvn::GetMostRecentCachedFrame(int32 frame,
                                          int32 *cached_frame,
                                          MatrixBase<double> *stats) {
  KALDI_ASSERT(frame >= 0);
  InitRingBufferIfNeeded();
  if (!cached_stats_ring_.empty()) {
    const int32 ring_size = static_cast<int32>(cached_stats_ring_.size());
    // Look for a cached frame on a previous frame as close as possible in
    // time to "frame".  Return if we get one.
    for (int32 t = frame; t >= 0 && t >= frame - ring_size; t--) {
      if (t % opts_.modulus == 0) {
        // If this frame should be cached in cached_stats_modulo_, then
        // we'll look there, and we won't go back any further in time.
        break;
      }
      int32 index = t % ring_size;
      if (cached_stats_ring_[index].first == t) {
        *cached_frame = t;
        stats->CopyFromMat(cached_stats_ring_[index].second);
        return;
      }
    }
  }
  // Fall back to the modulo cache, clamping to its last entry if the entry
  // for this frame has not been stored yet.
  int32 n = frame / opts_.modulus;
  if (n >= cached_stats_modulo_.size()) {
    if (cached_stats_modulo_.size() == 0) {
      *cached_frame = -1;
      stats->SetZero();
      return;
    } else {
      n = static_cast<int32>(cached_stats_modulo_.size() - 1);
    }
  }
  *cached_frame = n * opts_.modulus;
  KALDI_ASSERT(cached_stats_modulo_[n] != NULL);
  stats->CopyFromMat(*(cached_stats_modulo_[n]));
}
// Initialize ring buffer for caching stats.
void OnlineCmvn::InitRingBufferIfNeeded() {
if (cached_stats_ring_.empty() && opts_.ring_buffer_size > 0) {
Matrix<double> temp(2, this->Dim() + 1);
cached_stats_ring_.resize(opts_.ring_buffer_size,
std::pair<int32, Matrix<double> >(-1, temp));
}
}
// Stores `stats` for `frame` in one of the two caches: frames that are a
// multiple of opts_.modulus go into cached_stats_modulo_, all others go into
// the ring buffer (if one exists).
void OnlineCmvn::CacheFrame(int32 frame, const MatrixBase<double> &stats) {
  KALDI_ASSERT(frame >= 0);

  if (frame % opts_.modulus != 0) {
    // Store in the ring buffer.
    InitRingBufferIfNeeded();
    if (cached_stats_ring_.empty()) {
      return;
    }
    int32 slot = frame % cached_stats_ring_.size();
    cached_stats_ring_[slot].first = frame;
    cached_stats_ring_[slot].second.CopyFromMat(stats);
    return;
  }

  // Store in cached_stats_modulo_.
  int32 n = frame / opts_.modulus;
  if (n >= cached_stats_modulo_.size()) {
    // The following assert is a limitation on in what order you can call
    // CacheFrame.  Fortunately the calling code always calls it in sequence,
    // which it has to because you need a previous frame to compute the
    // current one.
    KALDI_ASSERT(n == cached_stats_modulo_.size());
    cached_stats_modulo_.push_back(new Matrix<double>(stats));
    return;
  }

  KALDI_WARN << "Did not expect to reach this part of code.";
  // Do what seems right, but we shouldn't get here.
  cached_stats_modulo_[n]->CopyFromMat(stats);
}
// Frees the matrices owned by cached_stats_modulo_.
OnlineCmvn::~OnlineCmvn() {
  for (auto *cached : cached_stats_modulo_) {
    delete cached;
  }
  cached_stats_modulo_.clear();
}
// Computes the raw sliding-window CMVN stats for `frame` into *stats_out.
// Starts from the nearest cached stats at or before `frame` and advances one
// frame at a time, adding the incoming frame and subtracting the frame that
// falls out of the opts_.cmn_window-long window.  Each intermediate result is
// cached.
// Layout used below: row 0 holds per-dimension sums with the frame count in
// column `dim`; row 1 holds sums of squares and is only updated when
// opts_.normalize_variance is set.
void OnlineCmvn::ComputeStatsForFrame(int32 frame,
MatrixBase<double> *stats_out) {
KALDI_ASSERT(frame >= 0 && frame < src_->NumFramesReady());
int32 dim = this->Dim(), cur_frame;
// cur_frame becomes the index of the cached frame found, or -1 if none.
GetMostRecentCachedFrame(frame, &cur_frame, stats_out);
// Reuse member temporaries rather than allocating on each call.
Vector<BaseFloat> &feats(temp_feats_);
Vector<double> &feats_dbl(temp_feats_dbl_);
while (cur_frame < frame) {
cur_frame++;
// Add the new frame to the window.
src_->GetFrame(cur_frame, &feats);
feats_dbl.CopyFromVec(feats);
stats_out->Row(0).Range(0, dim).AddVec(1.0, feats_dbl);
if (opts_.normalize_variance)
stats_out->Row(1).Range(0, dim).AddVec2(1.0, feats_dbl);
(*stats_out)(0, dim) += 1.0;
// it's a sliding buffer; a frame at the back may be
// leaving the buffer so we have to subtract that.
int32 prev_frame = cur_frame - opts_.cmn_window;
if (prev_frame >= 0) {
// we need to subtract frame prev_f from the stats.
src_->GetFrame(prev_frame, &feats);
feats_dbl.CopyFromVec(feats);
stats_out->Row(0).Range(0, dim).AddVec(-1.0, feats_dbl);
if (opts_.normalize_variance)
stats_out->Row(1).Range(0, dim).AddVec2(-1.0, feats_dbl);
(*stats_out)(0, dim) -= 1.0;
}
CacheFrame(cur_frame, (*stats_out));
}
}
// static
// Pads the windowed stats in *stats with scaled speaker stats, and then
// scaled global stats, until the frame count (element (0, dim)) reaches
// opts.cmn_window.  Returns early as soon as the count is sufficient.
// Raises an error if more frames are needed but global_stats is empty.
void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
const MatrixBase<double> &global_stats,
const OnlineCmvnOptions &opts,
MatrixBase<double> *stats) {
if (speaker_stats.NumRows() == 2 && !opts.normalize_variance) {
// this is just for efficiency: don't operate on the variance if it's not
// needed.
int32 cols = speaker_stats.NumCols(); // dim + 1
// Recurse on the first rows only; the recursive call sees 1-row speaker
// stats, so this branch is not taken again.
SubMatrix<double> stats_temp(*stats, 0, 1, 0, cols);
SmoothOnlineCmvnStats(speaker_stats.RowRange(0, 1),
global_stats.RowRange(0, 1),
opts, &stats_temp);
return;
}
int32 dim = stats->NumCols() - 1;
double cur_count = (*stats)(0, dim);
// If count exceeded cmn_window it would be an error in how "window_stats"
// was accumulated.
KALDI_ASSERT(cur_count <= 1.001 * opts.cmn_window);
if (cur_count >= opts.cmn_window)
return;
if (speaker_stats.NumRows() != 0) { // if we have speaker stats..
// Frames to borrow from the speaker stats: the shortfall, capped by
// opts.speaker_frames and by the number of speaker frames available.
double count_from_speaker = opts.cmn_window - cur_count,
speaker_count = speaker_stats(0, dim);
if (count_from_speaker > opts.speaker_frames)
count_from_speaker = opts.speaker_frames;
if (count_from_speaker > speaker_count)
count_from_speaker = speaker_count;
// count_from_speaker > 0 implies speaker_count > 0 (it was capped above), so
// the division below is safe.
if (count_from_speaker > 0.0)
stats->AddMat(count_from_speaker / speaker_count,
speaker_stats);
cur_count = (*stats)(0, dim);
}
if (cur_count >= opts.cmn_window)
return;
if (global_stats.NumRows() != 0) {
// Frames to borrow from the global stats: the remaining shortfall, capped
// by opts.global_frames.
double count_from_global = opts.cmn_window - cur_count,
global_count = global_stats(0, dim);
KALDI_ASSERT(global_count > 0.0);
if (count_from_global > opts.global_frames)
count_from_global = opts.global_frames;
if (count_from_global > 0.0)
stats->AddMat(count_from_global / global_count,
global_stats);
} else {
KALDI_ERR << "Global CMN stats are required";
}
}
// Fetches frame `frame` from the source and normalizes it in place in *feat.
// The stats are either the frozen state (if one is set) or the smoothed
// sliding-window stats for this frame.
void OnlineCmvn::GetFrame(int32 frame,
                          VectorBase<BaseFloat> *feat) {
  src_->GetFrame(frame, feat);
  KALDI_ASSERT(feat->Dim() == this->Dim());
  int32 dim = feat->Dim();

  Matrix<double> &stats(temp_stats_);
  stats.Resize(2, dim + 1, kUndefined);  // Will do nothing if size was correct.

  const bool state_is_frozen = (frozen_state_.NumRows() != 0);
  if (!state_is_frozen) {
    // First get the raw CMVN stats (this involves caching), then smooth them.
    this->ComputeStatsForFrame(frame, &stats);
    SmoothOnlineCmvnStats(orig_state_.speaker_cmvn_stats,
                          orig_state_.global_cmvn_stats,
                          opts_,
                          &stats);
  } else {
    // The CMVN state has been frozen.
    stats.CopyFromMat(frozen_state_);
  }

  if (!skip_dims_.empty()) {
    FakeStatsForSomeDims(skip_dims_, &stats);
  }

  // ApplyCmvn (declared in ../transform/cmvn.h) takes a matrix, so wrap the
  // vector's data as a one-row matrix:
  // 1 row; num-cols == dim; stride == dim.
  SubMatrix<BaseFloat> feat_mat(feat->Data(), 1, dim, dim);
  if (!opts_.normalize_mean) {
    KALDI_ASSERT(!opts_.normalize_variance);
    return;
  }
  ApplyCmvn(stats, opts_.normalize_variance, &feat_mat);
}
// Computes the smoothed CMVN stats as of `cur_frame` and stores them in
// frozen_state_; GetFrame() uses that state whenever it is non-empty.
void OnlineCmvn::Freeze(int32 cur_frame) {
  const int32 feat_dim = this->Dim();
  Matrix<double> frozen(2, feat_dim + 1);
  // Raw CMVN stats for the frame...
  this->ComputeStatsForFrame(cur_frame, &frozen);
  // ...then smoothed with the speaker and global stats.
  SmoothOnlineCmvnStats(orig_state_.speaker_cmvn_stats,
                        orig_state_.global_cmvn_stats,
                        opts_,
                        &frozen);
  this->frozen_state_ = frozen;
}
// Writes to *state_out the original state, with the speaker stats updated by
// accumulating frames 0..cur_frame from the source, plus any frozen state.
void OnlineCmvn::GetState(int32 cur_frame,
                          OnlineCmvnState *state_out) {
  *state_out = this->orig_state_;

  {  // This block updates state_out->speaker_cmvn_stats
    int32 dim = this->Dim();
    auto &speaker_stats = state_out->speaker_cmvn_stats;
    if (speaker_stats.NumRows() == 0) {
      speaker_stats.Resize(2, dim + 1);
    }
    Vector<BaseFloat> frame_feat(dim);
    Vector<double> frame_feat_dbl(dim);
    for (int32 t = 0; t <= cur_frame; ++t) {
      src_->GetFrame(t, &frame_feat);
      frame_feat_dbl.CopyFromVec(frame_feat);
      speaker_stats(0, dim) += 1.0;
      speaker_stats.Row(0).Range(0, dim).AddVec(1.0, frame_feat_dbl);
      speaker_stats.Row(1).Range(0, dim).AddVec2(1.0, frame_feat_dbl);
    }
  }

  // Store any frozen state (the effect of the user possibly having called
  // Freeze()).
  state_out->frozen_state = frozen_state_;
}
// Installs `cmvn_state` as the original state and adopts its frozen state.
// Must be called before any stats have been cached.
void OnlineCmvn::SetState(const OnlineCmvnState &cmvn_state) {
  KALDI_ASSERT(cached_stats_modulo_.empty() &&
               "You cannot call SetState() after processing data.");
  frozen_state_ = cmvn_state.frozen_state;
  orig_state_ = cmvn_state;
}
// Number of spliced frames that can be produced.  Once the source reports
// its final frame, all source frames are ready; before that, the last
// right_context_ frames are held back.
int32 OnlineSpliceFrames::NumFramesReady() const {
  const int32 src_frames = src_->NumFramesReady();
  const bool src_finished =
      src_frames > 0 && src_->IsLastFrame(src_frames - 1);
  if (src_finished) {
    return src_frames;
  }
  return std::max<int32>(0, src_frames - right_context_);
}
// Fills *feat with the source frames frame-left_context_ ..
// frame+right_context_ laid out consecutively.  Frame indices outside the
// available range are clamped to the first or last available source frame.
void OnlineSpliceFrames::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
  KALDI_ASSERT(left_context_ >= 0 && right_context_ >= 0);
  KALDI_ASSERT(frame >= 0 && frame < NumFramesReady());
  int32 dim_in = src_->Dim();
  KALDI_ASSERT(feat->Dim() == dim_in * (1 + left_context_ + right_context_));

  const int32 num_src_frames = src_->NumFramesReady();
  const int32 first_t = frame - left_context_;
  const int32 last_t = frame + right_context_;
  for (int32 t = first_t; t <= last_t; ++t) {
    // Clamp into [0, num_src_frames - 1].
    int32 t_clamped = (t < 0) ? 0 : t;
    if (t_clamped >= num_src_frames) {
      t_clamped = num_src_frames - 1;
    }
    // Slot within the output: 0 for the left-most frame, increasing to the
    // right.
    const int32 slot = t - first_t;
    SubVector<BaseFloat> part(*feat, slot * dim_in, dim_in);
    src_->GetFrame(t_clamped, &part);
  }
}
// Splits `transform` into a linear term and an offset.  A transform with as
// many columns as the source dimension is purely linear; one extra column is
// taken as the offset of an affine transform.  Any other width is an error.
OnlineTransform::OnlineTransform(const MatrixBase<BaseFloat> &transform,
                                 OnlineFeatureInterface *src)
    : src_(src) {
  const int32 src_dim = src_->Dim();
  const int32 num_cols = transform.NumCols();
  if (num_cols == src_dim) {
    // Linear transform.
    linear_term_ = transform;
    offset_.Resize(transform.NumRows());  // Resize() will zero it.
  } else if (num_cols == src_dim + 1) {
    // Affine transform: the last column is the offset.
    linear_term_ = transform.Range(0, transform.NumRows(), 0, src_dim);
    offset_.Resize(transform.NumRows());
    offset_.CopyColFromMat(transform, src_dim);
  } else {
    KALDI_ERR << "Dimension mismatch: source features have dimension "
              << src_dim << " and LDA #cols is " << transform.NumCols();
  }
}
// Computes *feat = offset_ + linear_term_ * (source frame).
void OnlineTransform::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
  const int32 input_dim = linear_term_.NumCols();
  Vector<BaseFloat> src_feat(input_dim);
  src_->GetFrame(frame, &src_feat);
  // Start from the offset, then add the linear part on top of it.
  feat->CopyFromVec(offset_);
  feat->AddMatVec(1.0, linear_term_, kNoTrans, src_feat, 1.0);
}
void OnlineTransform::GetFrames(
const std::vector<int32> &frames, MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(static_cast<int32>(frames.size()) == feats->NumRows());
int32 num_frames = feats->NumRows(),
input_dim = linear_term_.NumCols();
Matrix<BaseFloat> input_feats(num_frames, input_dim, kUndefined);
src_->GetFrames(frames, &input_feats);
feats->CopyRowsFromVec(offset_);
feats->AddMatMat(1.0, input_feats, kNoTrans, linear_term_, kTrans, 1.0);
}
// Output dimension: the source dimension times (1 + opts_.order).
int32 OnlineDeltaFeature::Dim() const {
  const int32 base_dim = src_->Dim();
  const int32 num_blocks = 1 + opts_.order;
  return base_dim * num_blocks;
}
// Number of output frames that can be produced.  Once the source reports its
// final frame all frames are ready; before that, the last
// opts_.order * opts_.window frames are held back.
int32 OnlineDeltaFeature::NumFramesReady() const {
  const int32 src_frames = src_->NumFramesReady();
  // "context" is the number of frames on the left or (more relevant here)
  // right which we need in order to produce the output.
  const int32 context = opts_.order * opts_.window;
  if (src_frames > 0 && src_->IsLastFrame(src_frames - 1)) {
    return src_frames;
  }
  return std::max<int32>(0, src_frames - context);
}
// Produces the output for `frame` by copying the source frames within the
// required context into a temporary matrix and handing it to
// delta_features_.Process().
void OnlineDeltaFeature::GetFrame(int32 frame,
                                  VectorBase<BaseFloat> *feat) {
  KALDI_ASSERT(frame >= 0 && frame < NumFramesReady());
  KALDI_ASSERT(feat->Dim() == Dim());

  // Range of source frames wanted, truncated to what is actually available.
  const int32 context = opts_.order * opts_.window;
  const int32 src_frames_ready = src_->NumFramesReady();
  int32 left_frame = frame - context;
  int32 right_frame = frame + context;
  if (left_frame < 0) {
    left_frame = 0;
  }
  if (right_frame >= src_frames_ready) {
    right_frame = src_frames_ready - 1;
  }
  KALDI_ASSERT(right_frame >= left_frame);

  // Copy the wanted source frames into the rows of a temporary matrix.
  const int32 num_rows = right_frame + 1 - left_frame;
  const int32 src_dim = src_->Dim();
  Matrix<BaseFloat> context_feats(num_rows, src_dim);
  for (int32 row = 0; row < num_rows; ++row) {
    SubVector<BaseFloat> row_view(context_feats, row);
    src_->GetFrame(left_frame + row, &row_view);
  }

  // Offset of frame "frame" within context_feats.
  const int32 frame_row = frame - left_frame;
  delta_features_.Process(context_feats, frame_row, feat);
}
// Keeps a pointer to the source and a copy of the options; delta_features_
// is constructed from the same options.
OnlineDeltaFeature::OnlineDeltaFeature(const DeltaFeaturesOptions &opts,
                                       OnlineFeatureInterface *src)
    : src_(src),
      opts_(opts),
      delta_features_(opts) {}
// Returns the feature for 'frame', taking it from the cache when present and
// otherwise fetching it from src_ and caching it first.
void OnlineCacheFeature::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
  KALDI_ASSERT(frame >= 0);
  const bool in_cache = static_cast<size_t>(frame) < cache_.size() &&
                        cache_[frame] != NULL;
  if (!in_cache) {
    // Grow the cache so that index 'frame' exists; new slots start out empty.
    if (static_cast<size_t>(frame) >= cache_.size()) {
      cache_.resize(frame + 1, NULL);
    }
    Vector<BaseFloat> *fresh = new Vector<BaseFloat>(this->Dim());
    cache_[frame] = fresh;
    // The following call will crash if frame "frame" is not ready.
    src_->GetFrame(frame, fresh);
  }
  feat->CopyFromVec(*(cache_[frame]));
}
void OnlineCacheFeature::GetFrames(
const std::vector<int32> &frames, MatrixBase<BaseFloat> *feats) {
int32 num_frames = frames.size();
// non_cached_frames will be the subset of 't' values in 'frames' which were
// not previously cached, which we therefore need to get from src_.
std::vector<int32> non_cached_frames;
// 'non_cached_indexes' stores the indexes 'i' into 'frames' corresponding to
// the corresponding frames in 'non_cached_frames'.
std::vector<int32> non_cached_indexes;
non_cached_frames.reserve(frames.size());
non_cached_indexes.reserve(frames.size());
for (int32 i = 0; i < num_frames; i++) {
int32 t = frames[i];
if (static_cast<size_t>(t) < cache_.size() && cache_[t] != NULL) {
feats->Row(i).CopyFromVec(*(cache_[t]));
} else {
non_cached_frames.push_back(t);
non_cached_indexes.push_back(i);
}
}
if (non_cached_frames.empty())
return;
int32 num_non_cached_frames = non_cached_frames.size(),
dim = this->Dim();
Matrix<BaseFloat> non_cached_feats(num_non_cached_frames, dim,
kUndefined);
src_->GetFrames(non_cached_frames, &non_cached_feats);
for (int32 i = 0; i < num_non_cached_frames; i++) {
int32 t = non_cached_frames[i];
if (static_cast<size_t>(t) < cache_.size() && cache_[t] != NULL) {
// We can reach this point due to repeat indexes in 'non_cached_frames'.
feats->Row(non_cached_indexes[i]).CopyFromVec(*(cache_[t]));
} else {
SubVector<BaseFloat> this_feat(non_cached_feats, i);
feats->Row(non_cached_indexes[i]).CopyFromVec(this_feat);
if (static_cast<size_t>(t) >= cache_.size())
cache_.resize(t + 1, NULL);
cache_[t] = new Vector<BaseFloat>(this_feat);
}
}
}
// Frees every cached vector and empties the cache.
void OnlineCacheFeature::ClearCache() {
  const size_t count = cache_.size();
  for (size_t k = 0; k != count; ++k) {
    delete cache_[k];  // deleting a NULL slot is a no-op.
  }
  cache_.clear();
}
// Fills 'feat' with the concatenation of the two source features for
// 'frame': the first src1_->Dim() elements come from src1_, and the
// following src2_->Dim() elements come from src2_.
void OnlineAppendFeature::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
  KALDI_ASSERT(feat->Dim() == Dim());
  // The sub-vectors are views into 'feat', so each source writes its part
  // directly into the output.
  SubVector<BaseFloat> feat1(*feat, 0, src1_->Dim());
  SubVector<BaseFloat> feat2(*feat, src1_->Dim(), src2_->Dim());
  src1_->GetFrame(frame, &feat1);
  src2_->GetFrame(frame, &feat2);
}
} // namespace kaldi

@ -1,632 +0,0 @@
// feat/online-feature.h
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// 2014 Yanqing Sun, Junjie Wang,
// Daniel Povey, Korbinian Riedhammer
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_ONLINE_FEATURE_H_
#define KALDI_FEAT_ONLINE_FEATURE_H_
#include <string>
#include <vector>
#include <deque>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
#include "feat/feature-functions.h"
#include "feat/feature-mfcc.h"
#include "feat/feature-plp.h"
#include "feat/feature-fbank.h"
#include "feat/online-feature-itf.h"
namespace kaldi {
/// @addtogroup onlinefeat OnlineFeatureExtraction
/// @{
/// This class serves as a storage for feature vectors with an option to limit
/// the memory usage by removing old elements. The deleted frames indices are
/// "remembered" so that regardless of the MAX_ITEMS setting, the user always
/// provides the indices as if no deletion was being performed.
/// This is useful when processing very long recordings which would otherwise
/// cause the memory to eventually blow up when the features are not being removed.
class RecyclingVector {
 public:
  /// 'items_to_hold' limits how many items are kept; with the default of -1
  /// no elements are ever removed.
  RecyclingVector(int items_to_hold = -1);

  // This collection owns the vectors handed to PushBack() and declares a
  // destructor, so a member-wise copy would leave two collections owning the
  // same pointers. Copying is therefore disabled.
  RecyclingVector(const RecyclingVector &) = delete;
  RecyclingVector &operator=(const RecyclingVector &) = delete;

  /// Returns the item at 'index'. Ownership stays with this collection: the
  /// caller must not delete the returned pointer.
  Vector<BaseFloat> *At(int index) const;

  /// Appends 'item'. Ownership passes to this collection: the caller must
  /// not delete the item afterwards.
  void PushBack(Vector<BaseFloat> *item);

  /// Returns the size as if no "recycling" had happened, i.e. the number of
  /// times PushBack() has been called.
  int Size() const;

  ~RecyclingVector();

 private:
  std::deque<Vector<BaseFloat>*> items_;  // the items currently held (owned).
  int items_to_hold_;  // value given to the constructor.
  // NOTE(review): presumably the caller-visible index of items_.front();
  // confirm against the implementation file.
  int first_available_index_;
};
/// This is a templated class for online feature extraction;
/// it's templated on a class like MfccComputer or PlpComputer
/// that does the basic feature extraction.
template<class C>
class OnlineGenericBaseFeature: public OnlineBaseFeature {
 public:
  //
  // Part 1: functions that belong to the online-feature interface.
  //

  // The feature dimension is whatever the underlying computer produces.
  virtual int32 Dim() const { return computer_.Dim(); }

  // Can only return true after InputFinished() has been called, and then
  // only for the final frame.
  virtual bool IsLastFrame(int32 frame) const {
    if (!input_finished_) return false;
    return frame == NumFramesReady() - 1;
  }

  // Converts the frame shift from the computer's frame options
  // (milliseconds) to seconds.
  virtual BaseFloat FrameShiftInSeconds() const {
    return computer_.GetFrameOptions().frame_shift_ms / 1000.0f;
  }

  virtual int32 NumFramesReady() const { return features_.Size(); }

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  //
  // Part 2: functions that are not part of the interface.
  //

  // Builds the object from the options class of the computer type C.
  explicit OnlineGenericBaseFeature(const typename C::Options &opts);

  // To be called by the application whenever more wave data arrives. The
  // sampling_rate argument exists only so the code can assert that it
  // matches the sampling rate expected in the options.
  virtual void AcceptWaveform(BaseFloat sampling_rate,
                              const VectorBase<BaseFloat> &waveform);

  // Signals that no more waveform will be provided. This helps flush out the
  // last frame or two of features when snip-edges == false, and it also
  // affects what IsLastFrame() returns.
  virtual void InputFinished();

 private:
  // Computes every additional feature frame that can be obtained from
  // 'waveform_remainder_' (which, at the time of the call, may hold more
  // than a remainder-sized amount because AcceptWaveform() appends to it
  // first). The new frames are added to features_; samples that are no
  // longer needed are shifted off waveform_remainder_ and waveform_offset_
  // is advanced by the same amount.
  void ComputeFeatures();

  void MaybeCreateResampler(BaseFloat sampling_rate);

  C computer_;  // does the actual MFCC, PLP or filterbank computation.

  // Resampler for the case where the input sampling frequency differs from
  // the expected sampling rate.
  std::unique_ptr<LinearResample> resampler_;

  FeatureWindowFunction window_function_;

  // The Mfcc, Plp or Fbank features computed so far.
  RecyclingVector features_;

  // Set once the user has called InputFinished().
  bool input_finished_;

  // Sampling frequency taken from the config; the supplied waveform should
  // have the same one.
  BaseFloat sampling_frequency_;

  // Number of waveform samples already discarded, i.e. the samples that
  // came before 'waveform_remainder_'.
  int64 waveform_offset_;

  // Short piece of waveform kept after all whole frames have been
  // extracted (whatever length the next phase of computation will need).
  Vector<BaseFloat> waveform_remainder_;
};
typedef OnlineGenericBaseFeature<MfccComputer> OnlineMfcc;
typedef OnlineGenericBaseFeature<PlpComputer> OnlinePlp;
typedef OnlineGenericBaseFeature<FbankComputer> OnlineFbank;
/// This class takes a Matrix<BaseFloat> and wraps it as an
/// OnlineFeatureInterface: this can be useful where some earlier stage of
/// feature processing has been done offline but you want to use part of the
/// online pipeline.
class OnlineMatrixFeature: public OnlineFeatureInterface {
 public:
  /// Caution: only a const reference to 'mat' is stored, so the matrix must
  /// outlive this object.
  explicit OnlineMatrixFeature(const MatrixBase<BaseFloat> &mat): mat_(mat) { }

  // One column per feature dimension.
  virtual int32 Dim() const { return mat_.NumCols(); }

  // Hard-coded to 0.01 seconds.
  virtual BaseFloat FrameShiftInSeconds() const {
    return 0.01f;
  }

  // One row per frame; all of them are available immediately.
  virtual int32 NumFramesReady() const { return mat_.NumRows(); }

  // Copies row 'frame' of the wrapped matrix into 'feat'.
  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
    feat->CopyFromVec(mat_.Row(frame));
  }

  // The last frame is the last row of the matrix.
  virtual bool IsLastFrame(int32 frame) const {
    return (frame + 1 == mat_.NumRows());
  }

 private:
  const MatrixBase<BaseFloat> &mat_;  // not owned; see constructor note.
};
// Note the similarity with SlidingWindowCmnOptions, but there
// are also differences. One which doesn't appear in the config
// itself, because it's a difference between the setups, is that
// in OnlineCmn, we carry over data from the previous utterance,
// or, if no previous utterance is available, from global stats,
// or, if previous utterances are available but the total amount
// of data is less than prev_frames, we pad with up to "global_frames"
// frames from the global stats.
struct OnlineCmvnOptions {
  int32 cmn_window;        // frames of sliding context ("cmn-window").
  int32 speaker_frames;    // must be <= cmn_window
  int32 global_frames;     // must be <= speaker_frames.
  bool normalize_mean;     // Must be true if normalize_variance==true.
  bool normalize_variance;
  // Not configurable from the command line; relates to how the class
  // computes the cmvn internally. Smaller values are more time-efficient
  // but less memory-efficient. Must be >= 1.
  int32 modulus;
  // Not configurable from the command line; size of the ring buffer used
  // for caching CMVN stats. Must be >= modulus.
  int32 ring_buffer_size;
  // Colon-separated list of dimensions to skip normalization of,
  // e.g. 13:14:15.
  std::string skip_dims;

  OnlineCmvnOptions()
      : cmn_window(600),
        speaker_frames(600),
        global_frames(200),
        normalize_mean(true),
        normalize_variance(false),
        modulus(20),
        ring_buffer_size(20),
        skip_dims("") { }

  // Asserts the ordering constraints between the frame counts and that
  // modulus is positive.
  void Check() const {
    KALDI_ASSERT(speaker_frames <= cmn_window && global_frames <= speaker_frames
                 && modulus > 0);
  }

  // Registers the command-line configurable members with 'po'.
  void Register(ParseOptions *po) {
    po->Register("cmn-window", &cmn_window, "Number of frames of sliding "
                 "context for cepstral mean normalization.");
    po->Register("global-frames", &global_frames, "Number of frames of "
                 "global-average cepstral mean normalization stats to use for "
                 "first utterance of a speaker");
    po->Register("speaker-frames", &speaker_frames, "Number of frames of "
                 "previous utterance(s) from this speaker to use in cepstral "
                 "mean normalization");
    // The config string is called "norm-vars" for compatibility with
    // ../featbin/apply-cmvn.cc
    po->Register("norm-vars", &normalize_variance, "If true, do "
                 "cepstral variance normalization in addition to cepstral mean "
                 "normalization ");
    po->Register("norm-means", &normalize_mean, "If true, do mean normalization "
                 "(note: you cannot normalize the variance but not the mean)");
    po->Register("skip-dims", &skip_dims, "Dimensions to skip normalization of "
                 "(colon-separated list of integers)");
  }
};
/** Struct OnlineCmvnState stores the state of CMVN adaptation between
utterances (but not the state of the computation within an utterance). It
stores the global CMVN stats and the stats of the current speaker (if we
have seen previous utterances for this speaker), and possibly will have a
member "frozen_state": if the user has called the function Freeze() of class
OnlineCmvn, to fix the CMVN so we can estimate fMLLR on top of the fixed
value of cmvn. If nonempty, "frozen_state" will reflect how we were
normalizing the mean and (if applicable) variance at the time when that
function was called.
*/
struct OnlineCmvnState {
  // Total CMVN stats for the current speaker (up till now), stored in the
  // same format as global_cmvn_stats.
  Matrix<double> speaker_cmvn_stats;

  // Global CMVN stats in the usual format, of dimension 2 x (dim+1):
  //   [ sum-stats          count
  //     sum-squared-stats  0     ]
  Matrix<double> global_cmvn_stats;

  // When nonempty, holds CMVN stats for the "frozen" state, i.e. how the
  // data was being normalized when the user called Freeze() on class
  // OnlineCmvn.
  Matrix<double> frozen_state;

  // Leaves all three matrices empty.
  OnlineCmvnState() { }

  // Starts from the given global stats; the other matrices stay empty.
  explicit OnlineCmvnState(const Matrix<double> &global_stats)
      : global_cmvn_stats(global_stats) { }

  // Copy constructor
  OnlineCmvnState(const OnlineCmvnState &other);

  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary);

  // The default assignment operator is used.
};
/**
This class does an online version of the cepstral mean and [optionally]
variance, but note that this is not equivalent to the offline version. This
is necessarily so, as the offline computation involves looking into the
future. If you plan to use features normalized with this type of CMVN then
you need to train in a `matched' way, i.e. with the same type of features.
We normally only do so in the "online" GMM-based decoding, e.g. in
online2bin/online2-wav-gmm-latgen-faster.cc; see also the script
steps/online/prepare_online_decoding.sh and steps/online/decode.sh.
In the steady state (in the middle of a long utterance), this class
accumulates CMVN statistics from the previous "cmn_window" frames (default 600
frames, or 6 seconds), and uses these to normalize the mean and possibly
variance of the current frame.
The config variables "speaker_frames" and "global_frames" relate to what
happens at the beginning of the utterance when we have seen fewer than
"cmn_window" frames of context, and so might not have very good stats to
normalize with. Basically, we first augment any existing stats with up
to "speaker_frames" frames of stats from previous utterances of the current
speaker, and if this doesn't take us up to the required "cmn_window" frame
count, we further augment with up to "global_frames" frames of global
stats. The global stats are CMVN stats accumulated from training or testing
data, that give us a reasonable source of mean and variance for "typical"
data.
*/
class OnlineCmvn: public OnlineFeatureInterface {
 public:
  //
  // Part 1: functions that belong to the online-feature interface. The
  // inline ones simply forward to the source feature.
  //
  virtual int32 Dim() const { return src_->Dim(); }

  virtual bool IsLastFrame(int32 frame) const {
    return src_->IsLastFrame(frame);
  }

  virtual BaseFloat FrameShiftInSeconds() const {
    return src_->FrameShiftInSeconds();
  }

  // Online cmvn introduces no additional latency, so the count is the
  // source's count.
  virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  //
  // Part 2: functions that are not part of the interface.
  //

  /// Initializer that sets the cmvn state. Without previous utterances from
  /// the same speaker, the CMVN state is supposed to be initialized from
  /// some global CMVN stats, obtainable by summing all cmvn stats of the
  /// training data with "sum-matrix"; this just provides a reasonable
  /// starting point at the start of the file. With previous utterances from
  /// the same speaker (or at least a similar environment), it is supposed to
  /// be initialized from GetState() of the previous utterance.
  OnlineCmvn(const OnlineCmvnOptions &opts,
             const OnlineCmvnState &cmvn_state,
             OnlineFeatureInterface *src);

  /// Initializer that does not set the cmvn state; SetState() should be
  /// called afterwards.
  OnlineCmvn(const OnlineCmvnOptions &opts,
             OnlineFeatureInterface *src);

  // Writes any state information from this utterance to "cmvn_state"; its
  // previous value does not matter. The output depends on the
  // OnlineCmvnState this object was initialized with, on the input feature
  // values up to cur_frame, and on whether the user has called Freeze().
  // With cur_frame == -1 the unmodified original state supplied to this
  // object is output.
  void GetState(int32 cur_frame,
                OnlineCmvnState *cmvn_state);

  // Modifies the state of the CMVN computation from outside. Must only be
  // called before any data has been processed (otherwise it will crash).
  // The "state" here is just the information propagated between utterances,
  // not the state of the computation within an utterance.
  void SetState(const OnlineCmvnState &cmvn_state);

  // Freezes the CMN to what it would have been if measured at frame
  // "cur_frame" and stops it from changing further. This applies
  // retroactively within this utterance (GetFrame() on earlier frames will
  // use the CMVN stats from cur_frame), and also in the future if the state
  // obtained from GetState() is then used to initialize the next
  // utterance's CMVN object.
  void Freeze(int32 cur_frame);

  virtual ~OnlineCmvn();

 private:
  /// Smooths the CMVN stats "stats" (stored in the normal format as a
  /// 2 x (dim+1) matrix) by possibly adding some stats from "global_stats"
  /// and/or "speaker_stats", as controlled by the config. The smoothing rule
  /// is best understood by reading the code.
  static void SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
                                    const MatrixBase<double> &global_stats,
                                    const OnlineCmvnOptions &opts,
                                    MatrixBase<double> *stats);

  /// Gets the most recent cached frame of CMVN stats. [If no frames were
  /// cached, sets up empty stats for frame zero and returns that].
  void GetMostRecentCachedFrame(int32 frame,
                                int32 *cached_frame,
                                MatrixBase<double> *stats);

  /// Caches this frame of stats.
  void CacheFrame(int32 frame, const MatrixBase<double> &stats);

  /// Initializes the ring buffer used for caching stats.
  inline void InitRingBufferIfNeeded();

  /// Computes the raw CMVN stats for this frame, using (and updating if
  /// necessary) the cached statistics. "Raw stats" means the (x, x^2, count)
  /// stats for the last up to opts_.cmn_window frames.
  void ComputeStatsForFrame(int32 frame,
                            MatrixBase<double> *stats);

  OnlineCmvnOptions opts_;

  // Dimensions for which CMVN is skipped. Derived from opts_.
  std::vector<int32> skip_dims_;

  // The state before this utterance was seen.
  OnlineCmvnState orig_state_;

  // If the user called Freeze(), the CMVN state that was frozen at.
  Matrix<double> frozen_state_;

  // The variable below holds the raw (count, x, x^2) statistics of the
  // input, computed every opts_.modulus frames. Entry [n / opts_.modulus]
  // contains the (count, x, x^2) statistics for the frames from
  // std::max(0, n - opts_.cmn_window) through n.
  std::vector<Matrix<double>*> cached_stats_modulo_;

  // Ring buffer of cached stats; the int32 of each pair is the frame index.
  std::vector<std::pair<int32, Matrix<double> > > cached_stats_ring_;

  // Temporaries used inside member functions, kept here to avoid
  // reallocation.
  Matrix<double> temp_stats_;
  Vector<BaseFloat> temp_feats_;
  Vector<double> temp_feats_dbl_;

  OnlineFeatureInterface *src_;  // Not owned here
};
// Options for frame splicing: how many frames of context to take on each
// side of the central frame.
struct OnlineSpliceOptions {
  int32 left_context;
  int32 right_context;

  // Both contexts default to 4 frames.
  OnlineSpliceOptions()
      : left_context(4),
        right_context(4) { }

  // Registers both members with 'po'.
  void Register(ParseOptions *po) {
    po->Register("left-context", &left_context, "Left-context for frame "
                 "splicing prior to LDA");
    po->Register("right-context", &right_context, "Right-context for frame "
                 "splicing prior to LDA");
  }
};
class OnlineSpliceFrames: public OnlineFeatureInterface {
 public:
  //
  // Part 1: functions that belong to the online-feature interface.
  //

  // One copy of the source dimension for the central frame and for every
  // frame of left and right context.
  virtual int32 Dim() const {
    const int32 frames_per_splice = 1 + left_context_ + right_context_;
    return src_->Dim() * frames_per_splice;
  }

  virtual bool IsLastFrame(int32 frame) const {
    return src_->IsLastFrame(frame);
  }

  virtual BaseFloat FrameShiftInSeconds() const {
    return src_->FrameShiftInSeconds();
  }

  virtual int32 NumFramesReady() const;

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  //
  // Part 2: functions that are not part of the interface.
  //

  // Copies the context sizes out of 'opts' and remembers the source.
  OnlineSpliceFrames(const OnlineSpliceOptions &opts,
                     OnlineFeatureInterface *src)
      : left_context_(opts.left_context),
        right_context_(opts.right_context),
        src_(src) { }

 private:
  int32 left_context_;
  int32 right_context_;
  OnlineFeatureInterface *src_;  // Not owned here
};
/// This online-feature class implements any affine or linear transform.
class OnlineTransform: public OnlineFeatureInterface {
 public:
  //
  // Part 1: functions that belong to the online-feature interface.
  //

  // The output dimension equals the length of the offset vector.
  virtual int32 Dim() const { return offset_.Dim(); }

  virtual bool IsLastFrame(int32 frame) const {
    return src_->IsLastFrame(frame);
  }

  virtual BaseFloat FrameShiftInSeconds() const {
    return src_->FrameShiftInSeconds();
  }

  virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  virtual void GetFrames(const std::vector<int32> &frames,
                         MatrixBase<BaseFloat> *feats);

  //
  // Part 2: functions that are not part of the interface.
  //

  /// 'transform' is either a linear transform, or an affine transform whose
  /// last column is the offset.
  OnlineTransform(const MatrixBase<BaseFloat> &transform,
                  OnlineFeatureInterface *src);

 private:
  OnlineFeatureInterface *src_;  // Not owned here
  Matrix<BaseFloat> linear_term_;
  Vector<BaseFloat> offset_;
};
// Online feature that appends delta features, computed by a DeltaFeatures
// object, to the features of its source.
class OnlineDeltaFeature: public OnlineFeatureInterface {
 public:
  //
  // Part 1: functions that belong to the online-feature interface.
  //
  virtual int32 Dim() const;

  virtual bool IsLastFrame(int32 frame) const {
    return src_->IsLastFrame(frame);
  }

  virtual BaseFloat FrameShiftInSeconds() const {
    return src_->FrameShiftInSeconds();
  }

  virtual int32 NumFramesReady() const;

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  //
  // Part 2: functions that are not part of the interface.
  //
  OnlineDeltaFeature(const DeltaFeaturesOptions &opts,
                     OnlineFeatureInterface *src);

 private:
  OnlineFeatureInterface *src_;  // Not owned here
  DeltaFeaturesOptions opts_;
  // This class contains just a few coefficients.
  DeltaFeatures delta_features_;
};
/// This feature type can be used to cache its input, to avoid
/// repetition of computation in a multi-pass decoding context.
class OnlineCacheFeature: public OnlineFeatureInterface {
 public:
  // The interface functions below forward to the source feature, except for
  // GetFrame()/GetFrames(), which go through the cache.
  virtual int32 Dim() const { return src_->Dim(); }

  virtual bool IsLastFrame(int32 frame) const {
    return src_->IsLastFrame(frame);
  }

  virtual BaseFloat FrameShiftInSeconds() const {
    return src_->FrameShiftInSeconds();
  }

  virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  virtual void GetFrames(const std::vector<int32> &frames,
                         MatrixBase<BaseFloat> *feats);

  // Releases everything held in the cache.
  virtual ~OnlineCacheFeature() { ClearCache(); }

  // Things that are not in the shared interface:

  // This should be called if you change the underlying features in some way.
  void ClearCache();

  explicit OnlineCacheFeature(OnlineFeatureInterface *src): src_(src) { }

  // cache_ holds raw pointers that the destructor releases through
  // ClearCache(); a member-wise copy would make two objects free the same
  // vectors, so copying is disabled.
  OnlineCacheFeature(const OnlineCacheFeature &) = delete;
  OnlineCacheFeature &operator=(const OnlineCacheFeature &) = delete;

 private:
  OnlineFeatureInterface *src_;  // Not owned here
  // Indexed by frame; a NULL entry means that frame is not cached.
  std::vector<Vector<BaseFloat>* > cache_;
};
/// This online-feature class implements combination of two feature
/// streams (such as pitch, plp) into one stream.
class OnlineAppendFeature: public OnlineFeatureInterface {
 public:
  // The output is as wide as both sources together.
  virtual int32 Dim() const { return src1_->Dim() + src2_->Dim(); }

  // A frame is the last one as soon as either source says so.
  virtual bool IsLastFrame(int32 frame) const {
    return (src1_->IsLastFrame(frame) || src2_->IsLastFrame(frame));
  }

  // Only the first source is consulted; hopefully both sources have the
  // same rate.
  virtual BaseFloat FrameShiftInSeconds() const {
    return src1_->FrameShiftInSeconds();
  }

  // A frame is ready only when both sources have it.
  virtual int32 NumFramesReady() const {
    return std::min(src1_->NumFramesReady(), src2_->NumFramesReady());
  }

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  virtual ~OnlineAppendFeature() { }

  OnlineAppendFeature(OnlineFeatureInterface *src1,
                      OnlineFeatureInterface *src2)
      : src1_(src1),
        src2_(src2) { }

 private:
  OnlineFeatureInterface *src1_;
  OnlineFeatureInterface *src2_;
};
/// @} End of "addtogroup onlinefeat"
} // namespace kaldi
#endif // KALDI_FEAT_ONLINE_FEATURE_H_

File diff suppressed because it is too large Load Diff

@ -1,450 +0,0 @@
// feat/pitch-functions.h
// Copyright 2013 Pegah Ghahremani
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2014 Yanqing Sun, Junjie Wang,
// Daniel Povey, Korbinian Riedhammer
// Xin Lei
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_PITCH_FUNCTIONS_H_
#define KALDI_FEAT_PITCH_FUNCTIONS_H_
#include <cassert>
#include <cstdlib>
#include <string>
#include <vector>
#include "base/kaldi-error.h"
#include "feat/mel-computations.h"
#include "feat/online-feature-itf.h"
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
struct PitchExtractionOptions {
// FrameExtractionOptions frame_opts;
BaseFloat samp_freq; // sample frequency in hertz
BaseFloat frame_shift_ms; // in milliseconds.
BaseFloat frame_length_ms; // in milliseconds.
BaseFloat preemph_coeff; // Preemphasis coefficient. [use is deprecated.]
BaseFloat min_f0; // min f0 to search (Hz)
BaseFloat max_f0; // max f0 to search (Hz)
BaseFloat soft_min_f0; // Minimum f0, applied in soft way, must not
// exceed min-f0
BaseFloat penalty_factor; // cost factor for F0 change
BaseFloat lowpass_cutoff; // cutoff frequency for Low pass filter
BaseFloat resample_freq; // Frequency that we down-sample the signal to;
// must be more than twice lowpass_cutoff
BaseFloat delta_pitch; // the pitch tolerance in pruning lags
BaseFloat nccf_ballast; // Increasing this factor reduces NCCF for
// quiet frames, helping ensure pitch
// continuity in unvoiced region
int32 lowpass_filter_width; // Integer that determines filter width of
// lowpass filter
int32 upsample_filter_width; // Integer that determines filter width when
// upsampling NCCF
// Below are newer config variables, not present in the original paper,
// that relate to the online pitch extraction algorithm.
// The maximum number of frames of latency that we allow the pitch-processing
// to introduce, for online operation. If you set this to a large value,
// there would be no inaccuracy from the Viterbi traceback (but it might make
// you wait to see the pitch). This is not very relevant for the online
// operation: normalization-right-context is more relevant, you
// can just leave this value at zero.
int32 max_frames_latency;
// Only relevant for the function ComputeKaldiPitch which is called by
// compute-kaldi-pitch-feats. If nonzero, we provide the input as chunks of
// this size. This affects the energy normalization which has a small effect
// on the resulting features, especially at the beginning of a file. For best
// compatibility with online operation (e.g. if you plan to train models for
// the online-decoding setup), you might want to set this to a small value,
// like one frame.
int32 frames_per_chunk;
// Only relevant for the function ComputeKaldiPitch which is called by
// compute-kaldi-pitch-feats, and only relevant if frames_per_chunk is
// nonzero. If true, it will query the features as soon as they are
// available, which simulates the first-pass features you would get in online
// decoding. If false, the features you will get will be the same as those
// available at the end of the utterance, after InputFinished() has been
// called: e.g. during lattice rescoring.
bool simulate_first_pass_online;
// Only relevant for online operation or when emulating online operation
// (e.g. when setting frames_per_chunk). This is the frame-index on which we
// recompute the NCCF (e.g. frame-index 500 = after 5 seconds); if the
// segment ends before this we do it when the segment ends. We do this by
// re-computing the signal average energy, which affects the NCCF via the
// "ballast term", scaling the resampled NCCF by a factor derived from the
// average change in the "ballast term", and re-doing the backtrace
// computation. Making this infinity would be the most exact, but would
// introduce unwanted latency at the end of long utterances, for little
// benefit.
int32 recompute_frame;
// This is a "hidden config" used only for testing the online pitch
// extraction. If true, we compute the signal root-mean-squared for the
// ballast term, only up to the current frame, rather than the end of the
// current chunk of signal. This makes the output insensitive to the
// chunking, which is useful for testing purposes.
bool nccf_ballast_online;
bool snip_edges;
PitchExtractionOptions():
samp_freq(16000),
frame_shift_ms(10.0),
frame_length_ms(25.0),
preemph_coeff(0.0),
min_f0(50),
max_f0(400),
soft_min_f0(10.0),
penalty_factor(0.1),
lowpass_cutoff(1000),
resample_freq(4000),
delta_pitch(0.005),
nccf_ballast(7000),
lowpass_filter_width(1),
upsample_filter_width(5),
max_frames_latency(0),
frames_per_chunk(0),
simulate_first_pass_online(false),
recompute_frame(500),
nccf_ballast_online(false),
snip_edges(true) { }
// Registers every pitch-extraction option with "opts", binding each option
// name to the corresponding member of this struct together with its help
// text.
//
// Help-text fixes relative to the previous version:
//  * "penalty-factor": "FO" (letter O) corrected to "F0" (digit zero).
//  * "recompute-frame": added the missing space between "Relevant if" and
//    "--frames-per-chunk" (the adjacent literals used to concatenate into
//    "Relevant if--frames-per-chunk > 0").
//  * "max-frames-latency": closed the parenthesis opened at "(affects".
void Register(OptionsItf *opts) {
  opts->Register("sample-frequency", &samp_freq,
                 "Waveform data sample frequency (must match the waveform "
                 "file, if specified there)");
  opts->Register("frame-length", &frame_length_ms, "Frame length in "
                 "milliseconds");
  opts->Register("frame-shift", &frame_shift_ms, "Frame shift in "
                 "milliseconds");
  opts->Register("preemphasis-coefficient", &preemph_coeff,
                 "Coefficient for use in signal preemphasis (deprecated)");
  opts->Register("min-f0", &min_f0,
                 "min. F0 to search for (Hz)");
  opts->Register("max-f0", &max_f0,
                 "max. F0 to search for (Hz)");
  opts->Register("soft-min-f0", &soft_min_f0,
                 "Minimum f0, applied in soft way, must not exceed min-f0");
  opts->Register("penalty-factor", &penalty_factor,
                 "cost factor for F0 change.");
  opts->Register("lowpass-cutoff", &lowpass_cutoff,
                 "cutoff frequency for LowPass filter (Hz) ");
  opts->Register("resample-frequency", &resample_freq,
                 "Frequency that we down-sample the signal to.  Must be "
                 "more than twice lowpass-cutoff");
  opts->Register("delta-pitch", &delta_pitch,
                 "Smallest relative change in pitch that our algorithm "
                 "measures");
  opts->Register("nccf-ballast", &nccf_ballast,
                 "Increasing this factor reduces NCCF for quiet frames");
  opts->Register("nccf-ballast-online", &nccf_ballast_online,
                 "This is useful mainly for debug; it affects how the NCCF "
                 "ballast is computed.");
  opts->Register("lowpass-filter-width", &lowpass_filter_width,
                 "Integer that determines filter width of "
                 "lowpass filter, more gives sharper filter");
  opts->Register("upsample-filter-width", &upsample_filter_width,
                 "Integer that determines filter width when upsampling NCCF");
  opts->Register("frames-per-chunk", &frames_per_chunk, "Only relevant for "
                 "offline pitch extraction (e.g. compute-kaldi-pitch-feats), "
                 "you can set it to a small nonzero value, such as 10, for "
                 "better feature compatibility with online decoding (affects "
                 "energy normalization in the algorithm)");
  opts->Register("simulate-first-pass-online", &simulate_first_pass_online,
                 "If true, compute-kaldi-pitch-feats will output features "
                 "that correspond to what an online decoder would see in the "
                 "first pass of decoding-- not the final version of the "
                 "features, which is the default.  Relevant if "
                 "--frames-per-chunk > 0");
  opts->Register("recompute-frame", &recompute_frame, "Only relevant for "
                 "online pitch extraction, or for compatibility with online "
                 "pitch extraction.  A non-critical parameter; the frame at "
                 "which we recompute some of the forward pointers, after "
                 "revising our estimate of the signal energy.  Relevant if "
                 "--frames-per-chunk > 0");
  opts->Register("max-frames-latency", &max_frames_latency, "Maximum number "
                 "of frames of latency that we allow pitch tracking to "
                 "introduce into the feature processing (affects output only "
                 "if --frames-per-chunk > 0 and "
                 "--simulate-first-pass-online=true)");
  opts->Register("snip-edges", &snip_edges, "If this is set to false, the "
                 "incomplete frames near the ending edge won't be snipped, "
                 "so that the number of frames is the file size divided by "
                 "the frame-shift. This makes different types of features "
                 "give the same number of frames.");
}
/// Returns the window-size in samples, after resampling. This is the
/// "basic window size", not the full window size after extending by max-lag.
// Because of floating point representation, it is more reliable to divide
// by 1000 instead of multiplying by 0.001, but it is a bit slower.
int32 NccfWindowSize() const {
  // Dividing by the double literal 1000.0 converts milliseconds to seconds
  // and makes the quotient a double; the cast then truncates toward zero.
  const double window_in_samples = resample_freq * frame_length_ms / 1000.0;
  return static_cast<int32>(window_in_samples);
}
/// Returns the window-shift in samples, after resampling.
int32 NccfWindowShift() const {
  // Same computation as NccfWindowSize(), using the frame shift instead of
  // the frame length; the cast truncates toward zero.
  const double shift_in_samples = resample_freq * frame_shift_ms / 1000.0;
  return static_cast<int32>(shift_in_samples);
}
};
// Options controlling how raw pitch-extractor output is turned into the
// final pitch features (scaling, normalization window, delta computation and
// which feature columns are emitted).
struct ProcessPitchOptions {
  // Scale applied to the final normalized-log-pitch feature.
  BaseFloat pitch_scale;
  // Scale applied to the final POV feature.
  BaseFloat pov_scale;
  // Offset that can be added to the final POV feature (useful for
  // online decoding, where we don't do CMN to the pitch-derived features).
  BaseFloat pov_offset;
  // Scale applied to the final delta log-pitch feature.
  BaseFloat delta_pitch_scale;
  // Standard deviation of the noise we add to delta-pitch.
  BaseFloat delta_pitch_noise_stddev;
  // Left context (in frames) used for sliding-window normalization.
  int32 normalization_left_context;
  // Right context (in frames); this should be reduced in online decoding
  // to reduce latency.
  int32 normalization_right_context;
  // Number of frames on each side of the central frame used for the delta.
  int32 delta_window;
  // Number of frames by which the pitch information is delayed.
  int32 delay;
  // Switches selecting which feature columns are produced.
  bool add_pov_feature;
  bool add_normalized_log_pitch;
  bool add_delta_pitch;
  bool add_raw_log_pitch;

  // Sets every option to its default value.
  ProcessPitchOptions() {
    pitch_scale = 2.0;
    pov_scale = 2.0;
    pov_offset = 0.0;
    delta_pitch_scale = 10.0;
    delta_pitch_noise_stddev = 0.005;
    normalization_left_context = 75;
    normalization_right_context = 75;
    delta_window = 2;
    delay = 0;
    add_pov_feature = true;
    add_normalized_log_pitch = true;
    add_delta_pitch = true;
    add_raw_log_pitch = false;
  }

  // Registers each option with "opts" under its command-line name.
  // NOTE(review): PitchExtractionOptions::Register in this file takes an
  // OptionsItf*, whereas this one takes a ParseOptions*; confirm whether the
  // difference is intentional.
  void Register(ParseOptions *opts) {
    // Scaling and offset of the output features.
    opts->Register("pitch-scale", &pitch_scale,
                   "Scaling factor for the final normalized log-pitch value");
    opts->Register("pov-scale", &pov_scale,
                   "Scaling factor for final POV (probability of voicing) "
                   "feature");
    opts->Register("pov-offset", &pov_offset,
                   "This can be used to add an offset to the POV feature. "
                   "Intended for use in online decoding as a substitute for "
                   " CMN.");
    opts->Register("delta-pitch-scale", &delta_pitch_scale,
                   "Term to scale the final delta log-pitch feature");
    opts->Register("delta-pitch-noise-stddev", &delta_pitch_noise_stddev,
                   "Standard deviation for noise we add to the delta log-pitch "
                   "(before scaling); should be about the same as delta-pitch "
                   "option to pitch creation.  The purpose is to get rid of "
                   "peaks in the delta-pitch caused by discretization of pitch "
                   "values.");

    // Windowing and timing.
    opts->Register("normalization-left-context", &normalization_left_context,
                   "Left-context (in frames) for moving window normalization");
    opts->Register("normalization-right-context", &normalization_right_context,
                   "Right-context (in frames) for moving window normalization");
    opts->Register("delta-window", &delta_window,
                   "Number of frames on each side of central frame, to use for "
                   "delta window.");
    opts->Register("delay", &delay,
                   "Number of frames by which the pitch information is "
                   "delayed.");

    // Selection of output columns.
    opts->Register("add-pov-feature", &add_pov_feature,
                   "If true, the warped NCCF is added to output features");
    opts->Register("add-normalized-log-pitch", &add_normalized_log_pitch,
                   "If true, the log-pitch with POV-weighted mean subtraction "
                   "over 1.5 second window is added to output features");
    opts->Register("add-delta-pitch", &add_delta_pitch,
                   "If true, time derivative of log-pitch is added to output "
                   "features");
    opts->Register("add-raw-log-pitch", &add_raw_log_pitch,
                   "If true, log(pitch) is added to output features");
  }
};
// We don't want to expose the pitch-extraction internals here as it's
// quite complex, so we use a private implementation.
class OnlinePitchFeatureImpl;
// Note: to start on a new waveform, just construct a new version
// of this object.
// Online pitch extractor exposing the OnlineBaseFeature interface. All of
// the actual work is delegated to the private implementation class
// OnlinePitchFeatureImpl, reached through impl_.
class OnlinePitchFeature: public OnlineBaseFeature {
public:
// Constructs the extractor from the given pitch-extraction options.
explicit OnlinePitchFeature(const PitchExtractionOptions &opts);
// The raw feature always has two dimensions.
virtual int32 Dim() const { return 2; /* (NCCF, pitch) */ }
virtual int32 NumFramesReady() const;
virtual BaseFloat FrameShiftInSeconds() const;
virtual bool IsLastFrame(int32 frame) const;
/// Outputs the two-dimensional feature consisting of (NCCF, pitch). You
/// should probably post-process this using class OnlineProcessPitch.
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
// Accepts the next piece of the waveform.
virtual void AcceptWaveform(BaseFloat sampling_rate,
const VectorBase<BaseFloat> &waveform);
// Signals that no more waveform data will be supplied.
virtual void InputFinished();
virtual ~OnlinePitchFeature();
private:
// Pointer to the private implementation.
// NOTE(review): this is a raw pointer and no copy constructor or assignment
// operator is declared here; presumably the destructor deletes it, in which
// case copying an OnlinePitchFeature would be unsafe. Confirm against the
// .cc file.
OnlinePitchFeatureImpl *impl_;
};
/// This online-feature class implements post processing of pitch features.
/// Inputs are original 2 dims (nccf, pitch). It can produce various
/// kinds of outputs, using the default options it will be (pov-feature,
/// normalized-log-pitch, delta-log-pitch).
class OnlineProcessPitch: public OnlineFeatureInterface {
 public:
  /// Dimension of the processed output feature (fixed at construction).
  virtual int32 Dim() const { return dim_; }

  /// Maps the question "is this the last frame?" onto the source feature,
  /// taking the configured delay (opts_.delay) into account.
  virtual bool IsLastFrame(int32 frame) const {
    // Negative frames are forwarded as a query about frame -1.
    if (frame <= -1)
      return src_->IsLastFrame(-1);
    // At or beyond the delay: shift the index back by the delay.
    if (frame >= opts_.delay)
      return src_->IsLastFrame(frame - opts_.delay);
    // Frames inside the delay region: never last if the source reports
    // frame -1 as last, otherwise defer to the source's frame 0.
    return !src_->IsLastFrame(-1) && src_->IsLastFrame(0);
  }

  /// Frame shift is inherited unchanged from the source feature.
  virtual BaseFloat FrameShiftInSeconds() const {
    return src_->FrameShiftInSeconds();
  }

  virtual int32 NumFramesReady() const;

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  virtual ~OnlineProcessPitch() {  }

  /// Does not take ownership of "src".
  OnlineProcessPitch(const ProcessPitchOptions &opts,
                     OnlineFeatureInterface *src);

 private:
  // Anonymous enum used to define a compile-time constant: the dimension of
  // the input feature, (nccf, pitch).
  enum { kRawFeatureDim = 2 };

  ProcessPitchOptions opts_;
  OnlineFeatureInterface *src_;
  int32 dim_;  // Output feature dimension, set in initializer.

  // Per-frame statistics used for the normalized log-pitch.
  struct NormalizationStats {
    // Value of src_->NumFramesReady() at the time "mean_pitch" was set.
    int32 cur_num_frames;
    // True if the input data was finished when "mean_pitch" was computed.
    bool input_finished;
    // Sum of pov over the relevant range.
    double sum_pov;
    // Sum of log(pitch) * pov over the relevant range.
    double sum_log_pitch_pov;

    NormalizationStats()
        : cur_num_frames(-1),
          input_finished(false),
          sum_pov(0.0),
          sum_log_pitch_pov(0.0) {  }
  };

  std::vector<BaseFloat> delta_feature_noise_;

  std::vector<NormalizationStats> normalization_stats_;

  /// POV feature for the given frame; called from GetFrame().
  inline BaseFloat GetPovFeature(int32 frame) const;

  /// Delta-log-pitch feature for the given frame; called from GetFrame().
  inline BaseFloat GetDeltaPitchFeature(int32 frame);

  /// Raw log-pitch feature for the given frame; called from GetFrame().
  inline BaseFloat GetRawLogPitchFeature(int32 frame) const;

  /// Mean-subtracted log-pitch feature for the given frame; called from
  /// GetFrame().
  inline BaseFloat GetNormalizedLogPitchFeature(int32 frame);

  /// Computes the normalization window sizes.
  inline void GetNormalizationWindow(int32 frame,
                                     int32 src_frames_ready,
                                     int32 *window_begin,
                                     int32 *window_end) const;

  /// Brings the normalization_stats_ entry for this frame up to date;
  /// called from GetNormalizedLogPitchFeature.
  inline void UpdateNormalizationStats(int32 frame);
};
/// This function extracts (NCCF, pitch) per frame, using the pitch extraction
/// method described in "A Pitch Extraction Algorithm Tuned for Automatic Speech
/// Recognition", Pegah Ghahremani, Bagher BabaAli, Daniel Povey, Korbinian
/// Riedhammer, Jan Trmal and Sanjeev Khudanpur, ICASSP 2014. The output will
/// have as many rows as there are frames, and two columns corresponding to
/// (NCCF, pitch), in that order.
void ComputeKaldiPitch(const PitchExtractionOptions &opts,
const VectorBase<BaseFloat> &wave,
Matrix<BaseFloat> *output);
/// This function processes the raw (NCCF, pitch) quantities computed by
/// ComputeKaldiPitch, and processes them into features. By default it will
/// output three-dimensional features, (POV-feature, mean-subtracted-log-pitch,
/// delta-log-pitch), but this is configurable in the options. The number of
/// rows of "output" will be the number of frames (rows) in "input", and the
/// number of columns will be the number of different types of features
/// requested (by default, 3; 4 is the max). The four config variables
/// --add-pov-feature, --add-normalized-log-pitch, --add-delta-pitch,
/// --add-raw-log-pitch determine which features we create; by default we create
/// the first three.
///
/// "input" is expected to have the two columns produced by ComputeKaldiPitch.
void ProcessPitch(const ProcessPitchOptions &opts,
const MatrixBase<BaseFloat> &input,
Matrix<BaseFloat> *output);
/// This function combines ComputeKaldiPitch and ProcessPitch. The reason
/// why we need a separate function to do this is in order to be able to
/// accurately simulate the online pitch-processing, for testing and for
/// training models matched to the "first-pass" features. It is sensitive to
/// the variables in pitch_opts that relate to online processing,
/// i.e. max_frames_latency, frames_per_chunk, simulate_first_pass_online,
/// recompute_frame.
///
/// "wave" is the input waveform and "output" receives the processed features
/// (presumably laid out as for ProcessPitch, one row per frame; confirm
/// against the implementation).
void ComputeAndProcessKaldiPitch(const PitchExtractionOptions &pitch_opts,
const ProcessPitchOptions &process_opts,
const VectorBase<BaseFloat> &wave,
Matrix<BaseFloat> *output);
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_PITCH_FUNCTIONS_H_

@ -1,377 +0,0 @@
// feat/resample.cc
// Copyright 2013 Pegah Ghahremani
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2014 Yanqing Sun, Junjie Wang
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <limits>
#include "feat/feature-functions.h"
#include "matrix/matrix-functions.h"
#include "feat/resample.h"
namespace kaldi {
// Sets up a resampler from samp_rate_in_hz to samp_rate_out_hz using a
// windowed low-pass filter with the given cutoff and number of zeros.
// Precomputes the per-unit index/weight tables and clears streaming state.
LinearResample::LinearResample(int32 samp_rate_in_hz,
                               int32 samp_rate_out_hz,
                               BaseFloat filter_cutoff_hz,
                               int32 num_zeros)
    : samp_rate_in_(samp_rate_in_hz),
      samp_rate_out_(samp_rate_out_hz),
      filter_cutoff_(filter_cutoff_hz),
      num_zeros_(num_zeros) {
  // All rates must be positive and the cutoff may not exceed half of either
  // sampling rate.
  KALDI_ASSERT(samp_rate_in_hz > 0.0 &&
               samp_rate_out_hz > 0.0 &&
               filter_cutoff_hz > 0.0 &&
               filter_cutoff_hz*2 <= samp_rate_in_hz &&
               filter_cutoff_hz*2 <= samp_rate_out_hz &&
               num_zeros > 0);

  // The greatest common divisor of the two rates is the frequency of the
  // repeating unit; dividing each rate by it gives the number of input and
  // output samples contained in one unit.
  const int32 unit_freq = Gcd(samp_rate_in_, samp_rate_out_);
  input_samples_in_unit_ = samp_rate_in_ / unit_freq;
  output_samples_in_unit_ = samp_rate_out_ / unit_freq;

  SetIndexesAndWeights();
  Reset();
}
// Returns how many output samples can be produced from input_num_samp input
// samples. When "flush" is false, output that would need input beyond the
// filter's half-width from the end is held back.
int64 LinearResample::GetNumOutputSamples(int64 input_num_samp,
                                          bool flush) const {
  // Time is measured exactly in "ticks" of 1.0 / tick_freq seconds, where
  // tick_freq is the least common multiple of the two sampling rates, so
  // both sample periods are a whole number of ticks.
  const int32 tick_freq = Lcm(samp_rate_in_, samp_rate_out_);
  const int32 input_period_ticks = tick_freq / samp_rate_in_;
  const int32 output_period_ticks = tick_freq / samp_rate_out_;

  // Length, in ticks, of the half-open time interval
  // [ 0, input_num_samp/samp_rate_in_ ) that output samples may fall in.
  int64 usable_ticks = input_num_samp * input_period_ticks;

  if (!flush) {
    // Unless flushing, shorten the interval by the window width (the
    // distance from the center to the edge of the windowing function).
    // Taking the floor is safe: the interval is open on the right, so a
    // reduction of less than one tick can never change the answer.
    BaseFloat window_width = num_zeros_ / (2.0 * filter_cutoff_);
    int32 window_width_ticks = floor(window_width * tick_freq);
    usable_ticks -= window_width_ticks;
  }

  if (usable_ticks <= 0)
    return 0;

  // Output sample k sits at tick k * output_period_ticks, so we need the
  // count of k >= 0 with k * output_period_ticks < usable_ticks. For a
  // positive interval that is the ceiling of the quotient, written here
  // with integer arithmetic only.
  return (usable_ticks - 1) / output_period_ticks + 1;
}
void LinearResample::SetIndexesAndWeights() {
first_index_.resize(output_samples_in_unit_);
weights_.resize(output_samples_in_unit_);
double window_width = num_zeros_ / (2.0 * filter_cutoff_);
for (int32 i = 0; i < output_samples_in_unit_; i++) {
double output_t = i / static_cast<double>(samp_rate_out_);
double min_t = output_t - window_width, max_t = output_t + window_width;
// we do ceil on the min and floor on the max, because if we did it
// the other way around we would unnecessarily include indexes just
// outside the window, with zero coefficients. It's possible
// if the arguments to the ceil and floor expressions are integers
// (e.g. if filter_cutoff_ has an exact ratio with the sample rates),
// that we unnecessarily include something with a zero coefficient,
// but this is only a slight efficiency issue.
int32 min_input_index = ceil(min_t * samp_rate_in_),
max_input_index = floor(max_t * samp_rate_in_),
num_indices = max_input_index - min_input_index + 1;
first_index_[i] = min_input_index;
weights_[i].Resize(num_indices);
for (int32 j = 0; j < num_indices; j++) {
int32 input_index = min_input_index + j;
double input_t = input_index / static_cast<double>(samp_rate_in_),
delta_t = input_t - output_t;
// sign of delta_t doesn't matter.
weights_[i](j) = FilterFunc(delta_t) / samp_rate_in_;
}
}
}
// inline
// For the absolute output-sample index samp_out, computes its position
// within the repeating unit (*samp_out_wrapped) and the absolute index of
// the first input sample that contributes to it (*first_samp_in).
void LinearResample::GetIndexes(int64 samp_out,
                                int64 *first_samp_in,
                                int32 *samp_out_wrapped) const {
  // A unit is the smallest nonzero amount of time that is an exact multiple
  // of both the input and the output sample period.
  const int64 unit_index = samp_out / output_samples_in_unit_;
  // Position inside the unit: the remainder of the same division.
  const int32 wrapped =
      static_cast<int32>(samp_out % output_samples_in_unit_);

  *samp_out_wrapped = wrapped;
  // Table entry for this position, shifted by the whole units skipped.
  *first_samp_in = first_index_[wrapped] + unit_index * input_samples_in_unit_;
}
void LinearResample::Resample(const VectorBase<BaseFloat> &input,
bool flush,
Vector<BaseFloat> *output) {
int32 input_dim = input.Dim();
int64 tot_input_samp = input_sample_offset_ + input_dim,
tot_output_samp = GetNumOutputSamples(tot_input_samp, flush);
KALDI_ASSERT(tot_output_samp >= output_sample_offset_);
output->Resize(tot_output_samp - output_sample_offset_);
// samp_out is the index into the total output signal, not just the part
// of it we are producing here.
for (int64 samp_out = output_sample_offset_;
samp_out < tot_output_samp;
samp_out++) {
int64 first_samp_in;
int32 samp_out_wrapped;
GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped);
const Vector<BaseFloat> &weights = weights_[samp_out_wrapped];
// first_input_index is the first index into "input" that we have a weight
// for.
int32 first_input_index = static_cast<int32>(first_samp_in -
input_sample_offset_);
BaseFloat this_output;
if (first_input_index >= 0 &&
first_input_index + weights.Dim() <= input_dim) {
SubVector<BaseFloat> input_part(input, first_input_index, weights.Dim());
this_output = VecVec(input_part, weights);
} else { // Handle edge cases.
this_output = 0.0;
for (int32 i = 0; i < weights.Dim(); i++) {
BaseFloat weight = weights(i);
int32 input_index = first_input_index + i;
if (input_index < 0 && input_remainder_.Dim() + input_index >= 0) {
this_output += weight *
input_remainder_(input_remainder_.Dim() + input_index);
} else if (input_index >= 0 && input_index < input_dim) {
this_output += weight * input(input_index);
} else if (input_index >= input_dim) {
// We're past the end of the input and are adding zero; should only
// happen if the user specified flush == true, or else we would not
// be trying to output this sample.
KALDI_ASSERT(flush);
}
}
}
int32 output_index = static_cast<int32>(samp_out - output_sample_offset_);
(*output)(output_index) = this_output;
}
if (flush) {
Reset(); // Reset the internal state.
} else {
SetRemainder(input);
input_sample_offset_ = tot_input_samp;
output_sample_offset_ = tot_output_samp;
}
}
void LinearResample::SetRemainder(const VectorBase<BaseFloat> &input) {
Vector<BaseFloat> old_remainder(input_remainder_);
// max_remainder_needed is the width of the filter from side to side,
// measured in input samples. you might think it should be half that,
// but you have to consider that you might be wanting to output samples
// that are "in the past" relative to the beginning of the latest
// input... anyway, storing more remainder than needed is not harmful.
int32 max_remainder_needed = ceil(samp_rate_in_ * num_zeros_ /
filter_cutoff_);
input_remainder_.Resize(max_remainder_needed);
for (int32 index = - input_remainder_.Dim(); index < 0; index++) {
// we interpret "index" as an offset from the end of "input" and
// from the end of input_remainder_.
int32 input_index = index + input.Dim();
if (input_index >= 0)
input_remainder_(index + input_remainder_.Dim()) = input(input_index);
else if (input_index + old_remainder.Dim() >= 0)
input_remainder_(index + input_remainder_.Dim()) =
old_remainder(input_index + old_remainder.Dim());
// else leave it at zero.
}
}
// Discards all streaming state: the stored input remainder and both sample
// offsets.
void LinearResample::Reset() {
  input_remainder_.Resize(0);
  output_sample_offset_ = 0;
  input_sample_offset_ = 0;
}
/** Here, t is a time in seconds representing an offset from
the center of the windowed filter function, and FilterFunction(t)
returns the windowed filter function, described
in the header as h(t) = f(t)g(t), evaluated at t.
*/
// Evaluates the windowed filter at time offset t: the product of a
// raised-cosine (Hanning) window and a sinc low-pass filter.
BaseFloat LinearResample::FilterFunc(BaseFloat t) const {
  // The window is nonzero only strictly inside its support,
  // |t| < num_zeros_ / (2 * filter_cutoff_).
  const bool inside_window = fabs(t) < num_zeros_ / (2.0 * filter_cutoff_);
  const BaseFloat window = inside_window
      ? 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t))
      : 0.0;

  // Sinc filter; at t == 0 use the limit of the function instead of 0/0.
  const BaseFloat filter = (t != 0)
      ? sin(M_2PI * filter_cutoff_ * t) / (M_PI * t)
      : 2 * filter_cutoff_;

  return filter * window;
}
// Prepares resampling of a num_samples_in-sample signal, sampled at
// samp_rate_in, at the times listed in sample_points.
ArbitraryResample::ArbitraryResample(
    int32 num_samples_in, BaseFloat samp_rate_in,
    BaseFloat filter_cutoff, const Vector<BaseFloat> &sample_points,
    int32 num_zeros)
    : num_samples_in_(num_samples_in),
      samp_rate_in_(samp_rate_in),
      filter_cutoff_(filter_cutoff),
      num_zeros_(num_zeros) {
  KALDI_ASSERT(num_samples_in > 0 && samp_rate_in > 0.0 &&
               filter_cutoff > 0.0 &&
               filter_cutoff * 2.0 <= samp_rate_in
               && num_zeros > 0);
  // SetIndexes() sizes first_index_ and every weights_[i]; SetWeights()
  // then fills in the weight values, so it has to run second.
  SetIndexes(sample_points);
  SetWeights(sample_points);
}
void ArbitraryResample::Resample(const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output) const {
// each row of "input" corresponds to the data to resample;
// the corresponding row of "output" is the resampled data.
KALDI_ASSERT(input.NumRows() == output->NumRows() &&
input.NumCols() == num_samples_in_ &&
output->NumCols() == weights_.size());
Vector<BaseFloat> output_col(output->NumRows());
for (int32 i = 0; i < NumSamplesOut(); i++) {
SubMatrix<BaseFloat> input_part(input, 0, input.NumRows(),
first_index_[i],
weights_[i].Dim());
const Vector<BaseFloat> &weight_vec(weights_[i]);
output_col.AddMatVec(1.0, input_part,
kNoTrans, weight_vec, 0.0);
output->CopyColFromVec(output_col, i);
}
}
void ArbitraryResample::Resample(const VectorBase<BaseFloat> &input,
VectorBase<BaseFloat> *output) const {
KALDI_ASSERT(input.Dim() == num_samples_in_ &&
output->Dim() == weights_.size());
int32 output_dim = output->Dim();
for (int32 i = 0; i < output_dim; i++) {
SubVector<BaseFloat> input_part(input, first_index_[i], weights_[i].Dim());
(*output)(i) = VecVec(input_part, weights_[i]);
}
}
void ArbitraryResample::SetIndexes(const Vector<BaseFloat> &sample_points) {
int32 num_samples = sample_points.Dim();
first_index_.resize(num_samples);
weights_.resize(num_samples);
BaseFloat filter_width = num_zeros_ / (2.0 * filter_cutoff_);
for (int32 i = 0; i < num_samples; i++) {
// the t values are in seconds.
BaseFloat t = sample_points(i),
t_min = t - filter_width, t_max = t + filter_width;
int32 index_min = ceil(samp_rate_in_ * t_min),
index_max = floor(samp_rate_in_ * t_max);
// the ceil on index min and the floor on index_max are because there
// is no point using indices just outside the window (coeffs would be zero).
if (index_min < 0)
index_min = 0;
if (index_max >= num_samples_in_)
index_max = num_samples_in_ - 1;
first_index_[i] = index_min;
weights_[i].Resize(index_max - index_min + 1);
}
}
void ArbitraryResample::SetWeights(const Vector<BaseFloat> &sample_points) {
int32 num_samples_out = NumSamplesOut();
for (int32 i = 0; i < num_samples_out; i++) {
for (int32 j = 0 ; j < weights_[i].Dim(); j++) {
BaseFloat delta_t = sample_points(i) -
(first_index_[i] + j) / samp_rate_in_;
// Include at this point the factor of 1.0 / samp_rate_in_ which
// appears in the math.
weights_[i](j) = FilterFunc(delta_t) / samp_rate_in_;
}
}
}
/** Here, t is a time in seconds representing an offset from
the center of the windowed filter function, and FilterFunction(t)
returns the windowed filter function, described
in the header as h(t) = f(t)g(t), evaluated at t.
*/
// Evaluates the windowed filter at time offset t: the product of a
// raised-cosine (Hanning) window and a sinc low-pass filter. Same formula
// as LinearResample::FilterFunc.
BaseFloat ArbitraryResample::FilterFunc(BaseFloat t) const {
  // The window is nonzero only strictly inside its support,
  // |t| < num_zeros_ / (2 * filter_cutoff_).
  const bool inside_window = fabs(t) < num_zeros_ / (2.0 * filter_cutoff_);
  const BaseFloat window = inside_window
      ? 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t))
      : 0.0;

  // Sinc filter; at t == 0 use the limit of the function instead of 0/0.
  const BaseFloat filter = (t != 0.0)
      ? sin(M_2PI * filter_cutoff_ * t) / (M_PI * t)
      : 2.0 * filter_cutoff_;

  return filter * window;
}
// Convenience wrapper: resamples the whole of "wave" from orig_freq to
// new_freq in one shot (flush == true) with a fixed filter configuration.
// NOTE(review): LinearResample's constructor takes int32 rates, so the
// BaseFloat frequencies are implicitly converted to integers here.
void ResampleWaveform(BaseFloat orig_freq, const VectorBase<BaseFloat> &wave,
                      BaseFloat new_freq, Vector<BaseFloat> *new_wave) {
  // Number of zeros of the windowed filter.
  const int32 num_zeros = 6;
  // Cutoff slightly below half of the lower of the two rates.
  const BaseFloat lower_freq = std::min(orig_freq, new_freq);
  const BaseFloat cutoff = 0.99 * 0.5 * lower_freq;

  LinearResample resampler(orig_freq, new_freq, cutoff, num_zeros);
  resampler.Resample(wave, true, new_wave);
}
} // namespace kaldi

@ -1,287 +0,0 @@
// feat/resample.h
// Copyright 2013 Pegah Ghahremani
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2014 Yanqing Sun, Junjie Wang
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_RESAMPLE_H_
#define KALDI_FEAT_RESAMPLE_H_
#include <cassert>
#include <cstdlib>
#include <string>
#include <vector>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/**
\file[resample.h]
This header contains declarations of classes for resampling signals. The
normal cases of resampling a signal are upsampling and downsampling
(increasing and decreasing the sample rate of a signal, respectively),
although the ArbitraryResample class allows a more generic case where
we want to get samples of a signal at uneven intervals (for instance,
log-spaced).
The input signal is always evenly spaced, say sampled with frequency S, and
we assume the original signal was band-limited to S/2 or lower. The n'th
input sample x_n (with n = 0, 1, ...) is interpreted as the original
signal's value at time n/S.
For resampling, it is convenient to view the input signal as a
continuous function x(t) of t, where each sample x_n becomes a delta function
with magnitude x_n/S, at time n/S. If we band limit this to the Nyquist
frequency S/2, we can show that this is the same as the original signal
that was sampled. [assuming the original signal was periodic and band
limited.] In general we want to bandlimit to lower than S/2, because
we don't have a perfect filter and also because if we want to resample
at a lower frequency than S, we need to bandlimit to below half of that.
Anyway, suppose we want to bandlimit to C, with 0 < C < S/2. The perfect
rectangular filter with cutoff C is the sinc function,
\f[ f(t) = 2C sinc(2Ct), \f]
where sinc is the normalized sinc function \f$ sinc(t) = sin(pi t) / (pi t) \f$, with
\f$ sinc(0) = 1 \f$. This is not a practical filter, though, because it has
infinite support. At the cost of less-than-perfect rolloff, we can choose
a suitable windowing function g(t), and use f(t) g(t) as the filter. For
a windowing function we choose raised-cosine (Hanning) window with support
on [-w/2C, w/2C], where w >= 1 is an integer chosen by the user. w = 1
means we window the sinc function out to its first zero on the left and right,
w = 2 means the second zero, and so on; we normally choose w to be at least two.
We call this num_zeros, not w, in the code.
Convolving the signal x(t) with this windowed filter h(t) = f(t)g(t) and evaluating the resulting
signal s(t) at an arbitrary time t is easy: we have
\f[ s(t) = 1/S \sum_n x_n h(t - n/S) \f].
(note: the sign of t - n/S might be wrong, but it doesn't matter as the filter
and window are symmetric).
This is true for arbitrary values of t. What the class ArbitraryResample does
is to allow you to evaluate the signal for specified values of t.
*/
/**
Class ArbitraryResample allows you to resample a signal (assumed zero outside
the sample region, not periodic) at arbitrary specified time values, which
don't have to be linearly spaced. The low-pass filter cutoff
"filter_cutoff_hz" should be less than half the sample rate;
"num_zeros" should probably be at least two preferably more; higher numbers give
sharper filters but will be less efficient.
*/
class ArbitraryResample {
public:
/// Sets up resampling of a signal of num_samples_in samples, sampled at
/// samp_rate_hz, at the times (in seconds) given in sample_points_secs.
/// The implementation asserts that num_samples_in, samp_rate_hz,
/// filter_cutoff_hz and num_zeros are positive and that
/// 2 * filter_cutoff_hz <= samp_rate_hz.
ArbitraryResample(int32 num_samples_in,
BaseFloat samp_rate_hz,
BaseFloat filter_cutoff_hz,
const Vector<BaseFloat> &sample_points_secs,
int32 num_zeros);
/// Number of input samples this object was configured for.
int32 NumSamplesIn() const { return num_samples_in_; }
/// Number of output samples (one per requested sample point).
int32 NumSamplesOut() const { return weights_.size(); }
/// This function does the resampling.
/// input.NumRows() and output.NumRows() should be equal
/// and nonzero.
/// input.NumCols() should equal NumSamplesIn()
/// and output.NumCols() should equal NumSamplesOut().
void Resample(const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output) const;
/// This version of the Resample function processes just
/// one vector.
void Resample(const VectorBase<BaseFloat> &input,
VectorBase<BaseFloat> *output) const;
private:
// Sizes first_index_ and weights_ for the given sample points.
void SetIndexes(const Vector<BaseFloat> &sample_points);
// Fills in the values of weights_; relies on SetIndexes() having run.
void SetWeights(const Vector<BaseFloat> &sample_points);
// Windowed filter function evaluated at time offset t (in seconds).
BaseFloat FilterFunc(BaseFloat t) const;
int32 num_samples_in_;
BaseFloat samp_rate_in_;
BaseFloat filter_cutoff_;
int32 num_zeros_;
std::vector<int32> first_index_; // The first input-sample index that we sum
// over, for this output-sample index.
// Filter weights for each output-sample index; entry i applies to the
// consecutive input samples starting at first_index_[i].
std::vector<Vector<BaseFloat> > weights_;
};
/**
   LinearResample is a special case of ArbitraryResample, where we want to
   resample a signal at linearly spaced intervals (this means we want to
   upsample or downsample the signal).  It is more efficient than
   ArbitraryResample because we can construct it just once.

   We require that the input and output sampling rate be specified as
   integers, as this is an easy way to specify that their ratio be rational.
*/
class LinearResample {
 public:
  /// Constructor.  We make the input and output sample rates integers, because
  /// we are going to need to find a common divisor.  This should just remind
  /// you that they need to be integers.  The filter cutoff needs to be less
  /// than samp_rate_in_hz/2 and less than samp_rate_out_hz/2.  num_zeros
  /// controls the sharpness of the filter, more == sharper but less efficient.
  /// We suggest around 4 to 10 for normal use.
  LinearResample(int32 samp_rate_in_hz,
                 int32 samp_rate_out_hz,
                 BaseFloat filter_cutoff_hz,
                 int32 num_zeros);

  /// This function does the resampling.  If you call it with flush == true and
  /// you have never called it with flush == false, it just resamples the input
  /// signal (it resizes the output to a suitable number of samples).
  ///
  /// You can also use this function to process a signal a piece at a time.
  /// Suppose you break it into piece1, piece2, ... pieceN.  You can call
  /// \code{.cc}
  /// Resample(piece1, false, &output1);
  /// Resample(piece2, false, &output2);
  /// Resample(piece3, true, &output3);
  /// \endcode
  /// If you call it with flush == false, it won't output the last few samples
  /// but will remember them, so that if you later give it a second piece of
  /// the input signal it can process it correctly.
  /// If your most recent call to the object was with flush == false, it will
  /// have internal state; you can remove this by calling Reset().
  /// Empty input is acceptable.
  void Resample(const VectorBase<BaseFloat> &input,
                bool flush,
                Vector<BaseFloat> *output);

  /// Calling the function Reset() resets the state of the object prior to
  /// processing a new signal; it is only necessary if you have called
  /// Resample(x, false, y) for some signal, leading to a remainder of the
  /// signal being stored, but then abandon processing the signal before
  /// calling Resample(x, true, y) for the last piece.  Calling it
  /// unnecessarily between signals will not do any harm.
  void Reset();

  /// Return the input and output sampling rates (for checks, for example).
  /// These are const so they can be called on const objects/references.
  int32 GetInputSamplingRate() const { return samp_rate_in_; }
  int32 GetOutputSamplingRate() const { return samp_rate_out_; }

 private:
  /// This function outputs the number of output samples we will output
  /// for a signal with "input_num_samp" input samples.  If flush == true,
  /// we return the largest n such that
  /// (n/samp_rate_out_) is in the interval [ 0, input_num_samp/samp_rate_in_ ),
  /// and note that the interval is half-open.  If flush == false,
  /// define window_width as num_zeros / (2.0 * filter_cutoff_);
  /// we return the largest n such that (n/samp_rate_out_) is in the interval
  /// [ 0, input_num_samp/samp_rate_in_ - window_width ).
  int64 GetNumOutputSamples(int64 input_num_samp, bool flush) const;

  /// Given an output-sample index, this function outputs to *first_samp_in the
  /// first input-sample index that we have a weight on (may be negative),
  /// and to *samp_out_wrapped the index into weights_ where we can get the
  /// corresponding weights on the input.
  inline void GetIndexes(int64 samp_out,
                         int64 *first_samp_in,
                         int32 *samp_out_wrapped) const;

  void SetRemainder(const VectorBase<BaseFloat> &input);

  void SetIndexesAndWeights();

  BaseFloat FilterFunc(BaseFloat) const;

  // The following variables are provided by the user.
  int32 samp_rate_in_;
  int32 samp_rate_out_;
  BaseFloat filter_cutoff_;
  int32 num_zeros_;

  int32 input_samples_in_unit_;   ///< The number of input samples in the
                                  ///< smallest repeating unit:
                                  ///< input_samples_in_unit_ =
                                  ///< samp_rate_in_hz / Gcd(samp_rate_in_hz,
                                  ///< samp_rate_out_hz)
  int32 output_samples_in_unit_;  ///< The number of output samples in the
                                  ///< smallest repeating unit:
                                  ///< output_samples_in_unit_ =
                                  ///< samp_rate_out_hz / Gcd(samp_rate_in_hz,
                                  ///< samp_rate_out_hz)

  /// The first input-sample index that we sum over, for this output-sample
  /// index.  May be negative; any truncation at the beginning is handled
  /// separately.  This is just for the first few output samples, but we can
  /// extrapolate the correct input-sample index for arbitrary output samples.
  std::vector<int32> first_index_;

  /// Weights on the input samples, for this output-sample index.
  std::vector<Vector<BaseFloat> > weights_;

  // The following variables keep track of where we are in a particular signal,
  // if it is being provided over multiple calls to Resample().

  int64 input_sample_offset_;   ///< The number of input samples we have
                                ///< already received for this signal
                                ///< (including anything in remainder_)
  int64 output_sample_offset_;  ///< The number of samples we have already
                                ///< output for this signal.
  Vector<BaseFloat> input_remainder_;  ///< A small trailing part of the
                                       ///< previously seen input signal.
};
/**
Downsample or upsample a waveform. This is a convenience wrapper for the
class 'LinearResample'.
The low-pass filter cutoff used in 'LinearResample' is 0.99 of the Nyquist,
where the Nyquist is half of the minimum of (orig_freq, new_freq). The
resampling is done with a symmetric FIR filter with N_z (number of zeros)
as 6.
We compared the downsampling results with those from the sox resampling
toolkit.
Sox's design is inspired by Laurent De Soras' paper,
https://ccrma.stanford.edu/~jos/resample/Implementation.html
Note: we expect that while orig_freq and new_freq are of type BaseFloat, they
are actually required to have exact integer values (like 16000 or 8000) with
a ratio between them that can be expressed as a rational number with
reasonably small integer factors.

@param orig_freq  Sampling frequency of 'wave' (integer-valued, see note).
@param wave       The input waveform.
@param new_freq   Desired sampling frequency (integer-valued, see note).
@param new_wave   Output: receives the resampled waveform.
*/
void ResampleWaveform(BaseFloat orig_freq, const VectorBase<BaseFloat> &wave,
BaseFloat new_freq, Vector<BaseFloat> *new_wave);
/// Deprecated alias of ResampleWaveform(), retained only so that older code
/// keeps compiling.  All arguments are forwarded unchanged.
inline void DownsampleWaveForm(BaseFloat orig_freq,
                               const VectorBase<BaseFloat> &wave,
                               BaseFloat new_freq,
                               Vector<BaseFloat> *new_wave) {
  ResampleWaveform(orig_freq, wave, new_freq, new_wave);
}
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_RESAMPLE_H_

@ -1,129 +0,0 @@
// feat/signal.cc
// Copyright 2015 Tom Ko
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "feat/signal.h"
namespace kaldi {
void ElementwiseProductOfFft(const Vector<BaseFloat> &a, Vector<BaseFloat> *b) {
int32 num_fft_bins = a.Dim() / 2;
for (int32 i = 0; i < num_fft_bins; i++) {
// do complex multiplication
ComplexMul(a(2*i), a(2*i + 1), &((*b)(2*i)), &((*b)(2*i + 1)));
}
}
void ConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
int32 signal_length = signal->Dim();
int32 filter_length = filter.Dim();
int32 output_length = signal_length + filter_length - 1;
Vector<BaseFloat> signal_padded(output_length);
signal_padded.SetZero();
for (int32 i = 0; i < signal_length; i++) {
for (int32 j = 0; j < filter_length; j++) {
signal_padded(i + j) += (*signal)(i) * filter(j);
}
}
signal->Resize(output_length);
signal->CopyFromVec(signal_padded);
}
void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
int32 signal_length = signal->Dim();
int32 filter_length = filter.Dim();
int32 output_length = signal_length + filter_length - 1;
int32 fft_length = RoundUpToNearestPowerOfTwo(output_length);
KALDI_VLOG(1) << "fft_length for full signal convolution is " << fft_length;
SplitRadixRealFft<BaseFloat> srfft(fft_length);
Vector<BaseFloat> filter_padded(fft_length);
filter_padded.Range(0, filter_length).CopyFromVec(filter);
srfft.Compute(filter_padded.Data(), true);
Vector<BaseFloat> signal_padded(fft_length);
signal_padded.Range(0, signal_length).CopyFromVec(*signal);
srfft.Compute(signal_padded.Data(), true);
ElementwiseProductOfFft(filter_padded, &signal_padded);
srfft.Compute(signal_padded.Data(), false);
signal_padded.Scale(1.0 / fft_length);
signal->Resize(output_length);
signal->CopyFromVec(signal_padded.Range(0, output_length));
}
// Convolves '*signal' with 'filter' block by block (overlap-add): the signal is
// processed in chunks of 'block_length' samples, each chunk is convolved with
// the filter via an FFT of length 'fft_length', and the part of each chunk's
// result that spills past the chunk is carried over in 'temp_pad' and added to
// the start of the next chunk.  On return '*signal' has length
// signal length + filter length - 1.
void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
int32 signal_length = signal->Dim();
int32 filter_length = filter.Dim();
int32 output_length = signal_length + filter_length - 1;
// Grow the signal in place to the output length, keeping the existing samples
// (presumably the added tail is zero-filled -- TODO confirm kCopyData semantics).
signal->Resize(output_length, kCopyData);
KALDI_VLOG(1) << "Length of the filter is " << filter_length;
int32 fft_length = RoundUpToNearestPowerOfTwo(4 * filter_length);
KALDI_VLOG(1) << "Best FFT length is " << fft_length;
// Largest block whose convolution with the filter still fits in fft_length.
int32 block_length = fft_length - filter_length + 1;
KALDI_VLOG(1) << "Block size is " << block_length;
SplitRadixRealFft<BaseFloat> srfft(fft_length);
// The filter spectrum is computed once and reused for every block.
Vector<BaseFloat> filter_padded(fft_length);
filter_padded.Range(0, filter_length).CopyFromVec(filter);
srfft.Compute(filter_padded.Data(), true);
// Tail (filter_length - 1 samples) of the previous block's result that still
// has to be added to the beginning of the current block.
Vector<BaseFloat> temp_pad(filter_length - 1);
temp_pad.SetZero();
Vector<BaseFloat> signal_block_padded(fft_length);
for (int32 po = 0; po < output_length; po += block_length) {
// get a block of the signal
int32 process_length = std::min(block_length, output_length - po);
signal_block_padded.SetZero();
signal_block_padded.Range(0, process_length).CopyFromVec(signal->Range(po, process_length));
// Transform, multiply by the filter spectrum, transform back, and rescale
// by 1 / fft_length.
srfft.Compute(signal_block_padded.Data(), true);
ElementwiseProductOfFft(filter_padded, &signal_block_padded);
srfft.Compute(signal_block_padded.Data(), false);
signal_block_padded.Scale(1.0 / fft_length);
// combine the block
if (po + block_length < output_length) { // current block is not the last block
// Order matters here: first overwrite this block's samples with the new
// result, then add the tail carried over from the previous block, and only
// then save this block's own tail for the next iteration.
signal->Range(po, block_length).CopyFromVec(signal_block_padded.Range(0, block_length));
signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad);
temp_pad.CopyFromVec(signal_block_padded.Range(block_length, filter_length - 1));
} else {
// Last block: only output_length - po samples remain to be written.
signal->Range(po, output_length - po).CopyFromVec(
signal_block_padded.Range(0, output_length - po));
// Add as much of the carried-over tail as fits in the remaining samples.
if (filter_length - 1 < output_length - po)
signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad);
else
signal->Range(po, output_length - po).AddVec(1.0, temp_pad.Range(0, output_length - po));
}
}
}
}

@ -1,58 +0,0 @@
// feat/signal.h
// Copyright 2015 Tom Ko
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_SIGNAL_H_
#define KALDI_FEAT_SIGNAL_H_
#include "base/kaldi-common.h"
#include "util/common-utils.h"
namespace kaldi {
/*
The following three functions provide the same functionality but use
different implementations, and therefore differ in efficiency. After the
convolution, the length of the signal will be extended to
(original signal length + filter length - 1).
*/
/*
This function implements a simple non-FFT-based convolution of two signals.
It is suggested to use one of the FFT-based convolution functions instead,
which are more efficient.
The result is written back into *signal, which is resized accordingly.
*/
void ConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
/*
This function implements FFT-based convolution of two signals.
It is expected to be less efficient than FFTbasedBlockConvolveSignals(),
as it processes the entire signal with a single FFT.
The result is written back into *signal, which is resized accordingly.
*/
void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
/*
This function implements FFT-based block convolution of two signals using
the overlap-add method. This is an efficient way to evaluate the discrete
convolution of a long signal with a finite impulse response filter.
The result is written back into *signal, which is resized accordingly.
*/
void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
} // namespace kaldi
#endif // KALDI_FEAT_SIGNAL_H_

@ -1,16 +0,0 @@
# Kaldi matrix/vector library. Sources are listed explicitly so that additions
# and removals show up in diffs.
add_library(kaldi-matrix
  compressed-matrix.cc
  kaldi-matrix.cc
  kaldi-vector.cc
  matrix-functions.cc
  optimization.cc
  packed-matrix.cc
  qr.cc
  sparse-matrix.cc
  sp-matrix.cc
  srfft.cc
  tp-matrix.cc
)

# PUBLIC keeps the transitive-link behaviour of the former keyword-less
# signature while making the visibility explicit.
target_link_libraries(kaldi-matrix
  PUBLIC
    gfortran
    kaldi-base
    libopenblas.a
)

@ -1,491 +0,0 @@
// matrix/cblas-wrappers.h
// Copyright 2012 Johns Hopkins University (author: Daniel Povey);
// Haihua Xu; Wei Shi
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_CBLAS_WRAPPERS_H_
#define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1
#include <limits>
#include "matrix/sp-matrix.h"
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
#include "matrix/matrix-functions.h"
#include "matrix/kaldi-blas.h"
// Do not include this file directly. It is to be included
// by .cc files in this directory.
namespace kaldi {
/// Precision-overloaded front end for the BLAS vector copy; the float version
/// forwards to cblas_scopy and the double version to cblas_dcopy.
inline void cblas_Xcopy(const int N,
                        const float *X, const int incX,
                        float *Y, const int incY) {
  cblas_scopy(N, X, incX, Y, incY);
}

inline void cblas_Xcopy(const int N,
                        const double *X, const int incX,
                        double *Y, const int incY) {
  cblas_dcopy(N, X, incX, Y, incY);
}
/// Precision-overloaded front end for cblas_sasum / cblas_dasum; returns
/// whatever the underlying routine returns.
inline float cblas_Xasum(const int N, const float *X, const int incX) {
  const float total = cblas_sasum(N, X, incX);
  return total;
}

inline double cblas_Xasum(const int N, const double *X, const int incX) {
  const double total = cblas_dasum(N, X, incX);
  return total;
}
/// Precision-overloaded front end for cblas_srot / cblas_drot, applied to the
/// vectors X and Y with coefficients c and s.
inline void cblas_Xrot(const int N,
                       float *X, const int incX,
                       float *Y, const int incY,
                       const float c, const float s) {
  cblas_srot(N, X, incX, Y, incY, c, s);
}

inline void cblas_Xrot(const int N,
                       double *X, const int incX,
                       double *Y, const int incY,
                       const double c, const double s) {
  cblas_drot(N, X, incX, Y, incY, c, s);
}
/// Precision-overloaded front end for cblas_sdot / cblas_ddot; returns the
/// value computed by the underlying routine.
inline float cblas_Xdot(const int N,
                        const float *const X, const int incX,
                        const float *const Y, const int incY) {
  const float result = cblas_sdot(N, X, incX, Y, incY);
  return result;
}

inline double cblas_Xdot(const int N,
                         const double *const X, const int incX,
                         const double *const Y, const int incY) {
  const double result = cblas_ddot(N, X, incX, Y, incY);
  return result;
}
/// Precision-overloaded front end for cblas_saxpy / cblas_daxpy
/// (Y += alpha * X in BLAS terms).
inline void cblas_Xaxpy(const int N, const float alpha,
                        const float *X, const int incX,
                        float *Y, const int incY) {
  cblas_saxpy(N, alpha, X, incX, Y, incY);
}

inline void cblas_Xaxpy(const int N, const double alpha,
                        const double *X, const int incX,
                        double *Y, const int incY) {
  cblas_daxpy(N, alpha, X, incX, Y, incY);
}
/// Precision-overloaded front end for cblas_sscal / cblas_dscal, applied to
/// N elements of 'data' with stride 'inc'.
inline void cblas_Xscal(const int N, const float alpha,
                        float *data, const int inc) {
  cblas_sscal(N, alpha, data, inc);
}

inline void cblas_Xscal(const int N, const double alpha,
                        double *data, const int inc) {
  cblas_dscal(N, alpha, data, inc);
}
/// Packed symmetric matrix-vector product, alpha-first argument order.
/// Forwards to cblas_sspmv / cblas_dspmv with row-major, lower-triangular
/// storage hard-coded.
inline void cblas_Xspmv(const float alpha, const int num_rows,
                        const float *Mdata,
                        const float *v, const int v_inc,
                        const float beta,
                        float *y, const int y_inc) {
  cblas_sspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata,
              v, v_inc, beta, y, y_inc);
}

inline void cblas_Xspmv(const double alpha, const int num_rows,
                        const double *Mdata,
                        const double *v, const int v_inc,
                        const double beta,
                        double *y, const int y_inc) {
  cblas_dspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata,
              v, v_inc, beta, y, y_inc);
}
/// Packed triangular matrix-vector product.  Forwards to cblas_stpmv /
/// cblas_dtpmv with row-major, lower-triangular, non-unit-diagonal hard-coded;
/// 'trans' is cast to the CBLAS transpose enum.
inline void cblas_Xtpmv(MatrixTransposeType trans, const float *Mdata,
                        const int num_rows, float *y, const int y_inc) {
  const CBLAS_TRANSPOSE op = static_cast<CBLAS_TRANSPOSE>(trans);
  cblas_stpmv(CblasRowMajor, CblasLower, op, CblasNonUnit,
              num_rows, Mdata, y, y_inc);
}

inline void cblas_Xtpmv(MatrixTransposeType trans, const double *Mdata,
                        const int num_rows, double *y, const int y_inc) {
  const CBLAS_TRANSPOSE op = static_cast<CBLAS_TRANSPOSE>(trans);
  cblas_dtpmv(CblasRowMajor, CblasLower, op, CblasNonUnit,
              num_rows, Mdata, y, y_inc);
}
/// Packed triangular solve.  Forwards to cblas_stpsv / cblas_dtpsv with
/// row-major, lower-triangular, non-unit-diagonal hard-coded; 'trans' is cast
/// to the CBLAS transpose enum.
inline void cblas_Xtpsv(MatrixTransposeType trans, const float *Mdata,
                        const int num_rows, float *y, const int y_inc) {
  const CBLAS_TRANSPOSE op = static_cast<CBLAS_TRANSPOSE>(trans);
  cblas_stpsv(CblasRowMajor, CblasLower, op, CblasNonUnit,
              num_rows, Mdata, y, y_inc);
}

inline void cblas_Xtpsv(MatrixTransposeType trans, const double *Mdata,
                        const int num_rows, double *y, const int y_inc) {
  const CBLAS_TRANSPOSE op = static_cast<CBLAS_TRANSPOSE>(trans);
  cblas_dtpsv(CblasRowMajor, CblasLower, op, CblasNonUnit,
              num_rows, Mdata, y, y_inc);
}
// Packed symmetric matrix-vector product, dimension-first argument order:
// x = alpha * M * y + beta * x.
// Forwards to cblas_sspmv / cblas_dspmv with row-major, lower-triangular
// storage hard-coded.
inline void cblas_Xspmv(MatrixIndexT dim, float alpha, const float *Mdata,
                        const float *ydata, MatrixIndexT ystride,
                        float beta,
                        float *xdata, MatrixIndexT xstride) {
  cblas_sspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
              ydata, ystride,
              beta,
              xdata, xstride);
}

inline void cblas_Xspmv(MatrixIndexT dim, double alpha, const double *Mdata,
                        const double *ydata, MatrixIndexT ystride,
                        double beta,
                        double *xdata, MatrixIndexT xstride) {
  cblas_dspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
              ydata, ystride,
              beta,
              xdata, xstride);
}
// Symmetric rank-2 update on a packed matrix: A += alpha * (x y' + y x').
// Forwards to cblas_sspr2 / cblas_dspr2 with row-major, lower-triangular
// storage hard-coded.
inline void cblas_Xspr2(MatrixIndexT dim, float alpha,
                        const float *Xdata, MatrixIndexT incX,
                        const float *Ydata, MatrixIndexT incY,
                        float *Adata) {
  cblas_sspr2(CblasRowMajor, CblasLower, dim, alpha,
              Xdata, incX, Ydata, incY, Adata);
}

inline void cblas_Xspr2(MatrixIndexT dim, double alpha,
                        const double *Xdata, MatrixIndexT incX,
                        const double *Ydata, MatrixIndexT incY,
                        double *Adata) {
  cblas_dspr2(CblasRowMajor, CblasLower, dim, alpha,
              Xdata, incX, Ydata, incY, Adata);
}
// Symmetric rank-1 update on a packed matrix: A += alpha * (x x').
// Forwards to cblas_sspr / cblas_dspr with row-major, lower-triangular
// storage hard-coded.
inline void cblas_Xspr(MatrixIndexT dim, float alpha,
                       const float *Xdata, MatrixIndexT incX,
                       float *Adata) {
  cblas_sspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
}

inline void cblas_Xspr(MatrixIndexT dim, double alpha,
                       const double *Xdata, MatrixIndexT incX,
                       double *Adata) {
  cblas_dspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
}
// General matrix-vector product (sgemv / dgemv): y = alpha M x + beta y.
// Row-major storage is hard-coded; 'trans' is cast to the CBLAS transpose enum.
inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
                        MatrixIndexT num_cols, float alpha, const float *Mdata,
                        MatrixIndexT stride, const float *xdata,
                        MatrixIndexT incX, float beta, float *ydata,
                        MatrixIndexT incY) {
  const CBLAS_TRANSPOSE op = static_cast<CBLAS_TRANSPOSE>(trans);
  cblas_sgemv(CblasRowMajor, op, num_rows, num_cols, alpha,
              Mdata, stride, xdata, incX, beta, ydata, incY);
}

inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
                        MatrixIndexT num_cols, double alpha, const double *Mdata,
                        MatrixIndexT stride, const double *xdata,
                        MatrixIndexT incX, double beta, double *ydata,
                        MatrixIndexT incY) {
  const CBLAS_TRANSPOSE op = static_cast<CBLAS_TRANSPOSE>(trans);
  cblas_dgemv(CblasRowMajor, op, num_rows, num_cols, alpha,
              Mdata, stride, xdata, incX, beta, ydata, incY);
}
// Banded matrix-vector product (sgbmv / dgbmv): y = alpha M x + beta y.
// Row-major storage is hard-coded; 'num_below' and 'num_above' are passed
// through as the band widths and 'trans' is cast to the CBLAS transpose enum.
inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
                        MatrixIndexT num_cols, MatrixIndexT num_below,
                        MatrixIndexT num_above, float alpha, const float *Mdata,
                        MatrixIndexT stride, const float *xdata,
                        MatrixIndexT incX, float beta, float *ydata,
                        MatrixIndexT incY) {
  const CBLAS_TRANSPOSE op = static_cast<CBLAS_TRANSPOSE>(trans);
  cblas_sgbmv(CblasRowMajor, op, num_rows, num_cols, num_below, num_above,
              alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
}

inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
                        MatrixIndexT num_cols, MatrixIndexT num_below,
                        MatrixIndexT num_above, double alpha, const double *Mdata,
                        MatrixIndexT stride, const double *xdata,
                        MatrixIndexT incX, double beta, double *ydata,
                        MatrixIndexT incY) {
  const CBLAS_TRANSPOSE op = static_cast<CBLAS_TRANSPOSE>(trans);
  cblas_dgbmv(CblasRowMajor, op, num_rows, num_cols, num_below, num_above,
              alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
}
/// Matrix-vector product y = alpha * op(M) * x + beta * y that skips zero
/// entries of x.  y is first scaled by beta (unless beta == 1); then, for each
/// nonzero x_k, the k'th column of M (kNoTrans) or the k'th row of M
/// (otherwise) is added to y, scaled by alpha * x_k.
template<typename Real>
inline void Xgemv_sparsevec(MatrixTransposeType trans, MatrixIndexT num_rows,
                            MatrixIndexT num_cols, Real alpha, const Real *Mdata,
                            MatrixIndexT stride, const Real *xdata,
                            MatrixIndexT incX, Real beta, Real *ydata,
                            MatrixIndexT incY) {
  const bool no_trans = (trans == kNoTrans);
  // Dimensions of y and x for the selected orientation.
  const MatrixIndexT y_dim = no_trans ? num_rows : num_cols;
  const MatrixIndexT x_dim = no_trans ? num_cols : num_rows;

  if (beta != 1.0) cblas_Xscal(y_dim, beta, ydata, incY);

  for (MatrixIndexT k = 0; k < x_dim; k++) {
    Real x_k = xdata[k * incX];
    if (x_k == 0.0) continue;
    if (no_trans) {
      // Column k of M: starts at Mdata + k, elements are 'stride' apart.
      cblas_Xaxpy(num_rows, x_k * alpha, Mdata + k, stride, ydata, incY);
    } else {
      // Row k of M: starts at Mdata + k * stride, elements are contiguous.
      cblas_Xaxpy(num_cols, x_k * alpha,
                  Mdata + (k * stride), 1, ydata, incY);
    }
  }
}
inline void cblas_Xgemm(const float alpha,
MatrixTransposeType transA,
const float *Adata,
MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
MatrixTransposeType transB,
const float *Bdata, MatrixIndexT b_stride,
const float beta,
float *Mdata,
MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
cblas_sgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
static_cast<CBLAS_TRANSPOSE>(transB),
num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
alpha, Adata, a_stride, Bdata, b_stride,
beta, Mdata, stride);
}
inline void cblas_Xgemm(const double alpha,
MatrixTransposeType transA,
const double *Adata,
MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
MatrixTransposeType transB,
const double *Bdata, MatrixIndexT b_stride,
const double beta,
double *Mdata,
MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
cblas_dgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
static_cast<CBLAS_TRANSPOSE>(transB),
num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
alpha, Adata, a_stride, Bdata, b_stride,
beta, Mdata, stride);
}
/// Symmetric matrix-matrix product via cblas_ssymm / cblas_dsymm.  Row-major
/// storage, left-side multiplication and lower-triangular storage are
/// hard-coded, and 'sz' is used for both dimensions.
inline void cblas_Xsymm(const float alpha,
                        MatrixIndexT sz,
                        const float *Adata, MatrixIndexT a_stride,
                        const float *Bdata, MatrixIndexT b_stride,
                        const float beta,
                        float *Mdata, MatrixIndexT stride) {
  cblas_ssymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha,
              Adata, a_stride,
              Bdata, b_stride,
              beta,
              Mdata, stride);
}

inline void cblas_Xsymm(const double alpha,
                        MatrixIndexT sz,
                        const double *Adata, MatrixIndexT a_stride,
                        const double *Bdata, MatrixIndexT b_stride,
                        const double beta,
                        double *Mdata, MatrixIndexT stride) {
  cblas_dsymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha,
              Adata, a_stride,
              Bdata, b_stride,
              beta,
              Mdata, stride);
}
// ger: M += alpha x y^T.
// Rank-1 update via cblas_sger / cblas_dger with row-major storage hard-coded.
// 'incX' and 'incY' are the strides of xdata and ydata; they are forwarded to
// BLAS (previously they were ignored and a stride of 1 was always used, so
// callers passing 1 see no change).
inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, float alpha,
                       const float *xdata, MatrixIndexT incX, const float *ydata,
                       MatrixIndexT incY, float *Mdata, MatrixIndexT stride) {
  cblas_sger(CblasRowMajor, num_rows, num_cols, alpha, xdata, incX, ydata, incY,
             Mdata, stride);
}

inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, double alpha,
                       const double *xdata, MatrixIndexT incX, const double *ydata,
                       MatrixIndexT incY, double *Mdata, MatrixIndexT stride) {
  cblas_dger(CblasRowMajor, num_rows, num_cols, alpha, xdata, incX, ydata, incY,
             Mdata, stride);
}
// syrk: symmetric rank-k update.
// if trans==kNoTrans, then C = alpha A A^T + beta C
// else C = alpha A^T A + beta C.
// note: dim_c is dim(C), other_dim_a is the "other" dimension of A, i.e.
// num-cols(A) if kNoTrans, or num-rows(A) if kTrans.
// We only need the row-major and lower-triangular option of this, and this
// is hard-coded.
inline void cblas_Xsyrk (
const MatrixTransposeType trans, const MatrixIndexT dim_c,
const MatrixIndexT other_dim_a, const float alpha, const float *A,
const MatrixIndexT a_stride, const float beta, float *C,
const MatrixIndexT c_stride) {
cblas_ssyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
}
inline void cblas_Xsyrk(
const MatrixTransposeType trans, const MatrixIndexT dim_c,
const MatrixIndexT other_dim_a, const double alpha, const double *A,
const MatrixIndexT a_stride, const double beta, double *C,
const MatrixIndexT c_stride) {
cblas_dsyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
}
/// Banded symmetric matrix-vector multiply, restricted to the case where the
/// matrix is diagonal: the BLAS call is made with zero off-diagonals (k = 0),
/// a leading dimension of 1 and unit strides for x and y.  This is used for
/// elementwise multiplication, which is why several sbmv arguments are not
/// exposed by this wrapper.
inline void cblas_Xsbmv1(const MatrixIndexT dim,
                         const double *A,
                         const double alpha,
                         const double *x,
                         const double beta,
                         double *y) {
  cblas_dsbmv(CblasRowMajor, CblasLower, dim, 0, alpha,
              A, 1, x, 1, beta, y, 1);
}

inline void cblas_Xsbmv1(const MatrixIndexT dim,
                         const float *A,
                         const float alpha,
                         const float *x,
                         const float beta,
                         float *y) {
  cblas_ssbmv(CblasRowMajor, CblasLower, dim, 0, alpha,
              A, 1, x, 1, beta, y, 1);
}
/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could
/// extend this somehow.
inline void mul_elements(
const MatrixIndexT dim,
const double *a,
double *b) { // does b *= a, elementwise.
double c1, c2, c3, c4;
MatrixIndexT i;
for (i = 0; i + 4 <= dim; i += 4) {
c1 = a[i] * b[i];
c2 = a[i+1] * b[i+1];
c3 = a[i+2] * b[i+2];
c4 = a[i+3] * b[i+3];
b[i] = c1;
b[i+1] = c2;
b[i+2] = c3;
b[i+3] = c4;
}
for (; i < dim; i++)
b[i] *= a[i];
}
inline void mul_elements(
const MatrixIndexT dim,
const float *a,
float *b) { // does b *= a, elementwise.
float c1, c2, c3, c4;
MatrixIndexT i;
for (i = 0; i + 4 <= dim; i += 4) {
c1 = a[i] * b[i];
c2 = a[i+1] * b[i+1];
c3 = a[i+2] * b[i+2];
c4 = a[i+3] * b[i+3];
b[i] = c1;
b[i+1] = c2;
b[i+2] = c3;
b[i+3] = c4;
}
for (; i < dim; i++)
b[i] *= a[i];
}
// add clapack here
#if !defined(HAVE_ATLAS)
/// Precision-overloaded front end for LAPACK stptri_ / dtptri_, called with
/// the option strings "U" and "N"; the status is written to *result.
inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata,
                           KaldiBlasInt *result) {
  char *uplo = const_cast<char *>("U");
  char *diag = const_cast<char *>("N");
  stptri_(uplo, diag, num_rows, Mdata, result);
}

inline void clapack_Xtptri(KaldiBlasInt *num_rows, double *Mdata,
                           KaldiBlasInt *result) {
  char *uplo = const_cast<char *>("U");
  char *diag = const_cast<char *>("N");
  dtptri_(uplo, diag, num_rows, Mdata, result);
}
//
/// Precision-overloaded front end for LAPACK sgetrf_ / dgetrf_; all arguments
/// are forwarded unchanged and the status is written to *result.
inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols,
                            float *Mdata, KaldiBlasInt *stride,
                            KaldiBlasInt *pivot, KaldiBlasInt *result) {
  sgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
}

inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols,
                            double *Mdata, KaldiBlasInt *stride,
                            KaldiBlasInt *pivot, KaldiBlasInt *result) {
  dgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
}
//
/// Precision-overloaded front end for LAPACK sgetri_ / dgetri_; 'p_work' and
/// 'l_work' are the caller-provided workspace and its size, and the status is
/// written to *result.
inline void clapack_Xgetri2(KaldiBlasInt *num_rows, float *Mdata,
                            KaldiBlasInt *stride, KaldiBlasInt *pivot,
                            float *p_work, KaldiBlasInt *l_work,
                            KaldiBlasInt *result) {
  sgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
}

inline void clapack_Xgetri2(KaldiBlasInt *num_rows, double *Mdata,
                            KaldiBlasInt *stride, KaldiBlasInt *pivot,
                            double *p_work, KaldiBlasInt *l_work,
                            KaldiBlasInt *result) {
  dgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
}
//
/// Precision-overloaded front end for LAPACK sgesvd_ / dgesvd_.  Arguments are
/// forwarded in the order given (note that num_cols precedes num_rows); the
/// status is written to *result.
inline void clapack_Xgesvd(char *v, char *u,
                           KaldiBlasInt *num_cols, KaldiBlasInt *num_rows,
                           float *Mdata, KaldiBlasInt *stride,
                           float *sv,
                           float *Vdata, KaldiBlasInt *vstride,
                           float *Udata, KaldiBlasInt *ustride,
                           float *p_work, KaldiBlasInt *l_work,
                           KaldiBlasInt *result) {
  sgesvd_(v, u, num_cols, num_rows, Mdata, stride, sv,
          Vdata, vstride, Udata, ustride, p_work, l_work, result);
}

inline void clapack_Xgesvd(char *v, char *u,
                           KaldiBlasInt *num_cols, KaldiBlasInt *num_rows,
                           double *Mdata, KaldiBlasInt *stride,
                           double *sv,
                           double *Vdata, KaldiBlasInt *vstride,
                           double *Udata, KaldiBlasInt *ustride,
                           double *p_work, KaldiBlasInt *l_work,
                           KaldiBlasInt *result) {
  dgesvd_(v, u, num_cols, num_rows, Mdata, stride, sv,
          Vdata, vstride, Udata, ustride, p_work, l_work, result);
}
//
/// Precision-overloaded front end for LAPACK ssptri_ / dsptri_, called with
/// the option string "U"; the status is written to *result.
void inline clapack_Xsptri(KaldiBlasInt *num_rows, float *Mdata,
                           KaldiBlasInt *ipiv, float *work,
                           KaldiBlasInt *result) {
  char *uplo = const_cast<char *>("U");
  ssptri_(uplo, num_rows, Mdata, ipiv, work, result);
}

void inline clapack_Xsptri(KaldiBlasInt *num_rows, double *Mdata,
                           KaldiBlasInt *ipiv, double *work,
                           KaldiBlasInt *result) {
  char *uplo = const_cast<char *>("U");
  dsptri_(uplo, num_rows, Mdata, ipiv, work, result);
}
//
/// Precision-overloaded front end for LAPACK ssptrf_ / dsptrf_, called with
/// the option string "U"; the status is written to *result.
void inline clapack_Xsptrf(KaldiBlasInt *num_rows, float *Mdata,
                           KaldiBlasInt *ipiv, KaldiBlasInt *result) {
  char *uplo = const_cast<char *>("U");
  ssptrf_(uplo, num_rows, Mdata, ipiv, result);
}

void inline clapack_Xsptrf(KaldiBlasInt *num_rows, double *Mdata,
                           KaldiBlasInt *ipiv, KaldiBlasInt *result) {
  char *uplo = const_cast<char *>("U");
  dsptrf_(uplo, num_rows, Mdata, ipiv, result);
}
#else
/// ATLAS variant: precision-overloaded front end for clapack_sgetrf /
/// clapack_dgetrf with column-major order hard-coded.  The routine's return
/// code is stored in *result.
inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
                           float *Mdata, MatrixIndexT stride,
                           int *pivot, int *result) {
  const int status = clapack_sgetrf(CblasColMajor, num_rows, num_cols,
                                    Mdata, stride, pivot);
  *result = status;
}

inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
                           double *Mdata, MatrixIndexT stride,
                           int *pivot, int *result) {
  const int status = clapack_dgetrf(CblasColMajor, num_rows, num_cols,
                                    Mdata, stride, pivot);
  *result = status;
}
//
/// ATLAS variant: precision-overloaded front end for clapack_strtri /
/// clapack_dtrtri with column-major, upper-triangular, non-unit-diagonal
/// hard-coded.  Returns the routine's return code.
inline int clapack_Xtrtri(int num_rows, float *Mdata, MatrixIndexT stride) {
  const int status = clapack_strtri(CblasColMajor, CblasUpper, CblasNonUnit,
                                    num_rows, Mdata, stride);
  return status;
}

inline int clapack_Xtrtri(int num_rows, double *Mdata, MatrixIndexT stride) {
  const int status = clapack_dtrtri(CblasColMajor, CblasUpper, CblasNonUnit,
                                    num_rows, Mdata, stride);
  return status;
}
//
/// ATLAS variant: precision-overloaded front end for clapack_sgetri /
/// clapack_dgetri with column-major order hard-coded.  The routine's return
/// code is stored in *result.
inline void clapack_Xgetri(MatrixIndexT num_rows, float *Mdata,
                           MatrixIndexT stride, int *pivot, int *result) {
  const int status =
      clapack_sgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
  *result = status;
}

inline void clapack_Xgetri(MatrixIndexT num_rows, double *Mdata,
                           MatrixIndexT stride, int *pivot, int *result) {
  const int status =
      clapack_dgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
  *result = status;
}
#endif
}
// namespace kaldi
#endif

@ -1,876 +0,0 @@
// matrix/compressed-matrix.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Frantisek Skala, Wei Shi
// 2015 Tom Ko
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "matrix/compressed-matrix.h"
#include <algorithm>
namespace kaldi {
// static
// Returns the size in bytes of the compressed data described by 'header':
// the global header itself plus the per-format payload.
MatrixIndexT CompressedMatrix::DataSize(const GlobalHeader &header) {
  const DataFormat format = static_cast<DataFormat>(header.format);

  // One byte per element, plus one PerColHeader per column.
  if (format == kOneByteWithColHeaders)
    return sizeof(GlobalHeader) +
        header.num_cols * (sizeof(PerColHeader) + header.num_rows);

  // Two bytes per element.
  if (format == kTwoByte)
    return sizeof(GlobalHeader) +
        2 * header.num_rows * header.num_cols;

  // The only remaining format: one byte per element, no column headers.
  KALDI_ASSERT(format == kOneByte);
  return sizeof(GlobalHeader) +
      header.num_rows * header.num_cols;
}
// scale all element of matrix by scaling floats
// in GlobalHeader with alpha.
void CompressedMatrix::Scale(float alpha) {
if (data_ != NULL) {
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
// scale the floating point values in each PerColHolder
// and leave all integers the same.
h->min_value *= alpha;
h->range *= alpha;
}
}
template<typename Real> // static inline
void CompressedMatrix::ComputeGlobalHeader(
const MatrixBase<Real> &mat, CompressionMethod method,
GlobalHeader *header) {
if (method == kAutomaticMethod) {
if (mat.NumRows() > 8) method = kSpeechFeature;
else method = kTwoByteAuto;
}
switch (method) {
case kSpeechFeature:
header->format = static_cast<int32>(kOneByteWithColHeaders); // 1.
break;
case kTwoByteAuto: case kTwoByteSignedInteger:
header->format = static_cast<int32>(kTwoByte); // 2.
break;
case kOneByteAuto: case kOneByteUnsignedInteger: case kOneByteZeroOne:
header->format = static_cast<int32>(kOneByte); // 3.
break;
default:
KALDI_ERR << "Invalid compression type: "
<< static_cast<int32>(method);
}
header->num_rows = mat.NumRows();
header->num_cols = mat.NumCols();
// Now compute 'min_value' and 'range'.
switch (method) {
case kSpeechFeature: case kTwoByteAuto: case kOneByteAuto: {
float min_value = mat.Min(), max_value = mat.Max();
// ensure that max_value is strictly greater than min_value, even if matrix is
// constant; this avoids crashes in ComputeColHeader when compressing speech
// featupres.
if (max_value == min_value)
max_value = min_value + (1.0 + fabs(min_value));
KALDI_ASSERT(min_value - min_value == 0 &&
max_value - max_value == 0 &&
"Cannot compress a matrix with Nan's or Inf's");
header->min_value = min_value;
header->range = max_value - min_value;
// we previously checked that max_value != min_value, so their
// difference should be nonzero.
KALDI_ASSERT(header->range > 0.0);
break;
}
case kTwoByteSignedInteger: {
header->min_value = -32768.0;
header->range = 65535.0;
break;
}
case kOneByteUnsignedInteger: {
header->min_value = 0.0;
header->range = 255.0;
break;
}
case kOneByteZeroOne: {
header->min_value = 0.0;
header->range = 1.0;
break;
}
default:
KALDI_ERR << "Unknown compression method = "
<< static_cast<int32>(method);
}
KALDI_COMPILE_TIME_ASSERT(sizeof(*header) == 20); // otherwise
// something weird is happening and our code probably won't work or
// won't be robust across platforms.
}
template<typename Real>
void CompressedMatrix::CopyFromMat(
const MatrixBase<Real> &mat, CompressionMethod method) {
if (data_ != NULL) {
delete [] static_cast<float*>(data_); // call delete [] because was allocated with new float[]
data_ = NULL;
}
if (mat.NumRows() == 0) { return; } // Zero-size matrix stored as zero pointer.
GlobalHeader global_header;
ComputeGlobalHeader(mat, method, &global_header);
int32 data_size = DataSize(global_header);
data_ = AllocateData(data_size);
*(reinterpret_cast<GlobalHeader*>(data_)) = global_header;
DataFormat format = static_cast<DataFormat>(global_header.format);
if (format == kOneByteWithColHeaders) {
PerColHeader *header_data =
reinterpret_cast<PerColHeader*>(static_cast<char*>(data_) +
sizeof(GlobalHeader));
uint8 *byte_data =
reinterpret_cast<uint8*>(header_data + global_header.num_cols);
const Real *matrix_data = mat.Data();
for (int32 col = 0; col < global_header.num_cols; col++) {
CompressColumn(global_header,
matrix_data + col, mat.Stride(),
global_header.num_rows,
header_data, byte_data);
header_data++;
byte_data += global_header.num_rows;
}
} else if (format == kTwoByte) {
uint16 *data = reinterpret_cast<uint16*>(static_cast<char*>(data_) +
sizeof(GlobalHeader));
int32 num_rows = mat.NumRows(), num_cols = mat.NumCols();
for (int32 r = 0; r < num_rows; r++) {
const Real *row_data = mat.RowData(r);
for (int32 c = 0; c < num_cols; c++)
data[c] = FloatToUint16(global_header, row_data[c]);
data += num_cols;
}
} else {
KALDI_ASSERT(format == kOneByte);
uint8 *data = reinterpret_cast<uint8*>(static_cast<char*>(data_) +
sizeof(GlobalHeader));
int32 num_rows = mat.NumRows(), num_cols = mat.NumCols();
for (int32 r = 0; r < num_rows; r++) {
const Real *row_data = mat.RowData(r);
for (int32 c = 0; c < num_cols; c++)
data[c] = FloatToUint8(global_header, row_data[c]);
data += num_cols;
}
}
}
// Instantiate the template for float and double.
template
void CompressedMatrix::CopyFromMat(const MatrixBase<float> &mat,
CompressionMethod method);
template
void CompressedMatrix::CopyFromMat(const MatrixBase<double> &mat,
CompressionMethod method);
// Sub-matrix constructor. Builds a new compressed matrix of size
// (num_rows x num_cols) from the region of 'cmat' that starts at
// (row_offset, col_offset). The stored integer codes are copied directly;
// the only case that decompresses and re-compresses is the final step below,
// taken when a column-header matrix ends up with fewer than 8 rows.
//
// With allow_padding == true the requested row range may extend above or
// below 'cmat'; rows outside the source are filled by repeating its first or
// last row. The column range must always lie inside 'cmat'.
CompressedMatrix::CompressedMatrix(
    const CompressedMatrix &cmat,
    const MatrixIndexT row_offset,
    const MatrixIndexT num_rows,
    const MatrixIndexT col_offset,
    const MatrixIndexT num_cols,
    bool allow_padding): data_(NULL) {
  int32 old_num_rows = cmat.NumRows(), old_num_cols = cmat.NumCols();

  if (old_num_rows == 0) {
    KALDI_ASSERT(num_rows == 0 && num_cols == 0);
    // The empty matrix is stored as a zero pointer.
    return;
  }

  // Range checks; only the two row-range checks are relaxed by allow_padding.
  KALDI_ASSERT(row_offset < old_num_rows);
  KALDI_ASSERT(col_offset < old_num_cols);
  KALDI_ASSERT(row_offset >= 0 || allow_padding);
  KALDI_ASSERT(col_offset >= 0);
  KALDI_ASSERT(row_offset + num_rows <= old_num_rows || allow_padding);
  KALDI_ASSERT(col_offset + num_cols <= old_num_cols);

  // An empty selection is represented by data_ == NULL.
  if (num_rows == 0 || num_cols == 0) { return; }

  // True when at least one requested row lies outside the source matrix.
  bool padding_is_used = (row_offset < 0 ||
                          row_offset + num_rows > old_num_rows);

  // Start from a copy of the source header (same min_value / range / format)
  // and overwrite only the dimensions.
  GlobalHeader new_global_header;
  KALDI_COMPILE_TIME_ASSERT(sizeof(new_global_header) == 20);

  GlobalHeader *old_global_header = reinterpret_cast<GlobalHeader*>(cmat.Data());

  new_global_header = *old_global_header;
  new_global_header.num_cols = num_cols;
  new_global_header.num_rows = num_rows;

  // We don't switch format from 1 -> 2 (in case of size reduction) yet; if this
  // is needed, we will do this below by creating a temporary Matrix.
  new_global_header.format = old_global_header->format;

  data_ = AllocateData(DataSize(new_global_header));  // allocate memory
  *(reinterpret_cast<GlobalHeader*>(data_)) = new_global_header;

  DataFormat format = static_cast<DataFormat>(old_global_header->format);
  if (format == kOneByteWithColHeaders) {
    // Source layout: GlobalHeader, then old_num_cols PerColHeaders, then the
    // byte codes stored column by column (old_num_rows bytes per column).
    PerColHeader *old_per_col_header =
        reinterpret_cast<PerColHeader*>(old_global_header + 1);
    uint8 *old_byte_data =
        reinterpret_cast<uint8*>(old_per_col_header +
                                 old_global_header->num_cols);
    PerColHeader *new_per_col_header =
        reinterpret_cast<PerColHeader*>(
            reinterpret_cast<GlobalHeader*>(data_) + 1);

    // The per-column headers of the selected columns are reused unchanged.
    memcpy(new_per_col_header, old_per_col_header + col_offset,
           sizeof(PerColHeader) * num_cols);

    uint8 *new_byte_data =
        reinterpret_cast<uint8*>(new_per_col_header + num_cols);
    if (!padding_is_used) {
      // Every selected column segment is contiguous in the source, so it can
      // be copied with a single memcpy per column.
      uint8 *old_start_of_subcol =
          old_byte_data + row_offset + (col_offset * old_num_rows),
          *new_start_of_col = new_byte_data;
      for (int32 i = 0; i < num_cols; i++) {
        memcpy(new_start_of_col, old_start_of_subcol, num_rows);
        new_start_of_col += num_rows;
        old_start_of_subcol += old_num_rows;
      }
    } else {
      // With padding, copy element by element, clamping the source row index
      // into [0, old_num_rows - 1].
      uint8 *old_start_of_col =
          old_byte_data + (col_offset * old_num_rows),
          *new_start_of_col = new_byte_data;
      for (int32 i = 0; i < num_cols; i++) {
        for (int32 j = 0; j < num_rows; j++) {
          int32 old_j = j + row_offset;
          if (old_j < 0) old_j = 0;
          else if (old_j >= old_num_rows) old_j = old_num_rows - 1;
          new_start_of_col[j] = old_start_of_col[old_j];
        }
        new_start_of_col += num_rows;
        old_start_of_col += old_num_rows;
      }
    }
  } else if (format == kTwoByte) {
    // Row-major uint16 codes directly after the GlobalHeader.
    const uint16 *old_data =
        reinterpret_cast<const uint16*>(old_global_header + 1);
    uint16 *new_row_data =
        reinterpret_cast<uint16*>(reinterpret_cast<GlobalHeader*>(data_) + 1);

    for (int32 row = 0; row < num_rows; row++) {
      int32 old_row = row + row_offset;
      // The next two lines are only relevant if padding_is_used.
      if (old_row < 0) old_row = 0;
      else if (old_row >= old_num_rows) old_row = old_num_rows - 1;
      const uint16 *old_row_data =
          old_data + col_offset + (old_num_cols * old_row);
      memcpy(new_row_data, old_row_data, sizeof(uint16) * num_cols);
      new_row_data += num_cols;
    }
  } else {
    KALDI_ASSERT(format == kOneByte);
    // Row-major uint8 codes directly after the GlobalHeader.
    const uint8 *old_data =
        reinterpret_cast<const uint8*>(old_global_header + 1);
    uint8 *new_row_data =
        reinterpret_cast<uint8*>(reinterpret_cast<GlobalHeader*>(data_) + 1);

    for (int32 row = 0; row < num_rows; row++) {
      int32 old_row = row + row_offset;
      // The next two lines are only relevant if padding_is_used.
      if (old_row < 0) old_row = 0;
      else if (old_row >= old_num_rows) old_row = old_num_rows - 1;
      const uint8 *old_row_data =
          old_data + col_offset + (old_num_cols * old_row);
      memcpy(new_row_data, old_row_data, sizeof(uint8) * num_cols);
      new_row_data += num_cols;
    }
  }

  if (num_rows < 8 && format == kOneByteWithColHeaders) {
    // format was 1 but we want it to be 2 -> create a temporary
    // Matrix (uncompress), re-compress, and swap.
    // This gives us almost exact reconstruction while saving
    // memory (the elements take more space but there will be
    // no per-column headers).
    Matrix<float> temp(this->NumRows(), this->NumCols(),
                       kUndefined);
    this->CopyToMat(&temp);
    CompressedMatrix temp_cmat(temp, kTwoByteAuto);
    this->Swap(&temp_cmat);
  }
}
// Assignment from an uncompressed matrix: compresses 'mat' into *this with
// the automatic compression method (the default of CopyFromMat) and returns
// *this.
template<typename Real>
CompressedMatrix &CompressedMatrix::operator =(const MatrixBase<Real> &mat) {
  CopyFromMat(mat, kAutomaticMethod);
  return *this;
}

// Instantiate the template for float and double.
template
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<float> &mat);
template
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<double> &mat);
// Maps 'value' linearly from [min_value, min_value + range] (taken from
// 'global_header') onto the integer codes 0..65535, clamping values that
// fall outside that interval.
inline uint16 CompressedMatrix::FloatToUint16(
    const GlobalHeader &global_header,
    float value) {
  float fraction = (value - global_header.min_value) /
      global_header.range;
  // Neither clamp should trigger for data the header was computed from.
  if (fraction > 1.0) {
    fraction = 1.0;
  } else if (fraction < 0.0) {
    fraction = 0.0;
  }
  // Adding 0.499 before truncation rounds to the closest int; avoids bias.
  const int code = static_cast<int>(fraction * 65535 + 0.499);
  return code;
}
// Maps 'value' linearly from [min_value, min_value + range] (taken from
// 'global_header') onto the integer codes 0..255, clamping values that fall
// outside that interval.
inline uint8 CompressedMatrix::FloatToUint8(
    const GlobalHeader &global_header,
    float value) {
  float fraction = (value - global_header.min_value) /
      global_header.range;
  // Neither clamp should trigger for data the header was computed from.
  if (fraction > 1.0) {
    fraction = 1.0;
  } else if (fraction < 0.0) {
    fraction = 0.0;
  }
  // Adding 0.499 before truncation rounds to the closest int; avoids bias.
  const int code = static_cast<int>(fraction * 255 + 0.499);
  return code;
}
inline float CompressedMatrix::Uint16ToFloat(
const GlobalHeader &global_header,
uint16 value) {
// the constant 1.52590218966964e-05 is 1/65535.
return global_header.min_value
+ global_header.range * 1.52590218966964e-05F * value;
}
// Computes the per-column header for one column of the matrix being
// compressed. 'data' points at the first element of the column and
// consecutive elements are 'stride' apart; 'num_rows' must be positive.
// The header stores four uint16 codes (produced with FloatToUint16 and
// 'global_header') for the column's minimum, the elements at one quarter and
// three quarters of the sorted order, and the maximum. The codes are forced
// to be strictly increasing.
template<typename Real>  // static
void CompressedMatrix::ComputeColHeader(
    const GlobalHeader &global_header,
    const Real *data, MatrixIndexT stride,
    int32 num_rows, CompressedMatrix::PerColHeader *header) {
  KALDI_ASSERT(num_rows > 0);
  std::vector<Real> sdata(num_rows);  // the sorted data.
  // Gather the strided column into a contiguous buffer.
  for (size_t i = 0, size = sdata.size(); i < size; i++)
    sdata[i] = data[i*stride];

  if (num_rows >= 5) {
    int quarter_nr = num_rows/4;
    // A full std::sort(sdata.begin(), sdata.end()) is not needed: only
    // the elements at positions 0, quarter_nr,
    // 3*quarter_nr, and num_rows-1 need to be in sorted order.
    std::nth_element(sdata.begin(), sdata.begin() + quarter_nr, sdata.end());
    // Now, sdata.begin() + quarter_nr contains the element that would appear
    // in sorted order, in that position.
    std::nth_element(sdata.begin(), sdata.begin(), sdata.begin() + quarter_nr);
    // Now, sdata.begin() and sdata.begin() + quarter_nr contain the elements
    // that would appear at those positions in sorted order.
    std::nth_element(sdata.begin() + quarter_nr + 1,
                     sdata.begin() + (3*quarter_nr), sdata.end());
    // Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() +
    // 3*quarter_nr, contain the elements that would appear at those positions
    // in sorted order.
    std::nth_element(sdata.begin() + (3*quarter_nr) + 1, sdata.end() - 1,
                     sdata.end());
    // Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() +
    // 3*quarter_nr, and sdata.end() - 1, contain the elements that would appear
    // at those positions in sorted order.

    // Each code is raised to at least (previous code + 1); the caps 65532,
    // 65533 and 65534 leave room for the codes that follow.
    header->percentile_0 =
        std::min<uint16>(FloatToUint16(global_header, sdata[0]), 65532);
    header->percentile_25 =
        std::min<uint16>(
            std::max<uint16>(
                FloatToUint16(global_header, sdata[quarter_nr]),
                header->percentile_0 + static_cast<uint16>(1)), 65533);
    header->percentile_75 =
        std::min<uint16>(
            std::max<uint16>(
                FloatToUint16(global_header, sdata[3*quarter_nr]),
                header->percentile_25 + static_cast<uint16>(1)), 65534);
    header->percentile_100 = std::max<uint16>(
        FloatToUint16(global_header, sdata[num_rows-1]),
        header->percentile_75 + static_cast<uint16>(1));

  } else {  // handle this pathological case.
    // Fewer than 5 rows: sort fully and use sdata[0..3] where they exist;
    // a missing entry is replaced by (previous code + 1).
    std::sort(sdata.begin(), sdata.end());
    // Note: we know num_rows is at least 1.
    header->percentile_0 =
        std::min<uint16>(FloatToUint16(global_header, sdata[0]),
                         65532);
    if (num_rows > 1)
      header->percentile_25 =
          std::min<uint16>(
              std::max<uint16>(FloatToUint16(global_header, sdata[1]),
                               header->percentile_0 + 1), 65533);
    else
      header->percentile_25 = header->percentile_0 + 1;
    if (num_rows > 2)
      header->percentile_75 =
          std::min<uint16>(
              std::max<uint16>(FloatToUint16(global_header, sdata[2]),
                               header->percentile_25 + 1), 65534);
    else
      header->percentile_75 = header->percentile_25 + 1;
    if (num_rows > 3)
      header->percentile_100 =
          std::max<uint16>(FloatToUint16(global_header, sdata[3]),
                           header->percentile_75 + 1);
    else
      header->percentile_100 = header->percentile_75 + 1;
  }
}
// static
// Encodes 'value' as a single byte using a piecewise-linear mapping defined
// by the four column percentiles:
//   [p0,  p25)  -> codes   0 ..  64
//   [p25, p75)  -> codes  64 .. 192
//   [p75, p100] -> codes 192 .. 255
// Within each segment the result is rounded to the closest code and then
// clamped to that segment's code range.
inline uint8 CompressedMatrix::FloatToChar(
    float p0, float p25, float p75, float p100,
    float value) {
  if (value < p25) {
    float f = (value - p0) / (p25 - p0);
    int code = static_cast<int>(f * 64 + 0.5);
    // The clamping is necessary in pathological cases when all the elements
    // in a row are the same and the percentile_* values are separated by one.
    return static_cast<uint8>(std::min(64, std::max(0, code)));
  }
  if (value < p75) {
    float f = (value - p25) / (p75 - p25);
    int code = 64 + static_cast<int>(f * 128 + 0.5);
    return static_cast<uint8>(std::min(192, std::max(64, code)));
  }
  // Note: this last range has fewer codes than the first one, because we go
  // up to 255, not 256.
  float f = (value - p75) / (p100 - p75);
  int code = 192 + static_cast<int>(f * 63 + 0.5);
  return static_cast<uint8>(std::min(255, std::max(192, code)));
}
// static
// Decodes a byte produced by FloatToChar back to a float, interpolating
// linearly inside the segment the code belongs to: codes 0..64 span
// [p0, p25], codes 65..192 span (p25, p75] and codes 193..255 span
// (p75, p100].
inline float CompressedMatrix::CharToFloat(
    float p0, float p25, float p75, float p100,
    uint8 value) {
  if (value <= 64)
    return p0 + (p25 - p0) * value * (1/64.0);
  if (value <= 192)
    return p25 + (p75 - p25) * (value - 64) * (1/128.0);
  return p75 + (p100 - p75) * (value - 192) * (1/63.0);
}
template<typename Real> // static
void CompressedMatrix::CompressColumn(
const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
int32 num_rows, CompressedMatrix::PerColHeader *header,
uint8 *byte_data) {
ComputeColHeader(global_header, data, stride,
num_rows, header);
float p0 = Uint16ToFloat(global_header, header->percentile_0),
p25 = Uint16ToFloat(global_header, header->percentile_25),
p75 = Uint16ToFloat(global_header, header->percentile_75),
p100 = Uint16ToFloat(global_header, header->percentile_100);
for (int32 i = 0; i < num_rows; i++) {
Real this_data = data[i * stride];
byte_data[i] = FloatToChar(p0, p25, p75, p100, this_data);
}
}
// static
// Allocates the raw buffer for a compressed matrix of 'num_bytes' bytes.
// The buffer is created with new float[], so it must be released with
// delete [] on a float*.
// NOTE(review): (num_bytes/3) + 4 floats is more than rounding num_bytes up
// to a whole number of floats would need; the expression is kept as it was.
void* CompressedMatrix::AllocateData(int32 num_bytes) {
  KALDI_ASSERT(num_bytes > 0);
  KALDI_COMPILE_TIME_ASSERT(sizeof(float) == 4);
  float *buffer = new float[(num_bytes/3) + 4];
  return reinterpret_cast<void*>(buffer);
}
void CompressedMatrix::Write(std::ostream &os, bool binary) const {
if (binary) { // Binary-mode write:
if (data_ != NULL) {
GlobalHeader &h = *reinterpret_cast<GlobalHeader*>(data_);
DataFormat format = static_cast<DataFormat>(h.format);
if (format == kOneByteWithColHeaders) {
WriteToken(os, binary, "CM");
} else if (format == kTwoByte) {
WriteToken(os, binary, "CM2");
} else if (format == kOneByte) {
WriteToken(os, binary, "CM3");
}
MatrixIndexT size = DataSize(h); // total size of data in data_
// We don't write out the "int32 format", hence the + 4, - 4.
os.write(reinterpret_cast<const char*>(data_) + 4, size - 4);
} else { // special case: where data_ == NULL, we treat it as an empty
// matrix.
WriteToken(os, binary, "CM");
GlobalHeader h;
h.range = h.min_value = 0.0;
h.num_rows = h.num_cols = 0;
os.write(reinterpret_cast<const char*>(&h), sizeof(h));
}
} else {
// In text mode, just use the same format as a regular matrix.
// This is not compressed.
Matrix<BaseFloat> temp_mat(this->NumRows(), this->NumCols(),
kUndefined);
this->CopyToMat(&temp_mat);
temp_mat.Write(os, binary);
}
if (os.fail())
KALDI_ERR << "Error writing compressed matrix to stream.";
}
void CompressedMatrix::Read(std::istream &is, bool binary) {
if (data_ != NULL) {
delete [] (static_cast<float*>(data_));
data_ = NULL;
}
if (binary) {
int peekval = Peek(is, binary);
if (peekval == 'C') {
std::string tok; // Should be CM (format 1) or CM2 (format 2)
ReadToken(is, binary, &tok);
GlobalHeader h;
if (tok == "CM") { h.format = 1; } // kOneByteWithColHeaders
else if (tok == "CM2") { h.format = 2; } // kTwoByte
else if (tok == "CM3") { h.format = 3; } // kOneByte
else {
KALDI_ERR << "Unexpected token " << tok << ", expecting CM, CM2 or CM3";
}
// don't read the "format" -> hence + 4, - 4.
is.read(reinterpret_cast<char*>(&h) + 4, sizeof(h) - 4);
if (is.fail())
KALDI_ERR << "Failed to read header";
if (h.num_cols == 0) // empty matrix.
return;
int32 size = DataSize(h), remaining_size = size - sizeof(GlobalHeader);
data_ = AllocateData(size);
*(reinterpret_cast<GlobalHeader*>(data_)) = h;
is.read(reinterpret_cast<char*>(data_) + sizeof(GlobalHeader),
remaining_size);
} else {
// Assume that what we're reading is a regular Matrix. This might be the
// case if you changed your code, making a Matrix into a CompressedMatrix,
// and you want back-compatibility for reading.
Matrix<BaseFloat> M;
M.Read(is, binary); // This will crash if it was not a Matrix.
this->CopyFromMat(M);
}
} else { // Text-mode read. In this case you don't get to
// choose the compression type. Anyway this branch would only
// be taken when debugging.
Matrix<BaseFloat> temp;
temp.Read(is, binary);
this->CopyFromMat(temp);
}
if (is.fail())
KALDI_ERR << "Failed to read data.";
}
// Decompresses *this into '*mat'.
//
// With trans == kNoTrans, 'mat' must already have the same dimensions as
// *this. With trans == kTrans, 'mat' receives the transpose; this path goes
// through a temporary uncompressed matrix.
//
// An empty compressed matrix (data_ == NULL) requires 'mat' to be empty too.
template<typename Real>
void CompressedMatrix::CopyToMat(MatrixBase<Real> *mat,
                                 MatrixTransposeType trans) const {
  if (trans == kTrans) {
    // The temporary must have the dimensions of *this (rows x cols), because
    // the kNoTrans path below asserts exactly that; the transposition itself
    // happens in the final CopyFromMat. kUndefined is sufficient since every
    // element is overwritten by the decompression.
    Matrix<Real> temp(this->NumRows(), this->NumCols(), kUndefined);
    CopyToMat(&temp, kNoTrans);
    mat->CopyFromMat(temp, kTrans);
    return;
  }

  if (data_ == NULL) {
    KALDI_ASSERT(mat->NumRows() == 0);
    KALDI_ASSERT(mat->NumCols() == 0);
    return;
  }
  GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
  int32 num_cols = h->num_cols, num_rows = h->num_rows;
  KALDI_ASSERT(mat->NumRows() == num_rows);
  KALDI_ASSERT(mat->NumCols() == num_cols);

  DataFormat format = static_cast<DataFormat>(h->format);
  if (format == kOneByteWithColHeaders) {
    // Per-column headers come first, then the byte codes column by column.
    PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
    uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
                                                h->num_cols);
    for (int32 i = 0; i < num_cols; i++, per_col_header++) {
      float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
          p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
          p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
          p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
      for (int32 j = 0; j < num_rows; j++, byte_data++) {
        float f = CharToFloat(p0, p25, p75, p100, *byte_data);
        (*mat)(j, i) = f;
      }
    }
  } else if (format == kTwoByte) {
    // Row-major uint16 codes on a uniform grid starting at min_value.
    const uint16 *data = reinterpret_cast<const uint16*>(h + 1);
    float min_value = h->min_value,
        increment = h->range * (1.0 / 65535.0);
    for (int32 i = 0; i < num_rows; i++) {
      Real *row_data = mat->RowData(i);
      for (int32 j = 0; j < num_cols; j++)
        row_data[j] = min_value + data[j] * increment;
      data += num_cols;
    }
  } else {
    KALDI_ASSERT(format == kOneByte);
    // Row-major uint8 codes on a uniform grid starting at min_value.
    float min_value = h->min_value, increment = h->range * (1.0 / 255.0);

    const uint8 *data = reinterpret_cast<const uint8*>(h + 1);
    for (int32 i = 0; i < num_rows; i++) {
      Real *row_data = mat->RowData(i);
      for (int32 j = 0; j < num_cols; j++)
        row_data[j] = min_value + data[j] * increment;
      data += num_cols;
    }
  }
}

// Instantiate the template for float and double.
template
void CompressedMatrix::CopyToMat(MatrixBase<float> *mat,
                                 MatrixTransposeType trans) const;
template
void CompressedMatrix::CopyToMat(MatrixBase<double> *mat,
                                 MatrixTransposeType trans) const;
// Decompresses row 'row' of the matrix into '*v', whose dimension must equal
// the number of columns.
template<typename Real>
void CompressedMatrix::CopyRowToVec(MatrixIndexT row,
                                    VectorBase<Real> *v) const {
  KALDI_ASSERT(row < this->NumRows());
  KALDI_ASSERT(row >= 0);
  KALDI_ASSERT(v->Dim() == this->NumCols());

  GlobalHeader *global = reinterpret_cast<GlobalHeader*>(data_);
  const DataFormat format = static_cast<DataFormat>(global->format);

  if (format == kOneByteWithColHeaders) {
    PerColHeader *col_header = reinterpret_cast<PerColHeader*>(global+1);
    uint8 *code = reinterpret_cast<uint8*>(col_header +
                                           global->num_cols);
    // Codes are stored column by column, so the wanted row is element 'row'
    // of every column: start there and step by one column length each time.
    code += row;
    for (int32 col = 0; col < global->num_cols; col++) {
      float p0 = Uint16ToFloat(*global, col_header->percentile_0),
          p25 = Uint16ToFloat(*global, col_header->percentile_25),
          p75 = Uint16ToFloat(*global, col_header->percentile_75),
          p100 = Uint16ToFloat(*global, col_header->percentile_100);
      float decoded = CharToFloat(p0, p25, p75, p100, *code);
      (*v)(col) = decoded;
      col_header++;
      code += global->num_rows;
    }
  } else if (format == kTwoByte) {
    // Row-major uint16 codes: the row is contiguous.
    int32 num_cols = global->num_cols;
    float min_value = global->min_value,
        increment = global->range * (1.0 / 65535.0);
    const uint16 *codes =
        reinterpret_cast<uint16*>(global + 1) + (num_cols * row);
    Real *out = v->Data();
    for (int32 c = 0; c < num_cols; c++)
      out[c] = min_value + codes[c] * increment;
  } else {
    KALDI_ASSERT(format == kOneByte);
    // Row-major uint8 codes: the row is contiguous.
    int32 num_cols = global->num_cols;
    float min_value = global->min_value,
        increment = global->range * (1.0 / 255.0);
    const uint8 *codes =
        reinterpret_cast<uint8*>(global + 1) + (num_cols * row);
    Real *out = v->Data();
    for (int32 c = 0; c < num_cols; c++)
      out[c] = min_value + codes[c] * increment;
  }
}
// Decompresses column 'col' of the matrix into '*v', whose dimension must
// equal the number of rows.
template<typename Real>
void CompressedMatrix::CopyColToVec(MatrixIndexT col,
                                    VectorBase<Real> *v) const {
  KALDI_ASSERT(col < this->NumCols());
  KALDI_ASSERT(col >= 0);
  KALDI_ASSERT(v->Dim() == this->NumRows());

  GlobalHeader *global = reinterpret_cast<GlobalHeader*>(data_);
  const DataFormat format = static_cast<DataFormat>(global->format);

  if (format == kOneByteWithColHeaders) {
    PerColHeader *col_header = reinterpret_cast<PerColHeader*>(global+1);
    uint8 *code = reinterpret_cast<uint8*>(col_header +
                                           global->num_cols);
    // Codes are stored column by column, so the wanted column is one
    // contiguous run of num_rows bytes with its own header.
    code += col*global->num_rows;
    col_header += col;
    float p0 = Uint16ToFloat(*global, col_header->percentile_0),
        p25 = Uint16ToFloat(*global, col_header->percentile_25),
        p75 = Uint16ToFloat(*global, col_header->percentile_75),
        p100 = Uint16ToFloat(*global, col_header->percentile_100);
    for (int32 r = 0; r < global->num_rows; r++, code++) {
      float decoded = CharToFloat(p0, p25, p75, p100, *code);
      (*v)(r) = decoded;
    }
  } else if (format == kTwoByte) {
    // Row-major uint16 codes: column elements are num_cols apart.
    int32 num_rows = global->num_rows, num_cols = global->num_cols;
    float min_value = global->min_value,
        increment = global->range * (1.0 / 65535.0);
    const uint16 *codes = reinterpret_cast<uint16*>(global + 1) + col;
    Real *out = v->Data();
    for (int32 r = 0; r < num_rows; r++)
      out[r] = min_value + increment * codes[r * num_cols];
  } else {
    KALDI_ASSERT(format == kOneByte);
    // Row-major uint8 codes: column elements are num_cols apart.
    int32 num_rows = global->num_rows, num_cols = global->num_cols;
    float min_value = global->min_value,
        increment = global->range * (1.0 / 255.0);
    const uint8 *codes = reinterpret_cast<uint8*>(global + 1) + col;
    Real *out = v->Data();
    for (int32 r = 0; r < num_rows; r++)
      out[r] = min_value + increment * codes[r * num_cols];
  }
}

// instantiate the templates.
template void
CompressedMatrix::CopyColToVec(MatrixIndexT, VectorBase<double> *) const;
template void
CompressedMatrix::CopyColToVec(MatrixIndexT, VectorBase<float> *) const;
template void
CompressedMatrix::CopyRowToVec(MatrixIndexT, VectorBase<double> *) const;
template void
CompressedMatrix::CopyRowToVec(MatrixIndexT, VectorBase<float> *) const;
// Decompresses a sub-block of *this into '*dest'. The block starts at
// (row_offset, col_offset) and its size is given by the dimensions of 'dest';
// it must lie entirely inside *this.
//
// If *this is empty (data_ == NULL) the checks below can only pass for an
// empty 'dest', and there is nothing to copy.
template<typename Real>
void CompressedMatrix::CopyToMat(int32 row_offset,
                                 int32 col_offset,
                                 MatrixBase<Real> *dest) const {
  KALDI_PARANOID_ASSERT(row_offset < this->NumRows());
  KALDI_PARANOID_ASSERT(col_offset < this->NumCols());
  KALDI_PARANOID_ASSERT(row_offset >= 0);
  KALDI_PARANOID_ASSERT(col_offset >= 0);
  KALDI_ASSERT(row_offset+dest->NumRows() <= this->NumRows());
  KALDI_ASSERT(col_offset+dest->NumCols() <= this->NumCols());
  // Guard against dereferencing a NULL buffer, mirroring the handling of the
  // empty matrix in CopyToMat(MatrixBase<Real>*, MatrixTransposeType).
  if (data_ == NULL) return;
  // everything is OK
  GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
  int32 num_rows = h->num_rows, num_cols = h->num_cols,
      tgt_cols = dest->NumCols(), tgt_rows = dest->NumRows();

  DataFormat format = static_cast<DataFormat>(h->format);
  if (format == kOneByteWithColHeaders) {
    // Byte codes are stored column by column (num_rows bytes per column),
    // after the array of per-column headers.
    PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
    uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
                                                h->num_cols);

    uint8 *start_of_subcol = byte_data+row_offset;  // skip appropriate
                                                    // number of rows
    start_of_subcol += col_offset*num_rows;  // skip appropriate number of
                                             // columns

    per_col_header += col_offset;  // skip the appropriate number of headers

    for (int32 i = 0;
         i < tgt_cols;
         i++, per_col_header++, start_of_subcol+=num_rows) {
      byte_data = start_of_subcol;
      float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
          p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
          p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
          p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
      for (int32 j = 0; j < tgt_rows; j++, byte_data++) {
        float f = CharToFloat(p0, p25, p75, p100, *byte_data);
        (*dest)(j, i) = f;
      }
    }
  } else if (format == kTwoByte) {
    // Row-major uint16 codes; 'data' starts at the block's first element and
    // advances by a full source row (num_cols) per destination row.
    const uint16 *data = reinterpret_cast<const uint16*>(h+1) + col_offset +
        (num_cols * row_offset);
    float min_value = h->min_value,
        increment = h->range * (1.0 / 65535.0);

    for (int32 row = 0; row < tgt_rows; row++) {
      Real *dest_row = dest->RowData(row);
      for (int32 col = 0; col < tgt_cols; col++)
        dest_row[col] = min_value + increment * data[col];
      data += num_cols;
    }
  } else {
    KALDI_ASSERT(format == kOneByte);
    // Row-major uint8 codes; same traversal as the two-byte case.
    const uint8 *data = reinterpret_cast<const uint8*>(h+1) + col_offset +
        (num_cols * row_offset);
    float min_value = h->min_value,
        increment = h->range * (1.0 / 255.0);
    for (int32 row = 0; row < tgt_rows; row++) {
      Real *dest_row = dest->RowData(row);
      for (int32 col = 0; col < tgt_cols; col++)
        dest_row[col] = min_value + increment * data[col];
      data += num_cols;
    }
  }
}

// instantiate the templates.
template void CompressedMatrix::CopyToMat(int32,
                                          int32,
                                          MatrixBase<float> *dest) const;
template void CompressedMatrix::CopyToMat(int32,
                                          int32,
                                          MatrixBase<double> *dest) const;
// Releases the underlying buffer, if any, leaving the matrix empty
// (data_ == NULL). The buffer is deleted as float[] to match the cast used
// by the other delete [] sites in this file.
void CompressedMatrix::Clear() {
  if (data_ == NULL) return;
  float *buffer = static_cast<float*>(data_);
  delete [] buffer;
  data_ = NULL;
}
// Copy constructor: starts out empty and then delegates to the
// CompressedMatrix assignment operator to copy 'mat'.
CompressedMatrix::CompressedMatrix(const CompressedMatrix &mat): data_(NULL) {
  this->operator=(mat);
}
// Copy assignment: makes *this a deep copy of 'mat' (its own buffer holding
// the same bytes). Assigning an empty matrix leaves *this empty.
// Self-assignment is a no-op.
CompressedMatrix &CompressedMatrix::operator = (const CompressedMatrix &mat) {
  // Without this check, Clear() below would free the very buffer we are
  // about to copy from, and a self-assignment would silently empty the
  // matrix.
  if (this == &mat) return *this;
  Clear();  // now this->data_ == NULL.
  if (mat.data_ != NULL) {
    MatrixIndexT data_size = DataSize(*static_cast<GlobalHeader*>(mat.data_));
    data_ = AllocateData(data_size);
    memcpy(static_cast<void*>(data_),
           static_cast<void*>(mat.data_),
           data_size);
  }
  return *this;
}
} // namespace kaldi

@ -1,283 +0,0 @@
// matrix/compressed-matrix.h
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Frantisek Skala, Wei Shi
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_COMPRESSED_MATRIX_H_
#define KALDI_MATRIX_COMPRESSED_MATRIX_H_ 1
#include "matrix/kaldi-matrix.h"
namespace kaldi {
/// \addtogroup matrix_group
/// @{
/*
The enum CompressionMethod is used when creating a CompressedMatrix (a lossily
compressed matrix) from a regular Matrix. It dictates how we choose the
compressed format and how we choose the ranges of floats that are represented
by particular integers.
kAutomaticMethod = 1 This is the default when you don't specify the
compression method. It is a shorthand for using
kSpeechFeature if the num-rows is more than 8, and
kTwoByteAuto otherwise.
kSpeechFeature = 2 This is the most complicated of the compression methods,
and was designed for speech features which have a roughly
Gaussian distribution with different ranges for each
dimension. Each element is stored in one byte, but there
is an 8-byte header per column; the spacing of the
integer values is not uniform but is in 3 ranges.
kTwoByteAuto = 3 Each element is stored in two bytes as a uint16, with
the representable range of values chosen automatically
with the minimum and maximum elements of the matrix as
its edges.
kTwoByteSignedInteger = 4
Each element is stored in two bytes as a uint16, with
the representable range of value chosen to coincide with
what you'd get if you stored signed integers, i.e.
[-32768.0, 32767.0]. Suitable for waveform data that
was previously stored as 16-bit PCM.
kOneByteAuto = 5 Each element is stored in one byte as a uint8, with the
representable range of values chosen automatically with
the minimum and maximum elements of the matrix as its
edges.
kOneByteUnsignedInteger = 6 Each element is stored in
one byte as a uint8, with the representable range of
values equal to [0.0, 255.0].
kOneByteZeroOne = 7 Each element is stored in
one byte as a uint8, with the representable range of
values equal to [0.0, 1.0]. Suitable for image data
that has previously been compressed as int8.
// We can add new methods here as needed: if they just imply different ways
// of selecting the min_value and range, and a num-bytes = 1 or 2, they will
// be trivial to implement.
*/
// Selects how a regular Matrix is turned into a CompressedMatrix: it
// determines the compressed storage format and how the range of floats
// represented by the integer codes is chosen.
enum CompressionMethod {
  kAutomaticMethod = 1,         // kSpeechFeature if num-rows > 8, else kTwoByteAuto.
  kSpeechFeature = 2,           // one byte per element plus a per-column header.
  kTwoByteAuto = 3,             // uint16 codes; range taken from the matrix min/max.
  kTwoByteSignedInteger = 4,    // uint16 codes; range [-32768.0, 32767.0].
  kOneByteAuto = 5,             // uint8 codes; range taken from the matrix min/max.
  kOneByteUnsignedInteger = 6,  // uint8 codes; range [0.0, 255.0].
  kOneByteZeroOne = 7           // uint8 codes; range [0.0, 1.0].
};
/*
This class does lossy compression of a matrix. It supports various compression
methods, see enum CompressionMethod.
The whole object state is the single pointer data_, which is NULL for an empty
matrix; see the comment on data_ at the end of the class for the layout.
*/
class CompressedMatrix {
public:
/// Constructs an empty matrix (data_ is NULL).
CompressedMatrix(): data_(NULL) { }
/// Calls Clear().
~CompressedMatrix() { Clear(); }
/// Constructs from an uncompressed matrix by calling CopyFromMat(mat, method).
template<typename Real>
explicit CompressedMatrix(const MatrixBase<Real> &mat,
CompressionMethod method = kAutomaticMethod):
data_(NULL) { CopyFromMat(mat, method); }
/// Initializer that can be used to select part of an existing
/// CompressedMatrix without un-compressing and re-compressing (note: unlike
/// similar initializers for class Matrix, it doesn't point to the same memory
/// location).
///
/// This creates a CompressedMatrix with the size (num_rows, num_cols)
/// starting at (row_offset, col_offset).
///
/// If you specify allow_padding = true,
/// it is permitted to have row_offset < 0 and
/// row_offset + num_rows > mat.NumRows(), and the result will contain
/// repeats of the first and last rows of 'mat' as necessary.
CompressedMatrix(const CompressedMatrix &mat,
const MatrixIndexT row_offset,
const MatrixIndexT num_rows,
const MatrixIndexT col_offset,
const MatrixIndexT num_cols,
bool allow_padding = false);
/// Returns the raw data pointer (NULL for an empty matrix).
void *Data() const { return this->data_; }
/// This will resize *this and copy the contents of mat to *this.
template<typename Real>
void CopyFromMat(const MatrixBase<Real> &mat,
CompressionMethod method = kAutomaticMethod);
/// Copy constructor.
CompressedMatrix(const CompressedMatrix &mat);
CompressedMatrix &operator = (const CompressedMatrix &mat); // assignment operator.
template<typename Real>
CompressedMatrix &operator = (const MatrixBase<Real> &mat); // assignment operator.
/// Copies contents to matrix. Note: mat must have the correct size.
/// The kTrans case uses a temporary.
template<typename Real>
void CopyToMat(MatrixBase<Real> *mat,
MatrixTransposeType trans = kNoTrans) const;
/// Serialization to and from a stream; 'binary' selects binary vs. text mode.
void Write(std::ostream &os, bool binary) const;
void Read(std::istream &is, bool binary);
/// Returns number of rows (or zero for empty matrix).
/// The value is read from the GlobalHeader at the start of data_.
inline MatrixIndexT NumRows() const { return (data_ == NULL) ? 0 :
(*reinterpret_cast<GlobalHeader*>(data_)).num_rows; }
/// Returns number of columns (or zero for empty matrix).
/// The value is read from the GlobalHeader at the start of data_.
inline MatrixIndexT NumCols() const { return (data_ == NULL) ? 0 :
(*reinterpret_cast<GlobalHeader*>(data_)).num_cols; }
/// Copies row #row of the matrix into vector v.
/// Note: v must have same size as #cols.
template<typename Real>
void CopyRowToVec(MatrixIndexT row, VectorBase<Real> *v) const;
/// Copies column #col of the matrix into vector v.
/// Note: v must have same size as #rows.
template<typename Real>
void CopyColToVec(MatrixIndexT col, VectorBase<Real> *v) const;
/// Copies submatrix of compressed matrix into matrix dest.
/// Submatrix starts at row row_offset and column column_offset and its size
/// is defined by size of provided matrix dest
template<typename Real>
void CopyToMat(int32 row_offset,
int32 column_offset,
MatrixBase<Real> *dest) const;
/// Exchanges the data pointers of *this and *other; no data is copied.
void Swap(CompressedMatrix *other) { std::swap(data_, other->data_); }
/// Makes the matrix empty.
/// NOTE(review): presumably frees data_ and resets it to NULL -- the
/// definition is not in this header.
void Clear();
/// scales all elements of matrix by alpha.
/// It scales the floating point values in GlobalHeader by alpha.
void Scale(float alpha);
// Matrix<float> and Matrix<double> are given access to the private members.
friend class Matrix<float>;
friend class Matrix<double>;
private:
// This enum describes the different compressed-data formats: these are
// distinct from the compression methods although all of the methods apart
// from kAutomaticMethod dictate a particular compressed-data format.
//
// kOneByteWithColHeaders means there is a GlobalHeader and each
// column has a PerColHeader; the actual data is stored in
// one byte per element, in column-major order (the mapping
// from integers to floats is a little complicated).
// kTwoByte means there is a global header but no PerColHeader;
// the actual data is stored in two bytes per element in
// row-major order; it's decompressed as:
// uint16 i; GlobalHeader g;
// float f = g.min_value + i * (g.range / 65535.0)
// kOneByte means there is a global header but no PerColHeader;
// the data is stored in one byte per element in row-major
// order and is decompressed as:
// uint8 i; GlobalHeader g;
// float f = g.min_value + i * (g.range / 255.0)
enum DataFormat {
kOneByteWithColHeaders = 1,
kTwoByte = 2,
kOneByte = 3
};
// allocates data using new [], ensures byte alignment
// sufficient for float.
static void *AllocateData(int32 num_bytes);
// Header stored at the very start of data_; NumRows() and NumCols() read
// their values directly from it.
struct GlobalHeader {
int32 format; // Represents the enum DataFormat.
float min_value; // min_value and range represent the ranges of the integer
// data in the kTwoByte and kOneByte formats, and the
// range of the PerColHeader uint16's in the
// kOneByteWithColHeaders format.
float range;
int32 num_rows;
int32 num_cols;
};
// This function computes the global header for compressing this data.
template<typename Real>
static inline void ComputeGlobalHeader(const MatrixBase<Real> &mat,
CompressionMethod method,
GlobalHeader *header);
// The number of bytes we need to request when allocating 'data_'.
static MatrixIndexT DataSize(const GlobalHeader &header);
// This struct is only used in format kOneByteWithColHeaders.
// NOTE(review): going by the field names, these are the 0th, 25th, 75th and
// 100th percentiles of a column, each stored as a uint16 (see FloatToUint16 /
// Uint16ToFloat) -- confirm against the implementation.
struct PerColHeader {
uint16 percentile_0;
uint16 percentile_25;
uint16 percentile_75;
uint16 percentile_100;
};
// Compresses one column of 'num_rows' elements read from 'data' with the
// given stride, writing the column header to *header and the bytes to
// byte_data.
// NOTE(review): description inferred from the signature; the definition is
// not in this header.
template<typename Real>
static void CompressColumn(const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
int32 num_rows, PerColHeader *header,
uint8 *byte_data);
// Computes the PerColHeader for one column.
// NOTE(review): description inferred from the name and signature.
template<typename Real>
static void ComputeColHeader(const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
int32 num_rows, PerColHeader *header);
// Conversions between floats and their integer codes, relative to the
// min_value/range stored in the global header (see the DataFormat comment).
static inline uint16 FloatToUint16(const GlobalHeader &global_header,
float value);
// this is used only in the kOneByte compression format.
static inline uint8 FloatToUint8(const GlobalHeader &global_header,
float value);
static inline float Uint16ToFloat(const GlobalHeader &global_header,
uint16 value);
// this is used only in the kOneByteWithColHeaders compression format.
static inline uint8 FloatToChar(float p0, float p25,
float p75, float p100,
float value);
// this is used only in the kOneByteWithColHeaders compression format.
static inline float CharToFloat(float p0, float p25,
float p75, float p100,
uint8 value);
void *data_; // first GlobalHeader, then PerColHeader (repeated), then
// the byte data for each column (repeated). Note: don't intersperse
// the byte data with the PerColHeaders, because of alignment issues.
};
/// @} end of \addtogroup matrix_group
} // namespace kaldi
#endif // KALDI_MATRIX_COMPRESSED_MATRIX_H_

@ -1,924 +0,0 @@
// matrix/jama-eig.h
// Copyright 2009-2011 Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
// This file consists of a port and modification of materials from
// JAMA: A Java Matrix Package
// under the following notice: This software is a cooperative product of
// The MathWorks and the National Institute of Standards and Technology (NIST)
// which has been released to the public. This notice and the original code are
// available at http://math.nist.gov/javanumerics/jama/domain.notice
#ifndef KALDI_MATRIX_JAMA_EIG_H_
#define KALDI_MATRIX_JAMA_EIG_H_ 1
#include "matrix/kaldi-matrix.h"
namespace kaldi {
// This class is not to be used externally. See the Eig function in the Matrix
// class in kaldi-matrix.h. This is the external interface.
template<typename Real> class EigenvalueDecomposition {
  // This class is based on the EigenvalueDecomposition class from the JAMA
  // library (version 1.0.2).
  //
  // The object owns raw heap arrays (d_, e_, V_, and for non-symmetric input
  // also H_ and ort_) that the destructor releases with delete[].  Copying is
  // therefore disabled (see the private section): a member-wise copy would
  // make two objects delete the same arrays.
 public:
  // Runs the whole decomposition of the square matrix A.  The symmetric and
  // non-symmetric cases use different algorithms (Tred2 + Tql2 versus
  // Orthes + Hqr2).
  EigenvalueDecomposition(const MatrixBase<Real> &A);

  ~EigenvalueDecomposition();  // free memory.

  // Copies the n_ by n_ eigenvector matrix into *V_out, which must already
  // have that size.  V is what we call P externally.
  void GetV(MatrixBase<Real> *V_out) {
    KALDI_ASSERT(V_out->NumRows() == static_cast<MatrixIndexT>(n_)
                 && V_out->NumCols() == static_cast<MatrixIndexT>(n_));
    for (int i = 0; i < n_; i++)
      for (int j = 0; j < n_; j++)
        (*V_out)(i, j) = V(i, j);  // V(i, j) is member function.
  }

  // Copies the real parts of the eigenvalues into *r_out (dimension n_).
  void GetRealEigenvalues(VectorBase<Real> *r_out) {
    KALDI_ASSERT(r_out->Dim() == static_cast<MatrixIndexT>(n_));
    for (int i = 0; i < n_; i++)
      (*r_out)(i) = d_[i];
  }

  // Copies the imaginary parts of the eigenvalues into *i_out (dimension n_).
  void GetImagEigenvalues(VectorBase<Real> *i_out) {
    KALDI_ASSERT(i_out->Dim() == static_cast<MatrixIndexT>(n_));
    for (int i = 0; i < n_; i++)
      (*i_out)(i) = e_[i];
  }

 private:
  // Not copyable: declared but intentionally left undefined, so that an
  // accidental copy fails at compile time (outside the class) or link time
  // (inside it) instead of double-deleting the owned arrays.
  EigenvalueDecomposition(const EigenvalueDecomposition<Real> &);
  EigenvalueDecomposition<Real> &operator=(const EigenvalueDecomposition<Real> &);

  // Row-major element access into the n_ by n_ arrays H_ and V_.
  inline Real &H(int r, int c) { return H_[r*n_ + c]; }
  inline Real &V(int r, int c) { return V_[r*n_ + c]; }

  // complex division: (*cdivr + i * *cdivi) = (xr + i*xi) / (yr + i*yi).
  // The branch on |yr| > |yi| divides by the larger component, which keeps
  // the intermediate ratio r at most 1 in magnitude.
  inline static void cdiv(Real xr, Real xi, Real yr, Real yi,
                          Real *cdivr, Real *cdivi) {
    Real r, d;
    if (std::abs(yr) > std::abs(yi)) {
      r = yi/yr;
      d = yr + r*yi;
      *cdivr = (xr + r*xi)/d;
      *cdivi = (xi - r*xr)/d;
    } else {
      r = yr/yi;
      d = yi + r*yr;
      *cdivr = (r*xr + xi)/d;
      *cdivi = (r*xi - xr)/d;
    }
  }

  // Nonsymmetric reduction from Hessenberg to real Schur form.
  void Hqr2 ();

  int n_;  // matrix dimension.

  Real *d_, *e_;  // real and imaginary parts of eigenvalues.
  Real *V_;  // the eigenvectors (P in our external notation)
  Real *H_;  // the nonsymmetric Hessenberg form.
  Real *ort_;  // working storage for nonsymmetric algorithm.

  // Symmetric Householder reduction to tridiagonal form.
  void Tred2 ();

  // Symmetric tridiagonal QL algorithm.
  void Tql2 ();

  // Nonsymmetric reduction to Hessenberg form.
  void Orthes ();
};

template class EigenvalueDecomposition<float>;  // force instantiation.
template class EigenvalueDecomposition<double>;  // force instantiation.
// Symmetric Householder reduction to tridiagonal form.
// Works in place on V_, which the constructor fills with the (symmetric) input
// matrix before calling this, and writes d_ and e_.  The constructor then
// passes the result to Tql2().
template<typename Real> void EigenvalueDecomposition<Real>::Tred2() {
// This is derived from the Algol procedures tred2 by
// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
// Fortran subroutine in EISPACK.
// Start from the last row of V.
for (int j = 0; j < n_; j++) {
d_[j] = V(n_-1, j);
}
// Householder reduction to tridiagonal form.
for (int i = n_-1; i > 0; i--) {
// Scale to avoid under/overflow.
Real scale = 0.0;
Real h = 0.0;
for (int k = 0; k < i; k++) {
scale = scale + std::abs(d_[k]);
}
if (scale == 0.0) {
// The first i entries of d_ are all zero: no reflection is generated for
// this step (h stays 0 and is stored in d_[i] below).
e_[i] = d_[i-1];
for (int j = 0; j < i; j++) {
d_[j] = V(i-1, j);
V(i, j) = 0.0;
V(j, i) = 0.0;
}
} else {
// Generate Householder vector.
for (int k = 0; k < i; k++) {
d_[k] /= scale;
h += d_[k] * d_[k];
}
Real f = d_[i-1];
Real g = std::sqrt(h);
// Give g the opposite sign to f, so that f - g below does not cancel.
if (f > 0) {
g = -g;
}
e_[i] = scale * g;
h = h - f * g;
d_[i-1] = f - g;
for (int j = 0; j < i; j++) {
e_[j] = 0.0;
}
// Apply similarity transformation to remaining columns.
for (int j = 0; j < i; j++) {
f = d_[j];
V(j, i) = f;
g =e_[j] + V(j, j) * f;
for (int k = j+1; k <= i-1; k++) {
g += V(k, j) * d_[k];
e_[k] += V(k, j) * f;
}
e_[j] = g;
}
f = 0.0;
for (int j = 0; j < i; j++) {
e_[j] /= h;
f += e_[j] * d_[j];
}
Real hh = f / (h + h);
for (int j = 0; j < i; j++) {
e_[j] -= hh * d_[j];
}
for (int j = 0; j < i; j++) {
f = d_[j];
g = e_[j];
for (int k = j; k <= i-1; k++) {
V(k, j) -= (f * e_[k] + g * d_[k]);
}
d_[j] = V(i-1, j);
V(i, j) = 0.0;
}
}
// Keep h for the accumulation phase below, which reads it back as d_[i+1].
d_[i] = h;
}
// Accumulate transformations.
for (int i = 0; i < n_-1; i++) {
V(n_-1, i) = V(i, i);
V(i, i) = 1.0;
Real h = d_[i+1];
// h == 0 marks a step where no reflection was generated.
if (h != 0.0) {
for (int k = 0; k <= i; k++) {
d_[k] = V(k, i+1) / h;
}
for (int j = 0; j <= i; j++) {
Real g = 0.0;
for (int k = 0; k <= i; k++) {
g += V(k, i+1) * V(k, j);
}
for (int k = 0; k <= i; k++) {
V(k, j) -= g * d_[k];
}
}
}
for (int k = 0; k <= i; k++) {
V(k, i+1) = 0.0;
}
}
// The last row of V temporarily holds the diagonal; move it into d_ and
// restore that row to the last row of the identity.
for (int j = 0; j < n_; j++) {
d_[j] = V(n_-1, j);
V(n_-1, j) = 0.0;
}
V(n_-1, n_-1) = 1.0;
e_[0] = 0.0;
}
// Symmetric tridiagonal QL algorithm.
// Called by the constructor right after Tred2(); it updates d_, e_ and V_ in
// place and finishes by sorting the entries of d_ into ascending order,
// swapping the columns of V to match.
// NOTE(review): `iter` is incremented but never checked, so nothing bounds the
// do/while loop if the convergence test keeps failing.
template<typename Real> void EigenvalueDecomposition<Real>::Tql2() {
// This is derived from the Algol procedures tql2, by
// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
// Fortran subroutine in EISPACK.
// Shift e_ down by one position, so that the last entry is zero.
for (int i = 1; i < n_; i++) {
e_[i-1] = e_[i];
}
e_[n_-1] = 0.0;
Real f = 0.0;
Real tst1 = 0.0;
Real eps = std::numeric_limits<Real>::epsilon();
for (int l = 0; l < n_; l++) {
// Find small subdiagonal element
tst1 = std::max(tst1, std::abs(d_[l]) + std::abs(e_[l]));
int m = l;
// e_[n_-1] is 0, so for finite data this loop breaks at m = n_-1 at the latest.
while (m < n_) {
if (std::abs(e_[m]) <= eps*tst1) {
break;
}
m++;
}
// If m == l, d_[l] is an eigenvalue,
// otherwise, iterate.
if (m > l) {
int iter = 0;
do {
iter = iter + 1; // (Could check iteration count here.)
// Compute implicit shift
Real g = d_[l];
Real p = (d_[l+1] - g) / (2.0 *e_[l]);
Real r = Hypot(p, static_cast<Real>(1.0)); // This is a Kaldi version of hypot that works with templates.
if (p < 0) {
r = -r;
}
d_[l] =e_[l] / (p + r);
d_[l+1] =e_[l] * (p + r);
Real dl1 = d_[l+1];
Real h = g - d_[l];
for (int i = l+2; i < n_; i++) {
d_[i] -= h;
}
// f accumulates the total shift; it is added back to d_[l] after convergence.
f = f + h;
// Implicit QL transformation.
p = d_[m];
Real c = 1.0;
Real c2 = c;
Real c3 = c;
Real el1 =e_[l+1];
Real s = 0.0;
Real s2 = 0.0;
for (int i = m-1; i >= l; i--) {
c3 = c2;
c2 = c;
s2 = s;
g = c *e_[i];
h = c * p;
r = Hypot(p, e_[i]); // This is a Kaldi version of Hypot that works with templates.
e_[i+1] = s * r;
s =e_[i] / r;
c = p / r;
p = c * d_[i] - s * g;
d_[i+1] = h + s * (c * g + s * d_[i]);
// Accumulate transformation.
for (int k = 0; k < n_; k++) {
h = V(k, i+1);
V(k, i+1) = s * V(k, i) + c * h;
V(k, i) = c * V(k, i) - s * h;
}
}
p = -s * s2 * c3 * el1 *e_[l] / dl1;
e_[l] = s * p;
d_[l] = c * p;
// Check for convergence.
} while (std::abs(e_[l]) > eps*tst1);
}
d_[l] = d_[l] + f;
e_[l] = 0.0;
}
// Sort eigenvalues and corresponding vectors.
// (Selection sort, ascending in d_; column i of V is swapped with column k.)
for (int i = 0; i < n_-1; i++) {
int k = i;
Real p = d_[i];
for (int j = i+1; j < n_; j++) {
if (d_[j] < p) {
k = j;
p = d_[j];
}
}
if (k != i) {
d_[k] = d_[i];
d_[i] = p;
for (int j = 0; j < n_; j++) {
p = V(j, i);
V(j, i) = V(j, k);
V(j, k) = p;
}
}
}
}
// Nonsymmetric reduction to Hessenberg form.
// Works in place on H_, which the constructor fills with the input matrix
// before calling this, using ort_ as scratch space; it then builds the
// accumulated transformation in V_.  The constructor calls Hqr2() afterwards.
template<typename Real>
void EigenvalueDecomposition<Real>::Orthes() {
// This is derived from the Algol procedures orthes and ortran,
// by Martin and Wilkinson, Handbook for Auto. Comp.,
// Vol.ii-Linear Algebra, and the corresponding
// Fortran subroutines in EISPACK.
// low and high always span the whole matrix here.
int low = 0;
int high = n_-1;
for (int m = low+1; m <= high-1; m++) {
// Scale column.
Real scale = 0.0;
for (int i = m; i <= high; i++) {
scale = scale + std::abs(H(i, m-1));
}
// A zero scale means column m-1 is already zero below the subdiagonal.
if (scale != 0.0) {
// Compute Householder transformation.
Real h = 0.0;
for (int i = high; i >= m; i--) {
ort_[i] = H(i, m-1)/scale;
h += ort_[i] * ort_[i];
}
Real g = std::sqrt(h);
// Give g the opposite sign to ort_[m], so that ort_[m] - g does not cancel.
if (ort_[m] > 0) {
g = -g;
}
h = h - ort_[m] * g;
ort_[m] = ort_[m] - g;
// Apply Householder similarity transformation
// H = (I - u*u'/h) * H * (I - u*u'/h)
// First the left factor (acts on rows m..high of each column j) ...
for (int j = m; j < n_; j++) {
Real f = 0.0;
for (int i = high; i >= m; i--) {
f += ort_[i]*H(i, j);
}
f = f/h;
for (int i = m; i <= high; i++) {
H(i, j) -= f*ort_[i];
}
}
// ... then the right factor (acts on columns m..high of each row i).
for (int i = 0; i <= high; i++) {
Real f = 0.0;
for (int j = high; j >= m; j--) {
f += ort_[j]*H(i, j);
}
f = f/h;
for (int j = m; j <= high; j++) {
H(i, j) -= f*ort_[j];
}
}
ort_[m] = scale*ort_[m];
H(m, m-1) = scale*g;
}
}
// Accumulate transformations (Algol's ortran).
// Start from V = identity.
for (int i = 0; i < n_; i++) {
for (int j = 0; j < n_; j++) {
V(i, j) = (i == j ? 1.0 : 0.0);
}
}
for (int m = high-1; m >= low+1; m--) {
if (H(m, m-1) != 0.0) {
// ort_[m] is still the value from the reduction loop above; the entries
// below it are reloaded from column m-1 of H.
for (int i = m+1; i <= high; i++) {
ort_[i] = H(i, m-1);
}
for (int j = m; j <= high; j++) {
Real g = 0.0;
for (int i = m; i <= high; i++) {
g += ort_[i] * V(i, j);
}
// Double division avoids possible underflow
g = (g / ort_[m]) / H(m, m-1);
for (int i = m; i <= high; i++) {
V(i, j) += g * ort_[i];
}
}
}
}
}
// Nonsymmetric reduction from Hessenberg to real Schur form.
// Called by the constructor after Orthes().  It works in place on H_ and V_
// and writes the real and imaginary parts of the eigenvalues to d_ and e_.
// After the iteration it back-substitutes within H and finally overwrites V
// with the product V * H ("Back transformation" at the end).
// NOTE(review): `iter` only selects the two ad hoc shifts (at 10 and at 30);
// it never terminates the outer loop, so nothing bounds the iteration if it
// fails to converge.
template<typename Real> void EigenvalueDecomposition<Real>::Hqr2() {
// This is derived from the Algol procedure hqr2,
// by Martin and Wilkinson, Handbook for Auto. Comp.,
// Vol.ii-Linear Algebra, and the corresponding
// Fortran subroutine in EISPACK.
int nn = n_;
int n = nn-1;
// low and high always span the whole matrix here, so the
// "i < low || i > high" branches below never execute.
int low = 0;
int high = nn-1;
Real eps = std::numeric_limits<Real>::epsilon();
Real exshift = 0.0;
Real p = 0, q = 0, r = 0, s = 0, z=0, t, w, x, y;
// Store roots isolated by balanc and compute matrix norm
Real norm = 0.0;
for (int i = 0; i < nn; i++) {
if (i < low || i > high) {
d_[i] = H(i, i);
e_[i] = 0.0;
}
for (int j = std::max(i-1, 0); j < nn; j++) {
norm = norm + std::abs(H(i, j));
}
}
// Outer loop over eigenvalue index
int iter = 0;
while (n >= low) {
// Look for single small sub-diagonal element
int l = n;
while (l > low) {
s = std::abs(H(l-1, l-1)) + std::abs(H(l, l));
if (s == 0.0) {
s = norm;
}
if (std::abs(H(l, l-1)) < eps * s) {
break;
}
l--;
}
// Check for convergence
// One root found
if (l == n) {
H(n, n) = H(n, n) + exshift;
d_[n] = H(n, n);
e_[n] = 0.0;
n--;
iter = 0;
// Two roots found
} else if (l == n-1) {
w = H(n, n-1) * H(n-1, n);
p = (H(n-1, n-1) - H(n, n)) / 2.0;
q = p * p + w;
z = std::sqrt(std::abs(q));
H(n, n) = H(n, n) + exshift;
H(n-1, n-1) = H(n-1, n-1) + exshift;
x = H(n, n);
// Real pair
if (q >= 0) {
if (p >= 0) {
z = p + z;
} else {
z = p - z;
}
d_[n-1] = x + z;
d_[n] = d_[n-1];
if (z != 0.0) {
d_[n] = x - w / z;
}
e_[n-1] = 0.0;
e_[n] = 0.0;
x = H(n, n-1);
s = std::abs(x) + std::abs(z);
p = x / s;
q = z / s;
r = std::sqrt(p * p+q * q);
p = p / r;
q = q / r;
// Row modification
for (int j = n-1; j < nn; j++) {
z = H(n-1, j);
H(n-1, j) = q * z + p * H(n, j);
H(n, j) = q * H(n, j) - p * z;
}
// Column modification
for (int i = 0; i <= n; i++) {
z = H(i, n-1);
H(i, n-1) = q * z + p * H(i, n);
H(i, n) = q * H(i, n) - p * z;
}
// Accumulate transformations
for (int i = low; i <= high; i++) {
z = V(i, n-1);
V(i, n-1) = q * z + p * V(i, n);
V(i, n) = q * V(i, n) - p * z;
}
// Complex pair
} else {
d_[n-1] = x + p;
d_[n] = x + p;
e_[n-1] = z;
e_[n] = -z;
}
n = n - 2;
iter = 0;
// No convergence yet
} else {
// Form shift
x = H(n, n);
y = 0.0;
w = 0.0;
if (l < n) {
y = H(n-1, n-1);
w = H(n, n-1) * H(n-1, n);
}
// Wilkinson's original ad hoc shift
if (iter == 10) {
exshift += x;
for (int i = low; i <= n; i++) {
H(i, i) -= x;
}
s = std::abs(H(n, n-1)) + std::abs(H(n-1, n-2));
x = y = 0.75 * s;
w = -0.4375 * s * s;
}
// MATLAB's new ad hoc shift
if (iter == 30) {
s = (y - x) / 2.0;
s = s * s + w;
if (s > 0) {
s = std::sqrt(s);
if (y < x) {
s = -s;
}
s = x - w / ((y - x) / 2.0 + s);
for (int i = low; i <= n; i++) {
H(i, i) -= s;
}
exshift += s;
x = y = w = 0.964;
}
}
iter = iter + 1; // (Could check iteration count here.)
// Look for two consecutive small sub-diagonal elements
int m = n-2;
while (m >= l) {
z = H(m, m);
r = x - z;
s = y - z;
p = (r * s - w) / H(m+1, m) + H(m, m+1);
q = H(m+1, m+1) - z - r - s;
r = H(m+2, m+1);
s = std::abs(p) + std::abs(q) + std::abs(r);
p = p / s;
q = q / s;
r = r / s;
if (m == l) {
break;
}
if (std::abs(H(m, m-1)) * (std::abs(q) + std::abs(r)) <
eps * (std::abs(p) * (std::abs(H(m-1, m-1)) + std::abs(z) +
std::abs(H(m+1, m+1))))) {
break;
}
m--;
}
for (int i = m+2; i <= n; i++) {
H(i, i-2) = 0.0;
if (i > m+2) {
H(i, i-3) = 0.0;
}
}
// Double QR step involving rows l:n and columns m:n
for (int k = m; k <= n-1; k++) {
bool notlast = (k != n-1);
// For k == m, the p, q, r computed by the search loop above are used as-is
// and x still holds the value set while forming the shift.
if (k != m) {
p = H(k, k-1);
q = H(k+1, k-1);
r = (notlast ? H(k+2, k-1) : 0.0);
x = std::abs(p) + std::abs(q) + std::abs(r);
if (x != 0.0) {
p = p / x;
q = q / x;
r = r / x;
}
}
if (x == 0.0) {
break;
}
s = std::sqrt(p * p + q * q + r * r);
if (p < 0) {
s = -s;
}
if (s != 0) {
if (k != m) {
H(k, k-1) = -s * x;
} else if (l != m) {
H(k, k-1) = -H(k, k-1);
}
p = p + s;
x = p / s;
y = q / s;
z = r / s;
q = q / p;
r = r / p;
// Row modification
for (int j = k; j < nn; j++) {
p = H(k, j) + q * H(k+1, j);
if (notlast) {
p = p + r * H(k+2, j);
H(k+2, j) = H(k+2, j) - p * z;
}
H(k, j) = H(k, j) - p * x;
H(k+1, j) = H(k+1, j) - p * y;
}
// Column modification
for (int i = 0; i <= std::min(n, k+3); i++) {
p = x * H(i, k) + y * H(i, k+1);
if (notlast) {
p = p + z * H(i, k+2);
H(i, k+2) = H(i, k+2) - p * r;
}
H(i, k) = H(i, k) - p;
H(i, k+1) = H(i, k+1) - p * q;
}
// Accumulate transformations
for (int i = low; i <= high; i++) {
p = x * V(i, k) + y * V(i, k+1);
if (notlast) {
p = p + z * V(i, k+2);
V(i, k+2) = V(i, k+2) - p * r;
}
V(i, k) = V(i, k) - p;
V(i, k+1) = V(i, k+1) - p * q;
}
} // (s != 0)
} // k loop
} // check convergence
} // while (n >= low)
// Backsubstitute to find vectors of upper triangular form
// (skipped entirely when every summed entry of H was zero).
if (norm == 0.0) {
return;
}
for (n = nn-1; n >= 0; n--) {
p = d_[n];
q = e_[n];
// Real vector
if (q == 0) {
int l = n;
H(n, n) = 1.0;
for (int i = n-1; i >= 0; i--) {
w = H(i, i) - p;
r = 0.0;
for (int j = l; j <= n; j++) {
r = r + H(i, j) * H(j, n);
}
// e_[i] < 0 means row i is the second row of a complex-conjugate pair (the
// pair is stored as e_[n-1] = z, e_[n] = -z above): save w and r in z and s
// and solve both rows together at the next value of i.
if (e_[i] < 0.0) {
z = w;
s = r;
} else {
l = i;
if (e_[i] == 0.0) {
if (w != 0.0) {
H(i, n) = -r / w;
} else {
H(i, n) = -r / (eps * norm);
}
// Solve real equations
} else {
x = H(i, i+1);
y = H(i+1, i);
q = (d_[i] - p) * (d_[i] - p) +e_[i] *e_[i];
t = (x * s - z * r) / q;
H(i, n) = t;
if (std::abs(x) > std::abs(z)) {
H(i+1, n) = (-r - w * t) / x;
} else {
H(i+1, n) = (-s - y * t) / z;
}
}
// Overflow control
t = std::abs(H(i, n));
if ((eps * t) * t > 1) {
for (int j = i; j <= n; j++) {
H(j, n) = H(j, n) / t;
}
}
}
}
// Complex vector
// (only the q < 0 member of each conjugate pair is processed; columns n-1 and
// n of H receive the real and imaginary parts.)
} else if (q < 0) {
int l = n-1;
// Last vector component imaginary so matrix is triangular
if (std::abs(H(n, n-1)) > std::abs(H(n-1, n))) {
H(n-1, n-1) = q / H(n, n-1);
H(n-1, n) = -(H(n, n) - p) / H(n, n-1);
} else {
Real cdivr, cdivi;
cdiv(0.0, -H(n-1, n), H(n-1, n-1)-p, q, &cdivr, &cdivi);
H(n-1, n-1) = cdivr;
H(n-1, n) = cdivi;
}
H(n, n-1) = 0.0;
H(n, n) = 1.0;
for (int i = n-2; i >= 0; i--) {
Real ra, sa, vr, vi;
ra = 0.0;
sa = 0.0;
for (int j = l; j <= n; j++) {
ra = ra + H(i, j) * H(j, n-1);
sa = sa + H(i, j) * H(j, n);
}
w = H(i, i) - p;
// As in the real case: defer the second row of a complex pair to the next i.
if (e_[i] < 0.0) {
z = w;
r = ra;
s = sa;
} else {
l = i;
if (e_[i] == 0) {
Real cdivr, cdivi;
cdiv(-ra, -sa, w, q, &cdivr, &cdivi);
H(i, n-1) = cdivr;
H(i, n) = cdivi;
} else {
Real cdivr, cdivi;
// Solve complex equations
x = H(i, i+1);
y = H(i+1, i);
vr = (d_[i] - p) * (d_[i] - p) +e_[i] *e_[i] - q * q;
vi = (d_[i] - p) * 2.0 * q;
// Replace an exactly zero divisor by a tiny non-zero one before calling cdiv.
if (vr == 0.0 && vi == 0.0) {
vr = eps * norm * (std::abs(w) + std::abs(q) +
std::abs(x) + std::abs(y) + std::abs(z));
}
cdiv(x*r-z*ra+q*sa, x*s-z*sa-q*ra, vr, vi, &cdivr, &cdivi);
H(i, n-1) = cdivr;
H(i, n) = cdivi;
if (std::abs(x) > (std::abs(z) + std::abs(q))) {
H(i+1, n-1) = (-ra - w * H(i, n-1) + q * H(i, n)) / x;
H(i+1, n) = (-sa - w * H(i, n) - q * H(i, n-1)) / x;
} else {
cdiv(-r-y*H(i, n-1), -s-y*H(i, n), z, q, &cdivr, &cdivi);
H(i+1, n-1) = cdivr;
H(i+1, n) = cdivi;
}
}
// Overflow control
t = std::max(std::abs(H(i, n-1)), std::abs(H(i, n)));
if ((eps * t) * t > 1) {
for (int j = i; j <= n; j++) {
H(j, n-1) = H(j, n-1) / t;
H(j, n) = H(j, n) / t;
}
}
}
}
}
}
// Vectors of isolated roots
for (int i = 0; i < nn; i++) {
if (i < low || i > high) {
for (int j = i; j < nn; j++) {
V(i, j) = H(i, j);
}
}
}
// Back transformation to get eigenvectors of original matrix
// Columns are processed from last to first because V(i, j) is overwritten
// in place and each sum reads only columns k <= j.
for (int j = nn-1; j >= low; j--) {
for (int i = low; i <= high; i++) {
z = 0.0;
for (int k = low; k <= std::min(j, high); k++) {
z = z + V(i, k) * H(k, j);
}
V(i, j) = z;
}
}
}
// Computes the eigenvalue decomposition of the square, non-empty matrix A.
// The storage needed by both code paths (V_, d_, e_) is allocated first; the
// Hessenberg work arrays (H_, ort_) are allocated only for non-symmetric
// input and stay NULL otherwise.
template<typename Real>
EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A) {
  KALDI_ASSERT(A.NumCols() == A.NumRows() && A.NumCols() >= 1);
  n_ = A.NumRows();
  V_ = new Real[n_*n_];
  d_ = new Real[n_];
  e_ = new Real[n_];
  H_ = NULL;
  ort_ = NULL;

  // A tolerance of 0.0 requires exact symmetry.
  const bool is_symmetric = A.IsSymmetric(0.0);

  if (!is_symmetric) {
    // General case: work on a copy of A held in H_.
    H_ = new Real[n_*n_];
    ort_ = new Real[n_];
    for (int row = 0; row < n_; ++row) {
      for (int col = 0; col < n_; ++col) {
        // H(row, col) is the member accessor; A(row, col) is the matrix operator.
        H(row, col) = A(row, col);
      }
    }
    Orthes();  // Reduce to Hessenberg form.
    Hqr2();    // Reduce Hessenberg to real Schur form.
    return;
  }

  // Symmetric case: work directly on a copy of A held in V_.
  for (int row = 0; row < n_; ++row) {
    for (int col = 0; col < n_; ++col) {
      // V(row, col) is the member accessor; A(row, col) is the matrix operator.
      V(row, col) = A(row, col);
    }
  }
  Tred2();  // Tridiagonalize.
  Tql2();   // Diagonalize.
}
// Frees every array the constructor may have allocated.  H_ and ort_ are NULL
// when the input was symmetric; delete[] on a null pointer does nothing.
template<typename Real>
EigenvalueDecomposition<Real>::~EigenvalueDecomposition() {
  // Work arrays of the non-symmetric path.
  delete [] ort_;
  delete [] H_;
  // Arrays that are always allocated.
  delete [] V_;
  delete [] e_;
  delete [] d_;
}
// see function MatrixBase<Real>::Eig in kaldi-matrix.cc
} // namespace kaldi
#endif // KALDI_MATRIX_JAMA_EIG_H_

@ -1,531 +0,0 @@
// matrix/jama-svd.h
// Copyright 2009-2011 Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
// This file consists of a port and modification of materials from
// JAMA: A Java Matrix Package
// under the following notice: This software is a cooperative product of
// The MathWorks and the National Institute of Standards and Technology (NIST)
// which has been released to the public. This notice and the original code are
// available at http://math.nist.gov/javanumerics/jama/domain.notice
#ifndef KALDI_MATRIX_JAMA_SVD_H_
#define KALDI_MATRIX_JAMA_SVD_H_ 1
#include "matrix/kaldi-matrix.h"
#include "matrix/sp-matrix.h"
#include "matrix/cblas-wrappers.h"
namespace kaldi {
#if defined(HAVE_ATLAS) || defined(USE_KALDI_SVD)
// using ATLAS as our math library, which doesn't have SVD -> need
// to implement it.
// This routine is a modified form of jama_svd.h which is part of the TNT distribution.
// (originally comes from JAMA).
/** Singular Value Decomposition.
* <P>
* For an m-by-n matrix A with m >= n, the singular value decomposition is
* an m-by-n orthogonal matrix U, an n-by-n diagonal matrix S, and
* an n-by-n orthogonal matrix V so that A = U*S*V'.
* <P>
* The singular values, sigma[k] = S(k, k), are ordered so that
* sigma[0] >= sigma[1] >= ... >= sigma[n-1].
* <P>
* The singular value decomposition always exists, so the constructor will
* never fail. The matrix condition number and the effective numerical
* rank can be computed from this decomposition.
* <p>
* (Adapted from JAMA, a Java Matrix Library, developed jointly by
* by the Mathworks and NIST; see http://math.nist.gov/javanumerics/jama).
*/
template<typename Real>
bool MatrixBase<Real>::JamaSvd(VectorBase<Real> *s_in,
MatrixBase<Real> *U_in,
MatrixBase<Real> *V_in) { // Destructive!
KALDI_ASSERT(s_in != NULL && U_in != this && V_in != this);
int wantu = (U_in != NULL), wantv = (V_in != NULL);
Matrix<Real> Utmp, Vtmp;
MatrixBase<Real> &U = (U_in ? *U_in : Utmp), &V = (V_in ? *V_in : Vtmp);
VectorBase<Real> &s = *s_in;
int m = num_rows_, n = num_cols_;
KALDI_ASSERT(m>=n && m != 0 && n != 0);
if (wantu) KALDI_ASSERT((int)U.num_rows_ == m && (int)U.num_cols_ == n);
if (wantv) KALDI_ASSERT((int)V.num_rows_ == n && (int)V.num_cols_ == n);
KALDI_ASSERT((int)s.Dim() == n); // n<=m so n is min.
int nu = n;
U.SetZero(); // make sure all zero.
Vector<Real> e(n);
Vector<Real> work(m);
MatrixBase<Real> &A(*this);
Real *adata = A.Data(), *workdata = work.Data(), *edata = e.Data(),
*udata = U.Data(), *vdata = V.Data();
int astride = static_cast<int>(A.Stride()),
ustride = static_cast<int>(U.Stride()),
vstride = static_cast<int>(V.Stride());
int i = 0, j = 0, k = 0;
// Reduce A to bidiagonal form, storing the diagonal elements
// in s and the super-diagonal elements in e.
int nct = std::min(m-1, n);
int nrt = std::max(0, std::min(n-2, m));
for (k = 0; k < std::max(nct, nrt); k++) {
if (k < nct) {
// Compute the transformation for the k-th column and
// place the k-th diagonal in s(k).
// Compute 2-norm of k-th column without under/overflow.
s(k) = 0;
for (i = k; i < m; i++) {
s(k) = hypot(s(k), A(i, k));
}
if (s(k) != 0.0) {
if (A(k, k) < 0.0) {
s(k) = -s(k);
}
for (i = k; i < m; i++) {
A(i, k) /= s(k);
}
A(k, k) += 1.0;
}
s(k) = -s(k);
}
for (j = k+1; j < n; j++) {
if ((k < nct) && (s(k) != 0.0)) {
// Apply the transformation.
Real t = cblas_Xdot(m - k, adata + astride*k + k, astride,
adata + astride*k + j, astride);
/*for (i = k; i < m; i++) {
t += adata[i*astride + k]*adata[i*astride + j]; // A(i, k)*A(i, j); // 3
}*/
t = -t/A(k, k);
cblas_Xaxpy(m - k, t, adata + k*astride + k, astride,
adata + k*astride + j, astride);
/*for (i = k; i < m; i++) {
adata[i*astride + j] += t*adata[i*astride + k]; // A(i, j) += t*A(i, k); // 5
}*/
}
// Place the k-th row of A into e for the
// subsequent calculation of the row transformation.
e(j) = A(k, j);
}
if (wantu & (k < nct)) {
// Place the transformation in U for subsequent back
// multiplication.
for (i = k; i < m; i++) {
U(i, k) = A(i, k);
}
}
if (k < nrt) {
// Compute the k-th row transformation and place the
// k-th super-diagonal in e(k).
// Compute 2-norm without under/overflow.
e(k) = 0;
for (i = k+1; i < n; i++) {
e(k) = hypot(e(k), e(i));
}
if (e(k) != 0.0) {
if (e(k+1) < 0.0) {
e(k) = -e(k);
}
for (i = k+1; i < n; i++) {
e(i) /= e(k);
}
e(k+1) += 1.0;
}
e(k) = -e(k);
if ((k+1 < m) & (e(k) != 0.0)) {
// Apply the transformation.
for (i = k+1; i < m; i++) {
work(i) = 0.0;
}
for (j = k+1; j < n; j++) {
for (i = k+1; i < m; i++) {
workdata[i] += edata[j] * adata[i*astride + j]; // work(i) += e(j)*A(i, j); // 5
}
}
for (j = k+1; j < n; j++) {
Real t(-e(j)/e(k+1));
cblas_Xaxpy(m - (k+1), t, workdata + (k+1), 1,
adata + (k+1)*astride + j, astride);
/*
for (i = k+1; i < m; i++) {
adata[i*astride + j] += t*workdata[i]; // A(i, j) += t*work(i); // 5
}*/
}
}
if (wantv) {
// Place the transformation in V for subsequent
// back multiplication.
for (i = k+1; i < n; i++) {
V(i, k) = e(i);
}
}
}
}
// Set up the final bidiagonal matrix or order p.
int p = std::min(n, m+1);
if (nct < n) {
s(nct) = A(nct, nct);
}
if (m < p) {
s(p-1) = 0.0;
}
if (nrt+1 < p) {
e(nrt) = A(nrt, p-1);
}
e(p-1) = 0.0;
// If required, generate U.
if (wantu) {
for (j = nct; j < nu; j++) {
for (i = 0; i < m; i++) {
U(i, j) = 0.0;
}
U(j, j) = 1.0;
}
for (k = nct-1; k >= 0; k--) {
if (s(k) != 0.0) {
for (j = k+1; j < nu; j++) {
Real t = cblas_Xdot(m - k, udata + k*ustride + k, ustride, udata + k*ustride + j, ustride);
//for (i = k; i < m; i++) {
// t += udata[i*ustride + k]*udata[i*ustride + j]; // t += U(i, k)*U(i, j); // 8
// }
t = -t/U(k, k);
cblas_Xaxpy(m - k, t, udata + ustride*k + k, ustride,
udata + k*ustride + j, ustride);
/*for (i = k; i < m; i++) {
udata[i*ustride + j] += t*udata[i*ustride + k]; // U(i, j) += t*U(i, k); // 4
}*/
}
for (i = k; i < m; i++ ) {
U(i, k) = -U(i, k);
}
U(k, k) = 1.0 + U(k, k);
for (i = 0; i < k-1; i++) {
U(i, k) = 0.0;
}
} else {
for (i = 0; i < m; i++) {
U(i, k) = 0.0;
}
U(k, k) = 1.0;
}
}
}
// If required, generate V.
if (wantv) {
for (k = n-1; k >= 0; k--) {
if ((k < nrt) & (e(k) != 0.0)) {
for (j = k+1; j < nu; j++) {
Real t = cblas_Xdot(n - (k+1), vdata + (k+1)*vstride + k, vstride,
vdata + (k+1)*vstride + j, vstride);
/*Real t (0.0);
for (i = k+1; i < n; i++) {
t += vdata[i*vstride + k]*vdata[i*vstride + j]; // t += V(i, k)*V(i, j); // 7
}*/
t = -t/V(k+1, k);
cblas_Xaxpy(n - (k+1), t, vdata + (k+1)*vstride + k, vstride,
vdata + (k+1)*vstride + j, vstride);
/*for (i = k+1; i < n; i++) {
vdata[i*vstride + j] += t*vdata[i*vstride + k]; // V(i, j) += t*V(i, k); // 7
}*/
}
}
for (i = 0; i < n; i++) {
V(i, k) = 0.0;
}
V(k, k) = 1.0;
}
}
// Main iteration loop for the singular values.
int pp = p-1;
int iter = 0;
// note: -52.0 is from Jama code; the -23 is the extension
// to float, because mantissa length in (double, float)
// is (52, 23) bits respectively.
Real eps(pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0));
// Note: the -966 was taken from Jama code, but the -120 is a guess
// of how to extend this to float... the exponent in double goes
// from -1022 .. 1023, and in float from -126..127. I'm not sure
// what the significance of 966 is, so -120 just represents a number
// that's a bit less negative than -126. If we get convergence
// failure in float only, this may mean that we have to make the
// -120 value less negative.
Real tiny(pow(2.0, sizeof(Real) == 4 ? -120.0: -966.0 ));
while (p > 0) {
int k = 0;
int kase = 0;
if (iter == 500 || iter == 750) {
KALDI_WARN << "Svd taking a long time: making convergence criterion less exact.";
eps = pow(static_cast<Real>(0.8), eps);
tiny = pow(static_cast<Real>(0.8), tiny);
}
if (iter > 1000) {
KALDI_WARN << "Svd not converging on matrix of size " << m << " by " <<n;
return false;
}
// This section of the program inspects for
// negligible elements in the s and e arrays. On
// completion the variables kase and k are set as follows.
// kase = 1 if s(p) and e(k-1) are negligible and k < p
// kase = 2 if s(k) is negligible and k < p
// kase = 3 if e(k-1) is negligible, k < p, and
// s(k), ..., s(p) are not negligible (qr step).
// kase = 4 if e(p-1) is negligible (convergence).
for (k = p-2; k >= -1; k--) {
if (k == -1) {
break;
}
if (std::abs(e(k)) <=
tiny + eps*(std::abs(s(k)) + std::abs(s(k+1)))) {
e(k) = 0.0;
break;
}
}
if (k == p-2) {
kase = 4;
} else {
int ks;
for (ks = p-1; ks >= k; ks--) {
if (ks == k) {
break;
}
Real t( (ks != p ? std::abs(e(ks)) : 0.) +
(ks != k+1 ? std::abs(e(ks-1)) : 0.));
if (std::abs(s(ks)) <= tiny + eps*t) {
s(ks) = 0.0;
break;
}
}
if (ks == k) {
kase = 3;
} else if (ks == p-1) {
kase = 1;
} else {
kase = 2;
k = ks;
}
}
k++;
// Perform the task indicated by kase.
switch (kase) {
// Deflate negligible s(p).
case 1: {
Real f(e(p-2));
e(p-2) = 0.0;
for (j = p-2; j >= k; j--) {
Real t( hypot(s(j), f));
Real cs(s(j)/t);
Real sn(f/t);
s(j) = t;
if (j != k) {
f = -sn*e(j-1);
e(j-1) = cs*e(j-1);
}
if (wantv) {
for (i = 0; i < n; i++) {
t = cs*V(i, j) + sn*V(i, p-1);
V(i, p-1) = -sn*V(i, j) + cs*V(i, p-1);
V(i, j) = t;
}
}
}
}
break;
// Split at negligible s(k).
case 2: {
Real f(e(k-1));
e(k-1) = 0.0;
for (j = k; j < p; j++) {
Real t(hypot(s(j), f));
Real cs( s(j)/t);
Real sn(f/t);
s(j) = t;
f = -sn*e(j);
e(j) = cs*e(j);
if (wantu) {
for (i = 0; i < m; i++) {
t = cs*U(i, j) + sn*U(i, k-1);
U(i, k-1) = -sn*U(i, j) + cs*U(i, k-1);
U(i, j) = t;
}
}
}
}
break;
// Perform one qr step.
case 3: {
// Calculate the shift.
Real scale = std::max(std::max(std::max(std::max(
std::abs(s(p-1)), std::abs(s(p-2))), std::abs(e(p-2))),
std::abs(s(k))), std::abs(e(k)));
Real sp = s(p-1)/scale;
Real spm1 = s(p-2)/scale;
Real epm1 = e(p-2)/scale;
Real sk = s(k)/scale;
Real ek = e(k)/scale;
Real b = ((spm1 + sp)*(spm1 - sp) + epm1*epm1)/2.0;
Real c = (sp*epm1)*(sp*epm1);
Real shift = 0.0;
if ((b != 0.0) || (c != 0.0)) {
shift = std::sqrt(b*b + c);
if (b < 0.0) {
shift = -shift;
}
shift = c/(b + shift);
}
Real f = (sk + sp)*(sk - sp) + shift;
Real g = sk*ek;
// Chase zeros.
for (j = k; j < p-1; j++) {
Real t = hypot(f, g);
Real cs = f/t;
Real sn = g/t;
if (j != k) {
e(j-1) = t;
}
f = cs*s(j) + sn*e(j);
e(j) = cs*e(j) - sn*s(j);
g = sn*s(j+1);
s(j+1) = cs*s(j+1);
if (wantv) {
cblas_Xrot(n, vdata + j, vstride, vdata + j+1, vstride, cs, sn);
/*for (i = 0; i < n; i++) {
t = cs*vdata[i*vstride + j] + sn*vdata[i*vstride + j+1]; // t = cs*V(i, j) + sn*V(i, j+1); // 13
vdata[i*vstride + j+1] = -sn*vdata[i*vstride + j] + cs*vdata[i*vstride + j+1]; // V(i, j+1) = -sn*V(i, j) + cs*V(i, j+1); // 5
vdata[i*vstride + j] = t; // V(i, j) = t; // 4
}*/
}
t = hypot(f, g);
cs = f/t;
sn = g/t;
s(j) = t;
f = cs*e(j) + sn*s(j+1);
s(j+1) = -sn*e(j) + cs*s(j+1);
g = sn*e(j+1);
e(j+1) = cs*e(j+1);
if (wantu && (j < m-1)) {
cblas_Xrot(m, udata + j, ustride, udata + j+1, ustride, cs, sn);
/*for (i = 0; i < m; i++) {
t = cs*udata[i*ustride + j] + sn*udata[i*ustride + j+1]; // t = cs*U(i, j) + sn*U(i, j+1); // 7
udata[i*ustride + j+1] = -sn*udata[i*ustride + j] +cs*udata[i*ustride + j+1]; // U(i, j+1) = -sn*U(i, j) + cs*U(i, j+1); // 8
udata[i*ustride + j] = t; // U(i, j) = t; // 1
}*/
}
}
e(p-2) = f;
iter = iter + 1;
}
break;
// Convergence.
case 4: {
// Make the singular values positive.
if (s(k) <= 0.0) {
s(k) = (s(k) < 0.0 ? -s(k) : 0.0);
if (wantv) {
for (i = 0; i <= pp; i++) {
V(i, k) = -V(i, k);
}
}
}
// Order the singular values.
while (k < pp) {
if (s(k) >= s(k+1)) {
break;
}
Real t = s(k);
s(k) = s(k+1);
s(k+1) = t;
if (wantv && (k < n-1)) {
for (i = 0; i < n; i++) {
t = V(i, k+1); V(i, k+1) = V(i, k); V(i, k) = t;
}
}
if (wantu && (k < m-1)) {
for (i = 0; i < m; i++) {
t = U(i, k+1); U(i, k+1) = U(i, k); U(i, k) = t;
}
}
k++;
}
iter = 0;
p--;
}
break;
}
}
return true;
}
#endif // defined(HAVE_ATLAS) || defined(USE_KALDI_SVD)
} // namespace kaldi
#endif // KALDI_MATRIX_JAMA_SVD_H_

@ -1,139 +0,0 @@
// matrix/kaldi-blas.h
// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_KALDI_BLAS_H_
#define KALDI_MATRIX_KALDI_BLAS_H_
// This file handles the #includes for BLAS, LAPACK and so on.
// It manipulates the declarations into a common format that kaldi can handle.
// However, the kaldi code will check whether HAVE_ATLAS is defined as that
// code is called a bit differently from CLAPACK that comes from other sources.
// The branches handled below are:
// (i) you have ATLAS, which includes the ATLAS implementation of CBLAS
// plus a subset of CLAPACK (but with clapack_ in the function declarations).
// In this case, define HAVE_ATLAS and make sure the relevant directories are
// in the include path.
// (ii) you have CBLAS (some implementation thereof) plus CLAPACK.
// In this case, define HAVE_CLAPACK.
// [Since CLAPACK depends on BLAS, the presence of BLAS is implicit].
// (iii) you have the MKL library, which includes CLAPACK and CBLAS.
// In this case, define HAVE_MKL.
// (iv) you have OpenBLAS, which provides cblas.h and lapacke.h.
// This is selected by HAVE_OPENBLAS, which this file defines itself (below).
// Note that if we are using ATLAS, no Svd implementation is supplied,
// so we define HAVE_Svd to be zero and this directs our implementation to
// supply its own "by hand" implementation which is based on TNT code.

// NOTE(review): HAVE_OPENBLAS is defined unconditionally here, so:
//  - the final "#else / #error" branch of the include chain below can never
//    be reached;
//  - the "typedef int KaldiBlasInt" near the end is always emitted, even when
//    HAVE_CLAPACK or HAVE_MKL is also defined and adds its own typedef;
//  - the mutual-exclusion check right below does not look at HAVE_OPENBLAS.
// Confirm this is intended before enabling any other backend.
#define HAVE_OPENBLAS
// Reject configurations that select more than one of CLAPACK / ATLAS / MKL.
#if (defined(HAVE_CLAPACK) && (defined(HAVE_ATLAS) || defined(HAVE_MKL))) \
|| (defined(HAVE_ATLAS) && defined(HAVE_MKL))
#error "Do not define more than one of HAVE_CLAPACK, HAVE_ATLAS and HAVE_MKL"
#endif
// Pull in the headers of the first backend that is defined, tested in the
// order ATLAS, CLAPACK, MKL, OpenBLAS.
#ifdef HAVE_ATLAS
extern "C" {
#include "cblas.h"
#include "clapack.h"
}
#elif defined(HAVE_CLAPACK)
#ifdef __APPLE__
// On Apple platforms the CLAPACK declarations come from Accelerate.
#ifndef __has_extension
#define __has_extension(x) 0
#endif
// Pre-define these two names before including Accelerate.
// NOTE(review): they look like the include guards of two vImage headers,
// defined so that those headers are skipped -- confirm.
#define vImage_Utilities_h
#define vImage_CVUtilities_h
#include <Accelerate/Accelerate.h>
// Map Accelerate's __CLPK_* types onto the f2c-style names used for the
// CLAPACK branch (e.g. "integer" is used for KaldiBlasInt below).
typedef __CLPK_integer integer;
typedef __CLPK_logical logical;
typedef __CLPK_real real;
typedef __CLPK_doublereal doublereal;
typedef __CLPK_complex complex;
typedef __CLPK_doublecomplex doublecomplex;
typedef __CLPK_ftnlen ftnlen;
#else
extern "C" {
// May be in /usr/[local]/include if installed; else this uses the one
// from the tools/CLAPACK_include directory.
#include <cblas.h>
#include <f2c.h>
#include <clapack.h>
// get rid of macros from f2c.h -- these are dangerous.
#undef abs
#undef dabs
#undef min
#undef max
#undef dmin
#undef dmax
#undef bit_test
#undef bit_clear
#undef bit_set
}
#endif
#elif defined(HAVE_MKL)
extern "C" {
#include <mkl.h>
}
#elif defined(HAVE_OPENBLAS)
// getting cblas.h and lapacke.h from <openblas-install-dir>/.
// putting in "" not <> to search -I before system libraries.
#if defined(_MSC_VER)
// With MSVC, supply custom LAPACKE complex types based on the compiler's
// _Fcomplex / _Dcomplex before lapacke.h is included.
#include <complex.h>
#define LAPACK_COMPLEX_CUSTOM
#define lapack_complex_float _Fcomplex
#define lapack_complex_double _Dcomplex
#endif
#include "cblas.h"
#include "lapacke.h"
// Remove the "I" and "complex" macros so they cannot clash with identifiers
// used later.
#undef I
#undef complex
// Remove the same macro names that the CLAPACK branch removes after f2c.h
// (f2c.h itself is not included in this branch).
#undef abs
#undef dabs
#undef min
#undef max
#undef dmin
#undef dmax
#undef bit_test
#undef bit_clear
#undef bit_set
#else
// Unreachable while HAVE_OPENBLAS is defined unconditionally above.
#error "You need to define (using the preprocessor) either HAVE_CLAPACK or HAVE_ATLAS or HAVE_MKL (but not more than one)"
#endif
// KaldiBlasInt: the integer type declared for each LAPACK backend.
#ifdef HAVE_OPENBLAS
typedef int KaldiBlasInt; // try int.
#endif
#ifdef HAVE_CLAPACK
typedef integer KaldiBlasInt;
#endif
#ifdef HAVE_MKL
typedef MKL_INT KaldiBlasInt;
#endif
#ifdef HAVE_ATLAS
// in this case there is no need for KaldiBlasInt-- this typedef is only needed
// for Svd code which is not included in ATLAS (we re-implement it).
#endif
#endif // KALDI_MATRIX_KALDI_BLAS_H_

@ -1,612 +0,0 @@
// matrix/kaldi-vector.h
// Copyright 2009-2012 Ondrej Glembek; Microsoft Corporation; Lukas Burget;
// Saarland University (Author: Arnab Ghoshal);
// Ariya Rastrow; Petr Schwarz; Yanmin Qian;
// Karel Vesely; Go Vivace Inc.; Arnab Ghoshal
// Wei Shi;
// 2015 Guoguo Chen
// 2017 Daniel Galvez
// 2019 Yiwen Shao
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_KALDI_VECTOR_H_
#define KALDI_MATRIX_KALDI_VECTOR_H_ 1
#include "matrix/matrix-common.h"
namespace kaldi {
/// \addtogroup matrix_group
/// @{
/// Provides a vector abstraction class.
/// This class provides a way to work with vectors in kaldi.
/// It holds a data pointer and a dimension but does not own or free the
/// memory itself; allocation is handled by child classes (see the protected
/// destructor below).
template<typename Real>
class VectorBase {
 public:
  /// Set vector to all zeros.
  void SetZero();

  /// Returns true if vector is all zeros.
  /// NOTE(review): "cutoff" is presumably the absolute tolerance below which
  /// an element counts as zero -- confirm against the definition.
  bool IsZero(Real cutoff = 1.0e-06) const;  // replace magic number

  /// Set all members of a vector to a specified value.
  void Set(Real f);

  /// Set vector to random normally-distributed noise.
  void SetRandn();

  /// Sets to numbers uniformly distributed on (0,1)
  void SetRandUniform();

  /// This function returns a random index into this vector,
  /// chosen with probability proportional to the corresponding
  /// element.  Requires that this->Min() >= 0 and this->Sum() > 0.
  MatrixIndexT RandCategorical() const;

  /// Returns the dimension of the vector.
  inline MatrixIndexT Dim() const { return dim_; }

  /// Returns the size in memory of the vector, in bytes.
  /// NOTE(review): the product is converted to MatrixIndexT on return, so it
  /// may not be representable for very large vectors -- confirm the range
  /// callers rely on.
  inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }

  /// Returns a pointer to the start of the vector's data.
  inline Real* Data() { return data_; }

  /// Returns a pointer to the start of the vector's data (const).
  inline const Real* Data() const { return data_; }

  /// Indexing operator (const).
  inline Real operator() (MatrixIndexT i) const {
    // The unsigned comparison rejects negative indices and i >= dim_ at once.
    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                          static_cast<UnsignedMatrixIndexT>(dim_));
    return *(data_ + i);
  }

  /// Indexing operator (non-const).
  inline Real & operator() (MatrixIndexT i) {
    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                          static_cast<UnsignedMatrixIndexT>(dim_));
    return *(data_ + i);
  }

  /** @brief Returns a sub-vector of a vector (a range of elements).
   *  @param o [in] Origin, 0 <= o <= Dim()
   *  @param l [in] Length, 0 <= l <= Dim()-o
   *  @return A SubVector object that aliases the data of the Vector object.
   *  See @c SubVector class for details   */
  SubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
    return SubVector<Real>(*this, o, l);
  }

  /** @brief Returns a const sub-vector of a vector (a range of elements).
   *  @param o [in] Origin, 0 <= o <= Dim()
   *  @param l [in] Length, 0 <= l <= Dim()-o
   *  @return A SubVector object that aliases the data of the Vector object.
   *  See @c SubVector class for details   */
  const SubVector<Real> Range(const MatrixIndexT o,
                              const MatrixIndexT l) const {
    return SubVector<Real>(*this, o, l);
  }

  /// Copy data from another vector (must match own size).
  void CopyFromVec(const VectorBase<Real> &v);

  /// Copy data from a SpMatrix or TpMatrix (must match own size).
  template<typename OtherReal>
  void CopyFromPacked(const PackedMatrix<OtherReal> &M);

  /// Copy data from another vector of different type (double vs. float)
  template<typename OtherReal>
  void CopyFromVec(const VectorBase<OtherReal> &v);

  /// Copy from CuVector.  This is defined in ../cudamatrix/cu-vector.h
  template<typename OtherReal>
  void CopyFromVec(const CuVectorBase<OtherReal> &v);

  /// Applies floor to all elements of v, storing the result in *this.
  /// Returns number of elements floored in floored_count if it is non-null.
  void Floor(const VectorBase<Real> &v, Real floor_val,
             MatrixIndexT *floored_count = nullptr);

  /// Applies ceiling to all elements of v, storing the result in *this.
  /// Returns number of elements changed in ceiled_count if it is non-null.
  void Ceiling(const VectorBase<Real> &v, Real ceil_val,
               MatrixIndexT *ceiled_count = nullptr);

  /// Element-wise power of v, stored in *this (ApplyPow() calls this with
  /// v == *this).
  void Pow(const VectorBase<Real> &v, Real power);

  /// Apply natural log to all elements.  Throw if any element of
  /// the vector is negative (but doesn't complain about zero; the
  /// log will be -infinity).
  void ApplyLog();

  /// Apply natural log to another vector and put result in *this.
  void ApplyLogAndCopy(const VectorBase<Real> &v);

  /// Apply exponential to each value in vector.
  void ApplyExp();

  /// Take absolute value of each of the elements
  void ApplyAbs();

  /// Applies floor to all elements. Returns number of elements
  /// floored in floored_count if it is non-null.
  inline void ApplyFloor(Real floor_val,
                         MatrixIndexT *floored_count = nullptr) {
    this->Floor(*this, floor_val, floored_count);
  }

  /// Applies ceiling to all elements. Returns number of elements
  /// changed in ceiled_count if it is non-null.
  inline void ApplyCeiling(Real ceil_val,
                           MatrixIndexT *ceiled_count = nullptr) {
    this->Ceiling(*this, ceil_val, ceiled_count);
  }

  /// Applies floor to all elements. Returns number of elements floored.
  MatrixIndexT ApplyFloor(const VectorBase<Real> &floor_vec);

  /// Apply soft-max to vector and return normalizer (log sum of exponentials).
  /// This is the same as: \f$ x(i) = exp(x(i)) / \sum_i exp(x(i)) \f$
  Real ApplySoftMax();

  /// Applies log soft-max to vector and returns normalizer (log sum of
  /// exponentials).
  /// This is the same as: \f$ x(i) = x(i) - log(\sum_i exp(x(i))) \f$
  Real ApplyLogSoftMax();

  /// Sets each element of *this to the tanh of the corresponding element of "src".
  void Tanh(const VectorBase<Real> &src);

  /// Sets each element of *this to the sigmoid function of the corresponding
  /// element of "src".
  void Sigmoid(const VectorBase<Real> &src);

  /// Take all elements of vector to a power.
  inline void ApplyPow(Real power) {
    this->Pow(*this, power);
  }

  /// Take the absolute value of all elements of a vector to a power.
  /// Include the sign of the input element if include_sign == true.
  /// If power is negative and the input value is zero, the output is set zero.
  void ApplyPowAbs(Real power, bool include_sign=false);

  /// Compute the p-th norm of the vector.
  Real Norm(Real p) const;

  /// Returns true if ((*this)-other).Norm(2.0) <= tol * (*this).Norm(2.0).
  bool ApproxEqual(const VectorBase<Real> &other, float tol = 0.01) const;

  /// Invert all elements.
  void InvertElements();

  /// Add vector : *this = *this + alpha * v  (with casting between floats and
  /// doubles)
  template<typename OtherReal>
  void AddVec(const Real alpha, const VectorBase<OtherReal> &v);

  /// Add vector : *this = *this + alpha * v^2  [element-wise squaring].
  void AddVec2(const Real alpha, const VectorBase<Real> &v);

  /// Add vector : *this = *this + alpha * v^2  [element-wise squaring],
  /// with casting between floats and doubles.
  template<typename OtherReal>
  void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);

  /// Add matrix times vector : this <-- beta*this + alpha*M*v.
  /// Calls BLAS GEMV.
  void AddMatVec(const Real alpha, const MatrixBase<Real> &M,
                 const MatrixTransposeType trans, const VectorBase<Real> &v,
                 const Real beta);  // **beta previously defaulted to 0.0**

  /// This is as AddMatVec, except optimized for where v contains a lot
  /// of zeros.
  void AddMatSvec(const Real alpha, const MatrixBase<Real> &M,
                  const MatrixTransposeType trans, const VectorBase<Real> &v,
                  const Real beta);  // **beta previously defaulted to 0.0**

  /// Add symmetric positive definite matrix times vector:
  ///  this <-- beta*this + alpha*M*v.   Calls BLAS SPMV.
  void AddSpVec(const Real alpha, const SpMatrix<Real> &M,
                const VectorBase<Real> &v,
                const Real beta);  // **beta previously defaulted to 0.0**

  /// Add triangular matrix times vector: this <-- beta*this + alpha*M*v.
  /// Works even if v == *this.
  void AddTpVec(const Real alpha, const TpMatrix<Real> &M,
                const MatrixTransposeType trans, const VectorBase<Real> &v,
                const Real beta);  // **beta previously defaulted to 0.0**

  /// Set each element to y = (x == orig ? changed : x).
  void ReplaceValue(Real orig, Real changed);

  /// Multiply element-by-element by another vector.
  void MulElements(const VectorBase<Real> &v);

  /// Multiply element-by-element by another vector of different type.
  template<typename OtherReal>
  void MulElements(const VectorBase<OtherReal> &v);

  /// Divide element-by-element by a vector.
  void DivElements(const VectorBase<Real> &v);

  /// Divide element-by-element by a vector of different type.
  template<typename OtherReal>
  void DivElements(const VectorBase<OtherReal> &v);

  /// Add a constant to each element of a vector.
  void Add(Real c);

  /// Add element-by-element product of vectors:
  /// this <-- alpha * v .* r + beta*this .
  void AddVecVec(Real alpha, const VectorBase<Real> &v,
                 const VectorBase<Real> &r, Real beta);

  /// Add element-by-element quotient of two vectors.
  /// this <---- alpha*v/r + beta*this
  void AddVecDivVec(Real alpha, const VectorBase<Real> &v,
                    const VectorBase<Real> &r, Real beta);

  /// Multiplies all elements by this constant.
  void Scale(Real alpha);

  /// Multiplies this vector by lower-triangular matrix:  *this <-- *this *M
  void MulTp(const TpMatrix<Real> &M, const MatrixTransposeType trans);

  /// If trans == kNoTrans, solves M x = b, where b is the value of *this at input
  /// and x is the value of *this at output.
  /// If trans == kTrans, solves M' x = b.
  /// Does not test for M being singular or near-singular, so test it before
  /// calling this routine.
  void Solve(const TpMatrix<Real> &M, const MatrixTransposeType trans);

  /// Performs a row stack of the matrix M
  void CopyRowsFromMat(const MatrixBase<Real> &M);
  /// As above, with type conversion.
  template<typename OtherReal>
  void CopyRowsFromMat(const MatrixBase<OtherReal> &M);

  /// The following is implemented in ../cudamatrix/cu-matrix.cc
  void CopyRowsFromMat(const CuMatrixBase<Real> &M);

  /// Performs a column stack of the matrix M
  void CopyColsFromMat(const MatrixBase<Real> &M);

  /// Extracts a row of the matrix M.  Could also do this with
  /// this->Copy(M[row]).
  void CopyRowFromMat(const MatrixBase<Real> &M, MatrixIndexT row);
  /// Extracts a row of the matrix M with type conversion.
  template<typename OtherReal>
  void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);

  /// Extracts a row of the symmetric matrix S.
  template<typename OtherReal>
  void CopyRowFromSp(const SpMatrix<OtherReal> &S, MatrixIndexT row);

  /// Extracts a column of the matrix M.
  template<typename OtherReal>
  void CopyColFromMat(const MatrixBase<OtherReal> &M , MatrixIndexT col);

  /// Extracts the diagonal of the matrix M.
  void CopyDiagFromMat(const MatrixBase<Real> &M);

  /// Extracts the diagonal of a packed matrix M; works for Sp or Tp.
  void CopyDiagFromPacked(const PackedMatrix<Real> &M);

  /// Extracts the diagonal of a symmetric matrix.
  inline void CopyDiagFromSp(const SpMatrix<Real> &M) { CopyDiagFromPacked(M); }

  /// Extracts the diagonal of a triangular matrix.
  inline void CopyDiagFromTp(const TpMatrix<Real> &M) { CopyDiagFromPacked(M); }

  /// Returns the maximum value of any element, or -infinity for the empty vector.
  Real Max() const;

  /// Returns the maximum value of any element, and the associated index.
  /// Error if vector is empty.
  Real Max(MatrixIndexT *index) const;

  /// Returns the minimum value of any element, or +infinity for the empty vector.
  Real Min() const;

  /// Returns the minimum value of any element, and the associated index.
  /// Error if vector is empty.
  Real Min(MatrixIndexT *index) const;

  /// Returns sum of the elements
  Real Sum() const;

  /// Returns sum of the logs of the elements.  More efficient than
  /// just taking log of each.  Will return NaN if any elements are
  /// negative.
  Real SumLog() const;

  /// Does *this = alpha * (sum of rows of M) + beta * *this.
  void AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);

  /// Does *this = alpha * (sum of columns of M) + beta * *this.
  void AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);

  /// Add the diagonal of a matrix times itself:
  /// *this = alpha * diag(M M^T) + beta * *this (if trans == kNoTrans), or
  /// *this = alpha * diag(M^T M) + beta * *this (if trans == kTrans).
  /// NOTE(review): the alpha factor is inferred from the signature; the
  /// previous comment omitted it -- confirm against the definition.
  void AddDiagMat2(Real alpha, const MatrixBase<Real> &M,
                   MatrixTransposeType trans = kNoTrans, Real beta = 1.0);

  /// Add the diagonal of a matrix product: *this = diag(M N), assuming the
  /// "trans" arguments are both kNoTrans; for transpose arguments, it behaves
  /// as you would expect.
  /// NOTE(review): presumably scaled by alpha and added to beta * *this, as in
  /// AddDiagMat2 -- confirm against the definition.
  void AddDiagMatMat(Real alpha, const MatrixBase<Real> &M,
                     MatrixTransposeType transM,
                     const MatrixBase<Real> &N, MatrixTransposeType transN,
                     Real beta = 1.0);

  /// Returns log(sum(exp())) without exp overflow
  /// If prune > 0.0, ignores terms less than the max - prune.
  /// [Note: in future, if prune = 0.0, it will take the max.
  /// For now, use -1 if you don't want it to prune.]
  Real LogSumExp(Real prune = -1.0) const;

  /// Reads from C++ stream (option to add to existing contents).
  /// Throws exception on failure
  void Read(std::istream &in, bool binary, bool add = false);

  /// Writes to C++ stream (option to write in binary).
  void Write(std::ostream &Out, bool binary) const;

  friend class VectorBase<double>;
  friend class VectorBase<float>;
  friend class CuVectorBase<Real>;
  friend class CuVector<Real>;

 protected:
  /// Destructor;  does not deallocate memory, this is handled by child classes.
  /// This destructor is protected so this object can only be
  /// deleted via a child.
  ~VectorBase() {}

  /// Empty initializer, corresponds to vector of zero size.
  explicit VectorBase(): data_(nullptr), dim_(0) {
    KALDI_ASSERT_IS_FLOATING_TYPE(Real);
  }

  // Took this out since it is not currently used, and it is possible to create
  // objects where the allocated memory is not the same size as dim_ : Arnab
  //  /// Initializer from a pointer and a size; keeps the pointer internally
  //  /// (ownership or non-ownership depends on the child class).
  //  explicit VectorBase(Real* data, MatrixIndexT dim)
  //      : data_(data), dim_(dim) {}

  // Arnab : made this protected since it is unsafe too.
  /// Load data into the vector: sz must match own size.
  void CopyFromPtr(const Real* Data, MatrixIndexT sz);

  /// data memory area
  Real* data_;
  /// dimension of vector
  MatrixIndexT dim_;
  KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
};  // class VectorBase
/** @brief An owning vector.
 *
 *  Adds storage management on top of VectorBase: the constructors size the
 *  vector through Resize() and the destructor releases it through Destroy(). */
template<typename Real>
class Vector: public VectorBase<Real> {
 public:
  /// Default constructor: an empty vector.
  Vector()
      : VectorBase<Real>() {}

  /// Sized constructor.  With the default resize_type (kSetZero) the contents
  /// are zeroed; see Resize() for the other options.
  explicit Vector(const MatrixIndexT s,
                  MatrixResizeType resize_type = kSetZero)
      : VectorBase<Real>() {
    this->Resize(s, resize_type);
  }

  /// Construct from a CUDA vector.
  /// This is defined in ../cudamatrix/cu-vector.h
  template<typename OtherReal>
  explicit Vector(const CuVectorBase<OtherReal> &cu);

  /// Copy constructor (deliberately not explicit).  The need for this is
  /// controversial.
  Vector(const Vector<Real> &v)
      : VectorBase<Real>() {
    this->Resize(v.Dim(), kUndefined);  // contents are overwritten just below
    this->CopyFromVec(v);
  }

  /// Construct from the base class; this is what allows copying a SubVector.
  explicit Vector(const VectorBase<Real> &v)
      : VectorBase<Real>() {
    this->Resize(v.Dim(), kUndefined);
    this->CopyFromVec(v);
  }

  /// Construct from a vector with a different element type.
  template<typename OtherReal>
  explicit Vector(const VectorBase<OtherReal> &v)
      : VectorBase<Real>() {
    this->Resize(v.Dim(), kUndefined);
    this->CopyFromVec(v);
  }

  // Took this out since it is unsafe : Arnab
  //  /// Constructor from a pointer and a size; copies the data to a location
  //  /// it owns.
  //  Vector(const Real* Data, const MatrixIndexT s): VectorBase<Real>() {
  //    Resize(s);
  //    CopyFromPtr(Data, s);
  //  }

  /// Shallow swap of the contents of *this and *other.
  void Swap(Vector<Real> *other);

  /// Destructor; releases the memory via Destroy().
  ~Vector() {
    this->Destroy();
  }

  /// Read from a C++ stream; "add" selects adding to the existing contents.
  void Read(std::istream &in, bool binary, bool add = false);

  /// Set vector to a specified size (can be zero).
  /// The value of the new data depends on resize_type:
  ///   - kSetZero:   the new data will be zero
  ///   - kUndefined: the new data will be undefined
  ///   - kCopyData:  the new data will be the same as the old data in any
  ///                 shared positions, and zero elsewhere.
  /// This function takes time proportional to the number of data elements.
  void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero);

  /// Remove one element and shifts later elements down.
  void RemoveElement(MatrixIndexT i);

  /// Copy assignment: resize to match "other", then copy its elements.
  Vector<Real> &operator = (const Vector<Real> &other) {
    this->Resize(other.Dim(), kUndefined);
    this->CopyFromVec(other);
    return *this;
  }

  /// Assignment from any VectorBase (e.g. a SubVector).
  Vector<Real> &operator = (const VectorBase<Real> &other) {
    this->Resize(other.Dim(), kUndefined);
    this->CopyFromVec(other);
    return *this;
  }

 private:
  /// Init assumes the current contents of the class are invalid (i.e. junk or
  /// has already been freed), and it sets the vector to newly allocated memory
  /// with the specified dimension.  dim == 0 is acceptable.  The memory contents
  /// pointed to by data_ will be undefined.
  void Init(const MatrixIndexT dim);

  /// Destroy function, called internally.
  void Destroy();
};
/// Represents a non-allocating general vector which can be defined
/// as a sub-vector of higher-level vector [or as the row of a matrix].
/// A SubVector only aliases memory owned elsewhere: it never allocates and
/// never frees.
template<typename Real>
class SubVector : public VectorBase<Real> {
 public:
  /// Constructor from a Vector or SubVector.
  /// SubVectors are not const-safe and it's very hard to make them
  /// so for now we just give up.  This function contains const_cast.
  /// @param t       vector whose data is aliased
  /// @param origin  index of the first aliased element; 0 <= origin <= t.Dim()
  /// @param length  number of aliased elements; 0 <= length <= t.Dim()-origin
  SubVector(const VectorBase<Real> &t, const MatrixIndexT origin,
            const MatrixIndexT length) : VectorBase<Real>() {
    // Check origin >= 0, length >= 0 and origin+length <= t.Dim() without
    // forming the sum.  The previous form added the two values after casting
    // them to UnsignedMatrixIndexT; that sum can wrap around (e.g. origin ==
    // -1, length == 1 gives 0 when the unsigned type has the same width) and
    // then accepts an out-of-range origin.
    KALDI_ASSERT(origin >= 0 && length >= 0 &&
                 origin <= t.Dim() && length <= t.Dim() - origin);
    VectorBase<Real>::data_ = const_cast<Real*> (t.Data()+origin);
    VectorBase<Real>::dim_   = length;
  }

  /// This constructor initializes the vector to point at the contents
  /// of this packed matrix (SpMatrix or TpMatrix).
  SubVector(const PackedMatrix<Real> &M) {
    VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
    // Number of stored elements of a packed n x n matrix: n(n+1)/2.
    VectorBase<Real>::dim_   = (M.NumRows()*(M.NumRows()+1))/2;
  }

  /// Copy constructor
  SubVector(const SubVector &other) : VectorBase<Real> () {
    // this copy constructor needed for Range() to work in base class.
    VectorBase<Real>::data_ = other.data_;
    VectorBase<Real>::dim_ = other.dim_;
  }

  /// Constructor from a pointer to memory and a length.  Keeps a pointer
  /// to the data but does not take ownership (will never delete).
  /// Caution: this constructor enables you to evade const constraints.
  SubVector(const Real *data, MatrixIndexT length) : VectorBase<Real> () {
    VectorBase<Real>::data_ = const_cast<Real*>(data);
    VectorBase<Real>::dim_   = length;
  }

  /// Aliases row "row" of "matrix".
  /// This operation does not preserve const-ness, so be careful.
  SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row) {
    VectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
    VectorBase<Real>::dim_   = matrix.NumCols();
  }

  ~SubVector() {}  ///< Destructor (does nothing; no pointers are owned here).

 private:
  /// Disallow assignment operator.  Deleted rather than given an empty body:
  /// a non-void function that falls off its end has undefined behaviour if
  /// it is ever called.
  SubVector & operator = (const SubVector &other) = delete;
};
/// @} end of "addtogroup matrix_group"
/// \addtogroup matrix_funcs_io
/// @{
/// Output to a C++ stream. Non-binary by default (use Write for
/// binary output).
/// Declaration only; returns a reference to a std::ostream.
template<typename Real>
std::ostream & operator << (std::ostream & out, const VectorBase<Real> & v);
/// Input from a C++ stream. Will automatically read text or
/// binary data from the stream.
/// This overload takes a VectorBase.
template<typename Real>
std::istream & operator >> (std::istream & in, VectorBase<Real> & v);
/// Input from a C++ stream. Will automatically read text or
/// binary data from the stream.
/// This overload takes a Vector.
/// NOTE(review): presumably it may resize v, unlike the VectorBase overload
/// above -- confirm against the definition.
template<typename Real>
std::istream & operator >> (std::istream & in, Vector<Real> & v);
/// @} end of \addtogroup matrix_funcs_io
/// \addtogroup matrix_funcs_scalar
/// @{
/// Free-function form of VectorBase::ApproxEqual: forwards to
/// a.ApproxEqual(b, tol) and returns its result.
template<typename Real>
bool ApproxEqual(const VectorBase<Real> &a,
                 const VectorBase<Real> &b, Real tol = 0.01) {
  const bool is_close = a.ApproxEqual(b, tol);
  return is_close;
}
/// Asserts (via KALDI_ASSERT) that a.ApproxEqual(b, tol) holds.
template<typename Real>
inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
                        float tol = 0.01) {
  const bool is_close = a.ApproxEqual(b, tol);
  KALDI_ASSERT(is_close);
}
/// Returns dot product between v1 and v2.
template<typename Real>
Real VecVec(const VectorBase<Real> &v1, const VectorBase<Real> &v2);
/// Dot product between v1 and v2 where the two vectors have different
/// element types; the result has the element type of v1.
template<typename Real, typename OtherReal>
Real VecVec(const VectorBase<Real> &v1, const VectorBase<OtherReal> &v2);
/// Returns \f$ v_1^T M v_2 \f$ .
/// Not as efficient as it could be where v1 == v2.
template<typename Real>
Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
const VectorBase<Real> &v2);
/// @} End of "addtogroup matrix_funcs_scalar"
} // namespace kaldi
// we need to include the implementation
#include "matrix/kaldi-vector-inl.h"
#endif // KALDI_MATRIX_KALDI_VECTOR_H_

@ -1,56 +0,0 @@
// matrix/matrix-functions-inl.h
// Copyright 2009-2011 Microsoft Corporation
//
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// (*) incorporates, with permission, FFT code from his book
// "Signal Processing with Lapped Transforms", Artech, 1992.
#ifndef KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
#define KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
namespace kaldi {
//! In-place complex multiplication: (b_re + i*b_im) *= (a_re + i*a_im).
//! Both components of the product are formed from the incoming values
//! before either output is written.
template<typename Real>
inline void ComplexMul(const Real &a_re, const Real &a_im,
                       Real *b_re, Real *b_im) {
  const Real prod_re = (*b_re * a_re) - (*b_im * a_im);
  const Real prod_im = *b_re * a_im + *b_im * a_re;
  *b_im = prod_im;
  *b_re = prod_re;
}
//! Complex multiply-accumulate: (c_re + i*c_im) += (a_re + i*a_im) * (b_re + i*b_im).
//! The inputs are taken by reference, so both components of the product are
//! computed before either output is written; this keeps the result correct
//! even when an input refers to the same object as *c_re or *c_im.
template<typename Real>
inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
                              const Real &b_re, const Real &b_im,
                              Real *c_re, Real *c_im) {
  const Real prod_re = b_re*a_re - b_im*a_im;
  const Real prod_im = b_re*a_im + b_im*a_re;
  *c_re += prod_re;
  *c_im += prod_im;
}
//! Writes exp(i*x) to the outputs: *a_re = cos(x), *a_im = sin(x).
template<typename Real>
inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
  const Real cos_x = std::cos(x);
  const Real sin_x = std::sin(x);
  *a_re = cos_x;
  *a_im = sin_x;
}
} // end namespace kaldi
#endif // KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_

@ -1,773 +0,0 @@
// matrix/matrix-functions.cc
// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.; Jan Silovsky
// Yanmin Qian; Saarland University; Johns Hopkins University (Author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// (*) incorporates, with permission, FFT code from his book
// "Signal Processing with Lapped Transforms", Artech, 1992.
#include "matrix/matrix-functions.h"
#include "matrix/sp-matrix.h"
namespace kaldi {
//! Direct (O(N^2)) complex discrete Fourier transform.
//! "in" and "out" hold N complex numbers each, stored as interleaved
//! (real, imaginary) pairs, so both must have the same, even, dimension 2N.
//! forward == true uses exp(-2 pi i m n / N); forward == false uses
//! exp(+2 pi i m n / N).  No 1/N scaling is applied in either direction.
template<typename Real>
void ComplexFt(const VectorBase<Real> &in,
               VectorBase<Real> *out, bool forward) {
  KALDI_ASSERT(out != NULL);
  KALDI_ASSERT(in.Dim() == out->Dim());
  KALDI_ASSERT(in.Dim() % 2 == 0);

  const int sign = (forward ? -1 : 1);
  const int num_points = in.Dim() / 2;
  const Real *src = in.Data();
  Real *dst = out->Data();

  // Angle between consecutive roots of unity:
  // forward -> -2pi/N, backward -> +2pi/N.
  const Real angle_step = sign * M_2PI / static_cast<Real>(num_points);
  Real step_re, step_im;  // exp(i * angle_step)
  ComplexImExp(angle_step, &step_re, &step_im);

  // exp(i * angle_step * m) for the output index m currently being computed.
  Real row_re = 1.0, row_im = 0.0;
  for (int m = 0; m < num_points; m++) {  // one pass per output component
    Real twiddle_re = 1.0, twiddle_im = 0.0;  // exp(i * angle_step * m * n)
    Real acc_re = 0.0, acc_im = 0.0;          // running sum for output m
    for (int n = 0; n < num_points; n++) {
      ComplexAddProduct(src[2 * n], src[2 * n + 1],
                        twiddle_re, twiddle_im,
                        &acc_re, &acc_im);
      ComplexMul(row_re, row_im, &twiddle_re, &twiddle_im);
    }
    dst[2 * m] = acc_re;
    dst[2 * m + 1] = acc_im;

    if ((2 * m) % 10 == 0) {
      // Every fifth output, recompute the per-row factor for index m+1 from
      // scratch instead of by repeated multiplication, to limit the
      // accumulation of rounding error.
      const int next_m = 1 + m;
      Real next_angle = angle_step * next_m;
      ComplexImExp(next_angle, &row_re, &row_im);
    } else {
      ComplexMul(step_re, step_im, &row_re, &row_im);
    }
  }
}

// Explicit instantiations for float and double.
template
void ComplexFt(const VectorBase<float> &in,
               VectorBase<float> *out, bool forward);
template
void ComplexFt(const VectorBase<double> &in,
               VectorBase<double> *out, bool forward);
#define KALDI_COMPLEXFFT_BLOCKSIZE 8192
// This #define affects how we recurse in ComplexFftRecursive.
// We assume that memory-caching happens on a scale at
// least as small as this.  It is compared below against a size in bytes
// (N * 2 * sizeof(Real) per FFT, times the number of FFTs).
//! ComplexFftRecursive is a recursive function that computes the
//! complex FFT of size N. The "nffts" arguments specifies how many
//! separate FFTs to compute in parallel (we assume the data for
//! each one is consecutive in memory). The "forward argument"
//! specifies whether to do the FFT (true) or IFFT (false), although
//! note that we do not include the factor of 1/N (the user should
//! do this if required). The iterators factor_begin and factor_end
//! point to the beginning and end (i.e. one past the last element)
//! of an array of small factors of N (typically prime factors).
//! See the comments below this code for the detailed equations
//! of the recursion.
//!
//! "data" is transformed in place; it holds nffts blocks of N complex
//! numbers, each block being 2*N Reals with real and imaginary parts
//! interleaved.  "tmp_vec" is scratch space: it is resized here when it is
//! smaller than N (for the rearrangement) or smaller than 2*P (for the
//! multiplication step), and its previous contents are not meaningful
//! afterwards.
template<typename Real>
void ComplexFftRecursive (Real *data, int nffts, int N,
const int *factor_begin,
const int *factor_end, bool forward,
Vector<Real> *tmp_vec) {
// No factors left: the only valid size is N == 1, whose DFT is the identity.
if (factor_begin == factor_end) {
KALDI_ASSERT(N == 1);
return;
}
{ // an optimization: compute in smaller blocks.
// this block of code could be removed and it would still work.
// Size in bytes of the data for one FFT.
MatrixIndexT size_perblock = N * 2 * sizeof(Real);
if (nffts > 1 && size_perblock*nffts > KALDI_COMPLEXFFT_BLOCKSIZE) { // can break it up...
// Break up into multiple blocks. This is an optimization. We make
// no progress on the FFT when we do this.
int block_skip = KALDI_COMPLEXFFT_BLOCKSIZE / size_perblock; // n blocks per call
if (block_skip == 0) block_skip = 1;
if (block_skip < nffts) {
int blocks_left = nffts;
while (blocks_left > 0) {
int skip_now = std::min(blocks_left, block_skip);
// Same N and same factor list: this call only narrows the set of FFTs.
ComplexFftRecursive(data, skip_now, N, factor_begin, factor_end, forward, tmp_vec);
blocks_left -= skip_now;
data += skip_now * N*2;
}
return;
} // else do the actual algorithm.
} // else do the actual algorithm.
}
// Split N = P * Q, where P is the next factor in the list.
int P = *factor_begin;
KALDI_ASSERT(P > 1);
int Q = N / P;
if (P > 1 && Q > 1) { // Do the rearrangement. C.f. eq. (8) below. Transform
// (a) to (b).
Real *data_thisblock = data;
if (tmp_vec->Dim() < (MatrixIndexT)N) tmp_vec->Resize(N);
Real *data_tmp = tmp_vec->Data();
for (int thisfft = 0; thisfft < nffts; thisfft++, data_thisblock+=N*2) {
// Real and imaginary parts are permuted separately so that the scratch
// buffer only needs N elements.
for (int offset = 0; offset < 2; offset++) { // 0 == real, 1 == im.
for (int p = 0; p < P; p++) {
for (int q = 0; q < Q; q++) {
int aidx = q*P + p, bidx = p*Q + q;
data_tmp[bidx] = data_thisblock[2*aidx+offset];
}
}
for (int n = 0;n < P*Q;n++) data_thisblock[2*n+offset] = data_tmp[n];
}
}
}
{ // Recurse.
// Each of the nffts blocks is now treated as P consecutive FFTs of size Q.
ComplexFftRecursive(data, nffts*P, Q, factor_begin+1, factor_end, forward, tmp_vec);
}
int exp_sign = (forward ? -1 : 1);
Real rootN_re, rootN_im; // Nth root of unity.
ComplexImExp(static_cast<Real>(exp_sign * M_2PI / N), &rootN_re, &rootN_im);
Real rootP_re, rootP_im; // Pth root of unity.
ComplexImExp(static_cast<Real>(exp_sign * M_2PI / P), &rootP_re, &rootP_im);
{ // Do the multiplication
// could avoid a bunch of complex multiplies by moving the loop over data_thisblock
// inside.
// temp_a holds the P complex outputs d_(p'Q+q') for the current q'
// (eq. (12) below); it reuses the scratch vector from the rearrangement.
if (tmp_vec->Dim() < (MatrixIndexT)(P*2)) tmp_vec->Resize(P*2);
Real *temp_a = tmp_vec->Data();
Real *data_thisblock = data, *data_end = data+(N*2*nffts);
for (; data_thisblock != data_end; data_thisblock += N*2) { // for each separate fft.
Real qd_re = 1.0, qd_im = 0.0; // 1^(q'/N)
for (int qd = 0; qd < Q; qd++) {
Real pdQ_qd_re = qd_re, pdQ_qd_im = qd_im; // 1^((p'Q+q') / N) == 1^((p'/P) + (q'/N))
// Initialize to q'/N, corresponding to p' == 0.
for (int pd = 0; pd < P; pd++) { // pd == p'
{ // This is the p = 0 case of the loop below [an optimization].
temp_a[pd*2] = data_thisblock[qd*2];
temp_a[pd*2 + 1] = data_thisblock[qd*2 + 1];
}
{ // This is the p = 1 case of the loop below [an optimization]
// **** MOST OF THE TIME (>60% I think) gets spent here. ***
ComplexAddProduct(pdQ_qd_re, pdQ_qd_im,
data_thisblock[(qd+Q)*2], data_thisblock[(qd+Q)*2 + 1],
&(temp_a[pd*2]), &(temp_a[pd*2 + 1]));
}
if (P > 2) {
Real p_pdQ_qd_re = pdQ_qd_re, p_pdQ_qd_im = pdQ_qd_im; // 1^(p(p'Q+q')/N)
for (int p = 2; p < P; p++) {
ComplexMul(pdQ_qd_re, pdQ_qd_im, &p_pdQ_qd_re, &p_pdQ_qd_im); // p_pdQ_qd *= pdQ_qd.
int data_idx = p*Q + qd;
ComplexAddProduct(p_pdQ_qd_re, p_pdQ_qd_im,
data_thisblock[data_idx*2], data_thisblock[data_idx*2 + 1],
&(temp_a[pd*2]), &(temp_a[pd*2 + 1]));
}
}
if (pd != P-1)
ComplexMul(rootP_re, rootP_im, &pdQ_qd_re, &pdQ_qd_im); // pdQ_qd *= (rootP == 1^{1/P})
// (using 1/P == Q/N)
}
// All P inputs for this q' have been consumed, so it is now safe to
// overwrite them with the outputs.
for (int pd = 0; pd < P; pd++) {
data_thisblock[(pd*Q + qd)*2] = temp_a[pd*2];
data_thisblock[(pd*Q + qd)*2 + 1] = temp_a[pd*2 + 1];
}
ComplexMul(rootN_re, rootN_im, &qd_re, &qd_im); // qd *= rootN.
}
}
}
}
/* Equations for ComplexFftRecursive.
We consider here one of the "nffts" separate ffts; it's just a question of
doing them all in parallel. We also write all equations in terms of
complex math (the conversion to real arithmetic is not hard, and anyway
takes place inside function calls).
Let the input (i.e. "data" at start) be a_n, n = 0..N-1, and
the output (Fourier transform) be d_k, k = 0..N-1. We use these letters because
there will be two intermediate variables b and c.
We want to compute:
d_k = \sum_n a_n 1^(kn/N) (1)
where we use 1^x as shorthand for exp(-2 pi i x) for the forward algorithm
and exp(2 pi i x) for the backward one.
We factorize N = P Q (P small, Q usually large).
With p = 0..P-1 and q = 0..Q-1, and also p'=0..P-1 and q'=0..Q-1, we let:
k == p'Q + q' (2)
n == qP + p (3)
That is, we let p, q, p', q' range over these indices and observe that this way we
can cover all n, k. Expanding (1) using (2) and (3), we can write:
d_k = \sum_{p, q} a_n 1^((p'Q+q')(qP+p)/N)
= \sum_{p, q} a_n 1^(p'pQ/N) 1^(q'qP/N) 1^(q'p/N) (4)
using 1^(PQ/N) = 1 to get rid of the terms with PQ in them. Rearranging (4),
d_k = \sum_p 1^(p'pQ/N) 1^(q'p/N) \sum_q 1^(q'qP/N) a_n (5)
The point here is to separate the index q. Now we can expand out the remaining
instances of k and n using (2) and (3):
d_(p'Q+q') = \sum_p 1^(p'pQ/N) 1^(q'p/N) \sum_q 1^(q'qP/N) a_(qP+p) (6)
The expression \sum_q varies with the indices p and q'. Let us define
C_{p, q'} = \sum_q 1^(q'qP/N) a_(qP+p) (7)
Here, C_{p, q'}, viewed as a sequence in q', is just the DFT of the points
a_(qP+p) for q = 0..Q-1. These points are not consecutive in memory though,
they jump by P each time. Let us define b as a rearranged version of a,
so that
b_(pQ+q) = a_(qP+p) (8)
We do not do this rearrangement in place: the code copies through a
temporary array (see the pseudocode further below).
We can rearrange (7) to be written in terms of the b's, using (8), so that
C_{p, q'} = \sum_q 1^(q'q (P/N)) b_(pQ+q) (9)
Here, the sequence of C_{p, q'} over q'=0..Q-1, is just the DFT of the sequence
of b_(pQ) .. b_(p(Q+1)-1). Let's arrange the C_{p, q'} in a single array in
memory in the same way as the b's, i.e. we define
c_(pQ+q') == C_{p, q'}. (10)
Note that we could have written (10) with q in place of q', as there is only
one index of type q present, but q' is just a more natural variable name to use
since we use q' elsewhere to subscript c and C.
Rewriting (9), we have:
c_(pQ+q') = \sum_q 1^(q'q (P/N)) b_(pQ+q) (11)
which is the DFT computed by the recursive call to this function [after computing
the b's by rearranging the a's]. From the c's we want to compute the d's.
Taking (6), substituting in the sum (7), and using (10) to write it as an array,
we have:
d_(p'Q+q') = \sum_p 1^(p'pQ/N) 1^(q'p/N) c_(pQ+q') (12)
This sum is independent for different values of q'. Note that d overwrites c
in memory. We compute this in a direct way, using a little array of size P to
store the computed d values for one value of q' (we reuse the array for each value
of q').
So the overall picture is this:
We get a call to compute DFT on size N.
- If N == 1 we return (nothing to do).
- We factor N = P Q (typically, P is small).
- Using (8), we rearrange the data in memory so that we have b not a in memory
(this is the block "do the rearrangement").
The pseudocode for this is as follows. For simplicity we use a temporary array.
for p = 0..P-1
for q = 0..Q-1
bidx = pQ + q
aidx = qP + p
tmp[bidx] = data[aidx].
end
end
data <-- tmp
else
endif
The reason this accomplishes (8) is that we want pQ+q and qP+p to be swapped
over for each p, q, and the "if m > n" is a convenient way of ensuring that
this swapping happens only once (otherwise it would happen twice, since pQ+q
and qP+p both range over the entire set of numbers 0..N-1).
- We do the DFT on the smaller block size to compute c from b (this is eq. (11)).
Note that this is actually multiple DFTs, one for each value of p, but this
goes to the "nffts" argument of the function call, which we have ignored up to now.
-We compute eq. (12) via a loop, as follows
allocate temporary array e of size P.
For q' = 0..Q-1:
for p' = 0..P-1:
set sum to zero [this will go in e[p']]
for p = 0..P-1:
sum += 1^(p'pQ/N) 1^(q'p/N) c_(pQ+q')
end
e[p'] = sum
end
for p' = 0..P-1:
d_(p'Q+q') = e[p']
end
end
delete temporary array e
*/
// Entry point for the fast complex FFT: factorizes the number of complex
// points and hands the work to ComplexFftRecursive.
//   v        vector of interleaved (real, imaginary) pairs, transformed in
//            place; vectors of dimension <= 1 are returned untouched.
//   forward  true for the forward transform, false for the backward one.
//   tmp_in   optional scratch vector; when NULL a local one is used instead.
template<typename Real>
void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
  KALDI_ASSERT(v != NULL);
  if (v->Dim() <= 1) return;
  KALDI_ASSERT(v->Dim() % 2 == 0);  // must hold whole complex numbers.

  int num_points = v->Dim() / 2;
  std::vector<int> factors;
  Factorize(num_points, &factors);

  // Pointer range over the factor list; stays NULL when the list is empty.
  int *first_factor = NULL;
  if (!factors.empty())
    first_factor = &(factors[0]);
  int *past_last_factor = first_factor + factors.size();

  Vector<Real> local_scratch;  // sized on demand inside ComplexFftRecursive.
  Vector<Real> *scratch = (tmp_in ? tmp_in : &local_scratch);
  ComplexFftRecursive(v->Data(), 1, num_points,
                      first_factor, past_last_factor, forward, scratch);
}
//! Slow reference version of the real FFT, intended for testing.
//! It expands the data to a full complex sequence of N points, runs
//! ComplexFft on it, and packs the result back into *v.
//! In the packed spectrum, element 0 holds F_0, element 1 holds F_{N/2}
//! (both real), and elements (2i, 2i+1) hold the real and imaginary parts
//! of F_i for 0 < i < N/2.
//! NOTE(review): the expansion relies on the temporary Vector starting out
//! zeroed -- presumably the constructor's default; confirm.
template<typename Real>
void RealFftInefficient(VectorBase<Real> *v, bool forward) {
  KALDI_ASSERT(v != NULL);
  const MatrixIndexT dim = v->Dim();
  KALDI_ASSERT(dim % 2 == 0);
  if (dim == 0) return;

  Vector<Real> as_complex(dim * 2);  // dim complex numbers, interleaved.

  if (!forward) {
    // Rebuild the full complex spectrum from the packed half-spectrum.
    as_complex(0) = (*v)(0);    // F_0, which is real.
    as_complex(dim) = (*v)(1);  // F_{N/2}, which is real.
    for (MatrixIndexT i = 1; i < dim / 2; i++) {
      const Real part_re = (*v)(2 * i);
      const Real part_im = (*v)(2 * i + 1);
      // Component i as given ...
      as_complex(2 * i) = part_re;
      as_complex(2 * i + 1) = part_im;
      // ... and component N-i as its complex conjugate.
      as_complex(2 * (dim - i)) = part_re;
      as_complex(2 * (dim - i) + 1) = -part_im;
    }
    ComplexFft(&as_complex, forward);  // backward, since forward == false.
    // Keep the real parts; the imaginary parts should be zero.
    for (MatrixIndexT i = 0; i < dim; i++)
      (*v)(i) = as_complex(i * 2);
    return;
  }

  // Forward: real input goes into the real parts only.
  for (MatrixIndexT i = 0; i < dim; i++)
    as_complex(i * 2) = (*v)(i);
  ComplexFft(&as_complex, forward);  // already tested, so usable as reference.
  v->CopyFromVec(as_complex.Range(0, dim));
  // The N/2'th Fourier component is real; store it in the slot that would
  // otherwise hold the (zero) imaginary part of component 0.
  (*v)(1) = as_complex(dim);
}

// Explicit instantiations for float and double.
template void RealFftInefficient(VectorBase<float> *v, bool forward);
template void RealFftInefficient(VectorBase<double> *v, bool forward);
// Explicit instantiations of ComplexFft for float and double.
template
void ComplexFft(VectorBase<float> *v, bool forward, Vector<float> *tmp_in);
template
void ComplexFft(VectorBase<double> *v, bool forward, Vector<double> *tmp_in);
// See the long comment below for the math behind this.
// In-place FFT of real data of even dimension N, done with one complex FFT
// of size N/2 plus a pass that combines each index k with N/2 - k.
// forward == true: *v holds N real samples on entry and the packed
// half-spectrum on exit (element 0 = A_0, element 1 = A_{N/2}, then
// interleaved real/imaginary parts of A_k).  forward == false reverses this,
// without dividing by N.
template<typename Real> void RealFft (VectorBase<Real> *v, bool forward) {
KALDI_ASSERT(v != NULL);
MatrixIndexT N = v->Dim(), N2 = N/2;
KALDI_ASSERT(N%2 == 0);
if (N == 0) return;
// Forward: the complex FFT comes first; backward: it comes last (see below).
if (forward) ComplexFft(v, true);
Real *data = v->Data();
Real rootN_re, rootN_im; // exp(-2pi i/N), forward; exp(2pi i/N), backward
int forward_sign = forward ? -1 : 1;
ComplexImExp(static_cast<Real>(M_2PI/N *forward_sign), &rootN_re, &rootN_im);
Real kN_re = -forward_sign, kN_im = 0.0; // exp(-2pi i k/N), forward; -exp(2pi i k/N), backward
// kN starts out as 1.0 for forward algorithm but -1.0 for backward.
for (MatrixIndexT k = 1; 2*k <= N2; k++) {
ComplexMul(rootN_re, rootN_im, &kN_re, &kN_im);
Real Ck_re, Ck_im, Dk_re, Dk_im;
// C_k = 1/2 (B_k + B_{N/2 - k}^*) :
Ck_re = 0.5 * (data[2*k] + data[N - 2*k]);
Ck_im = 0.5 * (data[2*k + 1] - data[N - 2*k + 1]);
// re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})):
Dk_re = 0.5 * (data[2*k + 1] + data[N - 2*k + 1]);
// im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k}))
Dk_im =-0.5 * (data[2*k] - data[N - 2*k]);
// A_k = C_k + 1^(k/N) D_k:
data[2*k] = Ck_re; // A_k <-- C_k
data[2*k+1] = Ck_im;
// now A_k += D_k 1^(k/N)
ComplexAddProduct(Dk_re, Dk_im, kN_re, kN_im, &(data[2*k]), &(data[2*k+1]));
MatrixIndexT kdash = N2 - k;
if (kdash != k) {
// Next we handle the index k' = N/2 - k. This is necessary
// to do now, to avoid invalidating data that we will later need.
// The quantities C_{k'} and D_{k'} are just the conjugates of C_k
// and D_k, so the equations are simple modifications of the above,
// replacing Ck_im and Dk_im with their negatives.
data[2*kdash] = Ck_re; // A_k' <-- C_k'
data[2*kdash+1] = -Ck_im;
// now A_k' += D_k' 1^(k'/N)
// We use 1^(k'/N) = 1^((N/2 - k) / N) = 1^(1/2) 1^(-k/N) = -1 * (1^(k/N))^*
// so it's the same as 1^(k/N) but with the real part negated.
ComplexAddProduct(Dk_re, -Dk_im, -kN_re, kN_im, &(data[2*kdash]), &(data[2*kdash+1]));
}
}
{ // Now handle k = 0.
// In simple terms: after the complex fft, data[0] becomes the sum of real
// parts input[0], input[2]... and data[1] becomes the sum of imaginary
// parts input[1], input[3]...
// "zeroth" [A_0] is just the sum of input[0]+input[1]+input[2]..
// and "n2th" [A_{N/2}] is input[0]-input[1]+input[2]... .
Real zeroth = data[0] + data[1],
n2th = data[0] - data[1];
data[0] = zeroth;
data[1] = n2th;
if (!forward) {
data[0] /= 2;
data[1] /= 2;
}
}
if (!forward) {
ComplexFft(v, false);
v->Scale(2.0); // This is so we get a factor of N increase, rather than N/2 which we would
// otherwise get from [ComplexFft, forward] + [ComplexFft, backward] in dimension N/2.
// It's for consistency with our normal FFT conventions.
}
}
// Explicit instantiations for float and double.
template void RealFft (VectorBase<float> *v, bool forward);
template void RealFft (VectorBase<double> *v, bool forward);
/* Notes for real FFTs.
We are using the same convention as above, 1^x to mean exp(-2\pi x) for the forward transform.
Actually, in a slight abuse of notation, we use this meaning for 1^x in both the forward and
backward cases because it's more convenient in this section.
Suppose we have real data a[0...N-1], with N even, and want to compute its Fourier transform.
We can make do with the first N/2 points of the transform, since the remaining ones are complex
conjugates of the first. We want to compute:
for k = 0...N/2-1,
A_k = \sum_{n = 0}^{N-1} a_n 1^(kn/N) (1)
We treat a[0..N-1] as a complex sequence of length N/2, i.e. a sequence b[0..N/2 - 1].
Viewed as sequences of length N/2, we have:
b = c + i d,
where c = a_0, a_2 ... and d = a_1, a_3 ...
We can recover the length-N/2 Fourier transforms of c and d by doing FT on b and
then doing the equations below. Derivation is marked by (*) in a comment below (search
for it). Let B, C, D be the FTs.
We have
C_k = 1/2 (B_k + B_{N/2 - k}^*) (z0)
D_k =-1/2i (B_k - B_{N/2 - k}^*) (z1)
so: re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})) (z2)
im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k})) (z3)
To recover the FT A from C and D, we write, rearranging (1):
A_k = \sum_{n = 0, 2, ..., N-2} a_n 1^(kn/N)
+\sum_{n = 1, 3, ..., N-1} a_n 1^(kn/N)
= \sum_{n = 0, 1, ..., N/2-1} a_{2n} 1^(2kn/N) + a_{2n+1} 1^(2kn/N) 1^(k/N)
= \sum_{n = 0, 1, ..., N/2-1} c_n 1^(2kn/N) + d_n 1^(2kn/N) 1^(k/N)
A_k = C_k + 1^(k/N) D_k (a0)
This equation is valid for k = 0...N/2-1, which is the range of the sequences B_k and
C_k. We don't use it for k = 0, which is a special case considered below. For
0 < k < N/2, it's convenient to consider the pair k, k', where k' = N/2 - k.
Remember that C_k' = C_k^* and D_k' = D_k^* [where * is conjugation]. Also,
1^(N/2 / N) = -1. So we have:
A_k' = C_k^* - 1^(k/N) D_k^* (a0b)
We do (a0) and (a0b) together.
By symmetry this gives us the Fourier components for N/2+1, ... N, if we want
them. However, it doesn't give us the value for exactly k = N/2. For k = 0 and k = N/2, it
is easiest to argue directly about the meaning of the A_k, B_k and C_k in terms of
sums of points.
A_0 and A_{N/2} are both real, with A_0=\sum_n a_n, and A_{N/2} an alternating sum
A_{N/2} = a_0 - a_1 + a_2 ...
It's easy to show that
A_0 = C_0 + D_0 (a1)
A_{N/2} = C_0 - D_0. (a2)
Since C_0 and D_0 are both real, C_0 is the real coefficient of B_0 and D_0 is the
imaginary coefficient.
*REVERSING THE PROCESS*
Next we want to reverse this process. We just need to work out C_k and D_k from the
sequence A_k. Then we do the inverse complex fft and we get back where we started.
For 0 and N/2, working from (a1) and (a2) above, we can see that:
C_0 = 1/2 (A_0 + A_{N/2}) (y0)
D_0 = 1/2 (A_0 - A_{N/2}) (y1)
and we use
B_0 = C_0 + i D_0
to get the 1st complex coefficient of B. This is exactly the same as the forward process
except with an extra factor of 1/2.
Consider equations (a0) and (a0b). We want to work out C_k and D_k from A_k and A_k'. Remember
k' = N/2 - k.
Write down
A_k = C_k + 1^(k/N) D_k (copying a0)
A_k'^* = C_k - 1^(k/N) D_k (conjugate of a0b)
So
C_k = 0.5 (A_k + A_k'^*) (p0)
D_k = 1^(-k/N) . 0.5 (A_k - A_k'^*) (p1)
Next, we want to compute B_k and B_k' from C_k and D_k. C.f. (z0)..(z3), and remember
that k' = N/2-k. We can see
that
B_k = C_k + i D_k (p2)
B_k' = C_k - i D_k (p3)
We would like to make the equations (p0) ... (p3) look like the forward equations (z0), (z1),
(a0) and (a0b) so we can reuse the code. Define E_k = -i 1^(k/N) D_k. Then write down (p0)..(p3).
We have
C_k = 0.5 (A_k + A_k'^*) (p0')
E_k = -0.5 i (A_k - A_k'^*) (p1')
B_k = C_k - 1^(-k/N) E_k (p2')
B_k' = C_k + 1^(-k/N) E_k (p3')
So these are exactly the same as (z0), (z1), (a0), (a0b) except replacing 1^(k/N) with
-1^(-k/N) . Remember that we defined 1^x above to be exp(-2pi x/N), so the signs here
might be opposite to what you see in the code.
MODIFICATION: we need to take care of a factor of two. The complex FFT we implemented
does not divide by N in the reverse case. So upon inversion we get larger by N/2.
However, this is not consistent with normal FFT conventions where you get a factor of N.
For this reason we multiply by two after the process described above.
*/
/*
(*) [this token is referred to in a comment above].
Notes for separating 2 real transforms from one complex one. Note that the
letters here (A, B, C and N) are all distinct from the same letters used in the
place where this comment is used.
Suppose we
have two sequences a_n and b_n, n = 0..N-1. We combine them into a complex
number,
c_n = a_n + i b_n.
Then we take the fourier transform to get
C_k = \sum_{n = 0}^{N-1} c_n 1^(kn/N) .
Then we use symmetry. Define A_k and B_k as the DFTs of a and b.
We use A_k = A_{N-k}^*, and B_k = B_{N-k}^*, since a and b are real. Using
C_k = A_k + i B_k,
C_{N-k} = A_k^* + i B_k^*
= A_k^* - (i B_k)^*
So:
A_k = 1/2 (C_k + C_{N-k}^*)
i B_k = 1/2 (C_k - C_{N-k}^*)
-> B_k =-1/2i (C_k - C_{N-k}^*)
-> re(B_k) = 1/2 (im(C_k) + im(C_{N-k}))
im(B_k) =-1/2 (re(C_k) - re(C_{N-k}))
*/
//! Fills *M, which must already have its final size of K rows by N columns
//! (both > 0; the matrix does not have to be square), with DCT basis rows:
//!   M(0, n) = sqrt(1/N)
//!   M(k, n) = sqrt(2/N) * cos(pi/N * (n + 0.5) * k)   for k >= 1.
template<typename Real>
void ComputeDctMatrix(Matrix<Real> *M) {
  const MatrixIndexT num_rows = M->NumRows();
  const MatrixIndexT num_cols = M->NumCols();
  KALDI_ASSERT(num_rows > 0);
  KALDI_ASSERT(num_cols > 0);

  // Row 0 is constant.
  const Real first_row_scale = std::sqrt(1.0 / static_cast<Real>(num_cols));
  for (MatrixIndexT col = 0; col < num_cols; col++)
    (*M)(0, col) = first_row_scale;

  // Remaining rows are scaled cosines.
  const Real other_rows_scale = std::sqrt(2.0 / static_cast<Real>(num_cols));
  for (MatrixIndexT row = 1; row < num_rows; row++) {
    for (MatrixIndexT col = 0; col < num_cols; col++) {
      (*M)(row, col) = other_rows_scale
          * std::cos( static_cast<double>(M_PI)/num_cols * (col + 0.5) * row );
    }
  }
}

// Explicit instantiations for float and double.
template void ComputeDctMatrix(Matrix<float> *M);
template void ComputeDctMatrix(Matrix<double> *M);
// Principal component analysis of the rows of X.
//   X           N x D input; each row is one data point of dimension D.
//   U           G x D output (must be pre-sized, G <= N and G <= D); each
//               row receives one retained basis vector.
//   A           optional N x G output (may be NULL); receives the
//               coefficients of each data point in the retained basis.
//   print_eigs  if true, the eigenvalues are written to the log.
//   exact       if true, a full decomposition is computed (Eig / SVD);
//               otherwise only the top G eigenpairs are requested via
//               TopEigs.
// When D < N the D x D matrix X^T X is decomposed ("conventional" PCA);
// otherwise the N x N matrix X X^T is decomposed ("inner-product" PCA) and
// the basis is recovered from its eigenvectors.
template<typename Real>
void ComputePca(const MatrixBase<Real> &X,
                MatrixBase<Real> *U,
                MatrixBase<Real> *A,
                bool print_eigs,
                bool exact) {
  // Note that some of these matrices may be transposed w.r.t. the
  // way it's most natural to describe them in math... it's the rows
  // of X and U that correspond to the (data-points, basis elements).
  MatrixIndexT N = X.NumRows(), D = X.NumCols();
  // N = #points, D = feature dim.
  KALDI_ASSERT(U != NULL && U->NumCols() == D);
  MatrixIndexT G = U->NumRows();  // # of retained basis elements.
  KALDI_ASSERT(A == NULL || (A->NumRows() == N && A->NumCols() == G));
  KALDI_ASSERT(G <= N && G <= D);
  if (D < N) {  // Do conventional PCA.
    SpMatrix<Real> Msp(D);  // Matrix of outer products.
    Msp.AddMat2(1.0, X, kTrans, 0.0);  // M <-- X^T X
    Matrix<Real> Utmp;
    Vector<Real> l;
    if (exact) {
      Utmp.Resize(D, D);
      l.Resize(D);
      Msp.Eig(&l, &Utmp);
    } else {
      Utmp.Resize(D, G);
      l.Resize(G);
      Msp.TopEigs(&l, &Utmp);
    }
    SortSvd(&l, &Utmp);
    // The eigenvectors are the columns of Utmp; U stores them as rows.
    for (MatrixIndexT g = 0; g < G; g++)
      U->Row(g).CopyColFromMat(Utmp, g);
    if (print_eigs)
      KALDI_LOG << (exact ? "" : "Retained ")
                << "PCA eigenvalues are " << l;
    if (A != NULL)
      A->AddMatMat(1.0, X, kNoTrans, *U, kTrans, 0.0);  // A <-- X U^T
  } else {  // Do inner-product PCA.
    SpMatrix<Real> Nsp(N);  // Matrix of inner products.
    Nsp.AddMat2(1.0, X, kNoTrans, 0.0);  // M <-- X X^T
    Matrix<Real> Vtmp;
    Vector<Real> l;
    if (exact) {
      Vtmp.Resize(N, N);
      l.Resize(N);
      Matrix<Real> Nmat(Nsp);
      Nmat.DestructiveSvd(&l, &Vtmp, NULL);
    } else {
      Vtmp.Resize(N, G);
      l.Resize(G);
      Nsp.TopEigs(&l, &Vtmp);
    }
    // Negative values among the first G are clamped to zero so that the
    // square root below is well defined.
    for (MatrixIndexT g = 0; g < G; g++) {
      if (l(g) < 0.0) {
        KALDI_WARN << "In PCA, setting element " << l(g) << " to zero.";
        l(g) = 0.0;
      }
    }
    SortSvd(&l, &Vtmp);  // Make sure zero elements are last, this
    // is necessary for Orthogonalize() to work properly later.
    Vtmp.Transpose();  // So eigenvalues are the rows.
    for (MatrixIndexT g = 0; g < G; g++) {
      Real sqrtlg = sqrt(l(g));
      if (l(g) != 0.0) {
        // U_g <-- X^T v_g / sqrt(l_g)
        U->Row(g).AddMatVec(1.0 / sqrtlg, X, kTrans, Vtmp.Row(g), 0.0);
      } else {
        U->Row(g).SetZero();
        (*U)(g, g) = 1.0;  // arbitrary direction. Will later orthogonalize.
      }
      if (A != NULL)
        for (MatrixIndexT n = 0; n < N; n++)
          (*A)(n, g) = sqrtlg * Vtmp(g, n);
    }
    // Now orthogonalize. This is mainly useful in
    // case there were zero eigenvalues, but we do it
    // for all of them.
    U->OrthogonalizeRows();
    if (print_eigs)
      KALDI_LOG << "(inner-product) PCA eigenvalues are " << l;
  }
}

// Explicit instantiations for float and double.
template
void ComputePca(const MatrixBase<float> &X,
                MatrixBase<float> *U,
                MatrixBase<float> *A,
                bool print_eigs,
                bool exact);
template
void ComputePca(const MatrixBase<double> &X,
                MatrixBase<double> *U,
                MatrixBase<double> *A,
                bool print_eigs,
                bool exact);
// Added by Dan, Feb. 13 2012.
// This function does: *plus += max(0, a b^T),
// *minus += max(0, -(a b^T)).
template<typename Real>
void AddOuterProductPlusMinus(Real alpha,
const VectorBase<Real> &a,
const VectorBase<Real> &b,
MatrixBase<Real> *plus,
MatrixBase<Real> *minus) {
KALDI_ASSERT(a.Dim() == plus->NumRows() && b.Dim() == plus->NumCols()
&& a.Dim() == minus->NumRows() && b.Dim() == minus->NumCols());
int32 nrows = a.Dim(), ncols = b.Dim(), pskip = plus->Stride() - ncols,
mskip = minus->Stride() - ncols;
const Real *adata = a.Data(), *bdata = b.Data();
Real *plusdata = plus->Data(), *minusdata = minus->Data();
for (int32 i = 0; i < nrows; i++) {
const Real *btmp = bdata;
Real multiple = alpha * *adata;
if (multiple > 0.0) {
for (int32 j = 0; j < ncols; j++, plusdata++, minusdata++, btmp++) {
if (*btmp > 0.0) *plusdata += multiple * *btmp;
else *minusdata -= multiple * *btmp;
}
} else {
for (int32 j = 0; j < ncols; j++, plusdata++, minusdata++, btmp++) {
if (*btmp < 0.0) *plusdata += multiple * *btmp;
else *minusdata -= multiple * *btmp;
}
}
plusdata += pskip;
minusdata += mskip;
adata++;
}
}
// Instantiate template
template
void AddOuterProductPlusMinus<float>(float alpha,
const VectorBase<float> &a,
const VectorBase<float> &b,
MatrixBase<float> *plus,
MatrixBase<float> *minus);
template
void AddOuterProductPlusMinus<double>(double alpha,
const VectorBase<double> &a,
const VectorBase<double> &b,
MatrixBase<double> *plus,
MatrixBase<double> *minus);
} // end namespace kaldi

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save