[speechx] rm openblas && refactor kaldi-matrix, kaldi-vector (#2824)
* rm openblas && refactor kaldi-matrix kaldi-vectorpull/2854/head
parent
c1b1ae0515
commit
ee7c266f13
@ -1,2 +1,28 @@
|
||||
# Low-level filter-bank feature extraction sources.
add_library(kaldi-native-fbank-core
    feature-fbank.cc
    feature-functions.cc
    feature-window.cc
    fftsg.c
    mel-computations.cc
    rfft.cc)

add_subdirectory(audio)

# Streaming front-end built on top of the fbank core.
add_library(frontend STATIC
    cmvn.cc
    audio_cache.cc
    feature_cache.cc
    feature_pipeline.cc
    assembler.cc
    wave-reader.cc)
target_link_libraries(frontend PUBLIC kaldi-native-fbank-core utils)

# Command-line tools: one executable per entry, built from <name>.cc.
set(BINS compute_fbank_main)

foreach(bin_name ${BINS})
    add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
    target_include_directories(${bin_name}
        PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
    target_link_libraries(${bin_name}
        PUBLIC frontend utils kaldi-util gflags glog)
endforeach()
|
||||
|
@ -1,27 +0,0 @@
|
||||
# Low-level filter-bank feature extraction sources.
add_library(kaldi-native-fbank-core
    feature-fbank.cc
    feature-functions.cc
    feature-window.cc
    fftsg.c
    mel-computations.cc
    rfft.cc)

# Streaming front-end built on top of the fbank core.
add_library(frontend STATIC
    cmvn.cc
    audio_cache.cc
    feature_cache.cc
    feature_pipeline.cc
    assembler.cc)
target_link_libraries(frontend PUBLIC kaldi-native-fbank-core utils)

# Command-line tools: one executable per entry, built from <name>.cc.
set(BINS compute_fbank_main)

foreach(bin_name ${BINS})
    add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
    target_include_directories(${bin_name}
        PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
    target_link_libraries(${bin_name}
        PUBLIC frontend utils kaldi-util gflags glog kaldi-feat-common)
endforeach()
|
@ -0,0 +1,7 @@
|
||||
|
||||
# Matrix/vector library: built from the two translation units below and
# linked against kaldi-base.
add_library(kaldi-matrix kaldi-matrix.cc kaldi-vector.cc)
target_link_libraries(kaldi-matrix kaldi-base)
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,345 @@
|
||||
// matrix/kaldi-vector.h
|
||||
|
||||
// Copyright 2009-2012 Ondrej Glembek; Microsoft Corporation; Lukas Burget;
|
||||
// Saarland University (Author: Arnab Ghoshal);
|
||||
// Ariya Rastrow; Petr Schwarz; Yanmin Qian;
|
||||
// Karel Vesely; Go Vivace Inc.; Arnab Ghoshal
|
||||
// Wei Shi;
|
||||
// 2015 Guoguo Chen
|
||||
// 2017 Daniel Galvez
|
||||
// 2019 Yiwen Shao
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_MATRIX_KALDI_VECTOR_H_
|
||||
#define KALDI_MATRIX_KALDI_VECTOR_H_ 1
|
||||
|
||||
#include "matrix/matrix-common.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/// \addtogroup matrix_group
|
||||
/// @{
|
||||
|
||||
/// Provides a vector abstraction class.
|
||||
/// This class provides a way to work with vectors in kaldi.
|
||||
/// It encapsulates basic operations and memory optimizations.
|
||||
template<typename Real>
|
||||
class VectorBase {
|
||||
public:
|
||||
/// Set vector to all zeros.
|
||||
void SetZero();
|
||||
|
||||
/// Returns true if matrix is all zeros.
|
||||
bool IsZero(Real cutoff = 1.0e-06) const; // replace magic number
|
||||
|
||||
/// Set all members of a vector to a specified value.
|
||||
void Set(Real f);
|
||||
|
||||
/// Returns the dimension of the vector.
|
||||
inline MatrixIndexT Dim() const { return dim_; }
|
||||
|
||||
/// Returns the size in memory of the vector, in bytes.
|
||||
inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }
|
||||
|
||||
/// Returns a pointer to the start of the vector's data.
|
||||
inline Real* Data() { return data_; }
|
||||
|
||||
/// Returns a pointer to the start of the vector's data (const).
|
||||
inline const Real* Data() const { return data_; }
|
||||
|
||||
/// Indexing operator (const).
|
||||
inline Real operator() (MatrixIndexT i) const {
|
||||
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
|
||||
static_cast<UnsignedMatrixIndexT>(dim_));
|
||||
return *(data_ + i);
|
||||
}
|
||||
|
||||
/// Indexing operator (non-const).
|
||||
inline Real & operator() (MatrixIndexT i) {
|
||||
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
|
||||
static_cast<UnsignedMatrixIndexT>(dim_));
|
||||
return *(data_ + i);
|
||||
}
|
||||
|
||||
/** @brief Returns a sub-vector of a vector (a range of elements).
|
||||
* @param o [in] Origin, 0 < o < Dim()
|
||||
* @param l [in] Length 0 < l < Dim()-o
|
||||
* @return A SubVector object that aliases the data of the Vector object.
|
||||
* See @c SubVector class for details */
|
||||
SubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
|
||||
return SubVector<Real>(*this, o, l);
|
||||
}
|
||||
|
||||
/** @brief Returns a const sub-vector of a vector (a range of elements).
|
||||
* @param o [in] Origin, 0 < o < Dim()
|
||||
* @param l [in] Length 0 < l < Dim()-o
|
||||
* @return A SubVector object that aliases the data of the Vector object.
|
||||
* See @c SubVector class for details */
|
||||
const SubVector<Real> Range(const MatrixIndexT o,
|
||||
const MatrixIndexT l) const {
|
||||
return SubVector<Real>(*this, o, l);
|
||||
}
|
||||
|
||||
/// Copy data from another vector (must match own size).
|
||||
void CopyFromVec(const VectorBase<Real> &v);
|
||||
|
||||
/// Copy data from another vector of different type (double vs. float)
|
||||
template<typename OtherReal>
|
||||
void CopyFromVec(const VectorBase<OtherReal> &v);
|
||||
|
||||
/// Performs a row stack of the matrix M
|
||||
void CopyRowsFromMat(const MatrixBase<Real> &M);
|
||||
template<typename OtherReal>
|
||||
void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
|
||||
|
||||
/// Performs a column stack of the matrix M
|
||||
void CopyColsFromMat(const MatrixBase<Real> &M);
|
||||
|
||||
/// Extracts a row of the matrix M. Could also do this with
|
||||
/// this->Copy(M[row]).
|
||||
void CopyRowFromMat(const MatrixBase<Real> &M, MatrixIndexT row);
|
||||
/// Extracts a row of the matrix M with type conversion.
|
||||
template<typename OtherReal>
|
||||
void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);
|
||||
|
||||
/// Extracts a column of the matrix M.
|
||||
template<typename OtherReal>
|
||||
void CopyColFromMat(const MatrixBase<OtherReal> &M , MatrixIndexT col);
|
||||
|
||||
/// Reads from C++ stream (option to add to existing contents).
|
||||
/// Throws exception on failure
|
||||
void Read(std::istream &in, bool binary);
|
||||
|
||||
/// Writes to C++ stream (option to write in binary).
|
||||
void Write(std::ostream &Out, bool binary) const;
|
||||
|
||||
friend class VectorBase<double>;
|
||||
friend class VectorBase<float>;
|
||||
protected:
|
||||
/// Destructor; does not deallocate memory, this is handled by child classes.
|
||||
/// This destructor is protected so this object can only be
|
||||
/// deleted via a child.
|
||||
~VectorBase() {}
|
||||
|
||||
/// Empty initializer, corresponds to vector of zero size.
|
||||
explicit VectorBase(): data_(NULL), dim_(0) {
|
||||
KALDI_ASSERT_IS_FLOATING_TYPE(Real);
|
||||
}
|
||||
|
||||
/// data memory area
|
||||
Real* data_;
|
||||
/// dimension of vector
|
||||
MatrixIndexT dim_;
|
||||
KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
|
||||
}; // class VectorBase
|
||||
|
||||
/** @brief A class representing a vector.
|
||||
*
|
||||
* This class provides a way to work with vectors in kaldi.
|
||||
* It encapsulates basic operations and memory optimizations. */
|
||||
template<typename Real>
|
||||
class Vector: public VectorBase<Real> {
|
||||
public:
|
||||
/// Constructor that takes no arguments. Initializes to empty.
|
||||
Vector(): VectorBase<Real>() {}
|
||||
|
||||
/// Constructor with specific size. Sets to all-zero by default
|
||||
/// if set_zero == false, memory contents are undefined.
|
||||
explicit Vector(const MatrixIndexT s,
|
||||
MatrixResizeType resize_type = kSetZero)
|
||||
: VectorBase<Real>() { Resize(s, resize_type); }
|
||||
|
||||
/// Copy constructor from CUDA vector
|
||||
/// This is defined in ../cudamatrix/cu-vector.h
|
||||
//template<typename OtherReal>
|
||||
//explicit Vector(const CuVectorBase<OtherReal> &cu);
|
||||
|
||||
/// Copy constructor. The need for this is controversial.
|
||||
Vector(const Vector<Real> &v) : VectorBase<Real>() { // (cannot be explicit)
|
||||
Resize(v.Dim(), kUndefined);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
/// Copy-constructor from base-class, needed to copy from SubVector.
|
||||
explicit Vector(const VectorBase<Real> &v) : VectorBase<Real>() {
|
||||
Resize(v.Dim(), kUndefined);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
/// Type conversion constructor.
|
||||
template<typename OtherReal>
|
||||
explicit Vector(const VectorBase<OtherReal> &v): VectorBase<Real>() {
|
||||
Resize(v.Dim(), kUndefined);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
// Took this out since it is unsafe : Arnab
|
||||
// /// Constructor from a pointer and a size; copies the data to a location
|
||||
// /// it owns.
|
||||
// Vector(const Real* Data, const MatrixIndexT s): VectorBase<Real>() {
|
||||
// Resize(s);
|
||||
// CopyFromPtr(Data, s);
|
||||
// }
|
||||
|
||||
|
||||
/// Swaps the contents of *this and *other. Shallow swap.
|
||||
void Swap(Vector<Real> *other);
|
||||
|
||||
/// Destructor. Deallocates memory.
|
||||
~Vector() { Destroy(); }
|
||||
|
||||
/// Read function using C++ streams. Can also add to existing contents
|
||||
/// of matrix.
|
||||
void Read(std::istream &in, bool binary);
|
||||
|
||||
/// Set vector to a specified size (can be zero).
|
||||
/// The value of the new data depends on resize_type:
|
||||
/// -if kSetZero, the new data will be zero
|
||||
/// -if kUndefined, the new data will be undefined
|
||||
/// -if kCopyData, the new data will be the same as the old data in any
|
||||
/// shared positions, and zero elsewhere.
|
||||
/// This function takes time proportional to the number of data elements.
|
||||
void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero);
|
||||
|
||||
/// Remove one element and shifts later elements down.
|
||||
void RemoveElement(MatrixIndexT i);
|
||||
|
||||
/// Assignment operator.
|
||||
Vector<Real> &operator = (const Vector<Real> &other) {
|
||||
Resize(other.Dim(), kUndefined);
|
||||
this->CopyFromVec(other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// Assignment operator that takes VectorBase.
|
||||
Vector<Real> &operator = (const VectorBase<Real> &other) {
|
||||
Resize(other.Dim(), kUndefined);
|
||||
this->CopyFromVec(other);
|
||||
return *this;
|
||||
}
|
||||
private:
|
||||
/// Init assumes the current contents of the class are invalid (i.e. junk or
|
||||
/// has already been freed), and it sets the vector to newly allocated memory
|
||||
/// with the specified dimension. dim == 0 is acceptable. The memory contents
|
||||
/// pointed to by data_ will be undefined.
|
||||
void Init(const MatrixIndexT dim);
|
||||
|
||||
/// Destroy function, called internally.
|
||||
void Destroy();
|
||||
|
||||
};
|
||||
|
||||
|
||||
/// Represents a non-allocating general vector which can be defined
|
||||
/// as a sub-vector of higher-level vector [or as the row of a matrix].
|
||||
template<typename Real>
|
||||
class SubVector : public VectorBase<Real> {
|
||||
public:
|
||||
/// Constructor from a Vector or SubVector.
|
||||
/// SubVectors are not const-safe and it's very hard to make them
|
||||
/// so for now we just give up. This function contains const_cast.
|
||||
SubVector(const VectorBase<Real> &t, const MatrixIndexT origin,
|
||||
const MatrixIndexT length) : VectorBase<Real>() {
|
||||
// following assert equiv to origin>=0 && length>=0 &&
|
||||
// origin+length <= rt.dim_
|
||||
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
|
||||
static_cast<UnsignedMatrixIndexT>(length) <=
|
||||
static_cast<UnsignedMatrixIndexT>(t.Dim()));
|
||||
VectorBase<Real>::data_ = const_cast<Real*> (t.Data()+origin);
|
||||
VectorBase<Real>::dim_ = length;
|
||||
}
|
||||
|
||||
/// This constructor initializes the vector to point at the contents
|
||||
/// of this packed matrix (SpMatrix or TpMatrix).
|
||||
// SubVector(const PackedMatrix<Real> &M) {
|
||||
//VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
|
||||
//VectorBase<Real>::dim_ = (M.NumRows()*(M.NumRows()+1))/2;
|
||||
//}
|
||||
|
||||
/// Copy constructor
|
||||
SubVector(const SubVector &other) : VectorBase<Real> () {
|
||||
// this copy constructor needed for Range() to work in base class.
|
||||
VectorBase<Real>::data_ = other.data_;
|
||||
VectorBase<Real>::dim_ = other.dim_;
|
||||
}
|
||||
|
||||
/// Constructor from a pointer to memory and a length. Keeps a pointer
|
||||
/// to the data but does not take ownership (will never delete).
|
||||
/// Caution: this constructor enables you to evade const constraints.
|
||||
SubVector(const Real *data, MatrixIndexT length) : VectorBase<Real> () {
|
||||
VectorBase<Real>::data_ = const_cast<Real*>(data);
|
||||
VectorBase<Real>::dim_ = length;
|
||||
}
|
||||
|
||||
/// This operation does not preserve const-ness, so be careful.
|
||||
SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row) {
|
||||
VectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
|
||||
VectorBase<Real>::dim_ = matrix.NumCols();
|
||||
}
|
||||
|
||||
~SubVector() {} ///< Destructor (does nothing; no pointers are owned here).
|
||||
|
||||
private:
|
||||
/// Disallow assignment operator.
|
||||
SubVector & operator = (const SubVector &other) {}
|
||||
};
|
||||
|
||||
/// @} end of "addtogroup matrix_group"
|
||||
/// \addtogroup matrix_funcs_io
|
||||
/// @{
|
||||
/// Output to a C++ stream. Non-binary by default (use Write for
|
||||
/// binary output).
|
||||
template<typename Real>
|
||||
std::ostream & operator << (std::ostream & out, const VectorBase<Real> & v);
|
||||
|
||||
/// Input from a C++ stream. Will automatically read text or
|
||||
/// binary data from the stream.
|
||||
template<typename Real>
|
||||
std::istream & operator >> (std::istream & in, VectorBase<Real> & v);
|
||||
|
||||
/// Input from a C++ stream. Will automatically read text or
|
||||
/// binary data from the stream.
|
||||
template<typename Real>
|
||||
std::istream & operator >> (std::istream & in, Vector<Real> & v);
|
||||
/// @} end of \addtogroup matrix_funcs_io
|
||||
|
||||
/// \addtogroup matrix_funcs_scalar
|
||||
/// @{
|
||||
|
||||
|
||||
//template<typename Real>
|
||||
//bool ApproxEqual(const VectorBase<Real> &a,
|
||||
//const VectorBase<Real> &b, Real tol = 0.01) {
|
||||
//return a.ApproxEqual(b, tol);
|
||||
//}
|
||||
|
||||
//template<typename Real>
|
||||
//inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
|
||||
//float tol = 0.01) {
|
||||
//KALDI_ASSERT(a.ApproxEqual(b, tol));
|
||||
//}
|
||||
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
// we need to include the implementation
|
||||
#include "matrix/kaldi-vector-inl.h"
|
||||
|
||||
|
||||
|
||||
#endif // KALDI_MATRIX_KALDI_VECTOR_H_
|
@ -1,20 +0,0 @@
|
||||
# MFCC front-end.
add_library(kaldi-mfcc feature-mfcc.cc)
target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)

# Filter-bank front-end.
add_library(kaldi-fbank feature-fbank.cc)
target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)

# Code shared by the feature front-ends above.
add_library(kaldi-feat-common
    wave-reader.cc
    signal.cc
    feature-functions.cc
    feature-window.cc
    resample.cc
    mel-computations.cc
    cmvn.cc)
target_link_libraries(kaldi-feat-common
    PUBLIC kaldi-base kaldi-matrix kaldi-util)
|
@ -1,183 +0,0 @@
|
||||
// transform/cmvn.cc
|
||||
|
||||
// Copyright 2009-2013 Microsoft Corporation
|
||||
// Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "feat/cmvn.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
// Sizes the CMVN statistics matrix for features of dimension `dim`:
// two rows of dim + 1 columns each.  `dim` must be positive.
void InitCmvnStats(int32 dim, Matrix<double> *stats) {
  KALDI_ASSERT(dim > 0);
  const int32 num_rows = 2;
  const int32 num_cols = dim + 1;  // one extra column beyond the features
  stats->Resize(num_rows, num_cols);
}
|
||||
|
||||
void AccCmvnStats(const VectorBase<BaseFloat> &feats, BaseFloat weight, MatrixBase<double> *stats) {
|
||||
int32 dim = feats.Dim();
|
||||
KALDI_ASSERT(stats != NULL);
|
||||
KALDI_ASSERT(stats->NumRows() == 2 && stats->NumCols() == dim + 1);
|
||||
// Remove these __restrict__ modifiers if they cause compilation problems.
|
||||
// It's just an optimization.
|
||||
double *__restrict__ mean_ptr = stats->RowData(0),
|
||||
*__restrict__ var_ptr = stats->RowData(1),
|
||||
*__restrict__ count_ptr = mean_ptr + dim;
|
||||
const BaseFloat * __restrict__ feats_ptr = feats.Data();
|
||||
*count_ptr += weight;
|
||||
// Careful-- if we change the format of the matrix, the "mean_ptr < count_ptr"
|
||||
// statement below might become wrong.
|
||||
for (; mean_ptr < count_ptr; mean_ptr++, var_ptr++, feats_ptr++) {
|
||||
*mean_ptr += *feats_ptr * weight;
|
||||
*var_ptr += *feats_ptr * *feats_ptr * weight;
|
||||
}
|
||||
}
|
||||
|
||||
void AccCmvnStats(const MatrixBase<BaseFloat> &feats,
|
||||
const VectorBase<BaseFloat> *weights,
|
||||
MatrixBase<double> *stats) {
|
||||
int32 num_frames = feats.NumRows();
|
||||
if (weights != NULL) {
|
||||
KALDI_ASSERT(weights->Dim() == num_frames);
|
||||
}
|
||||
for (int32 i = 0; i < num_frames; i++) {
|
||||
SubVector<BaseFloat> this_frame = feats.Row(i);
|
||||
BaseFloat weight = (weights == NULL ? 1.0 : (*weights)(i));
|
||||
if (weight != 0.0)
|
||||
AccCmvnStats(this_frame, weight, stats);
|
||||
}
|
||||
}
|
||||
|
||||
void ApplyCmvn(const MatrixBase<double> &stats,
|
||||
bool var_norm,
|
||||
MatrixBase<BaseFloat> *feats) {
|
||||
KALDI_ASSERT(feats != NULL);
|
||||
int32 dim = stats.NumCols() - 1;
|
||||
if (stats.NumRows() > 2 || stats.NumRows() < 1 || feats->NumCols() != dim) {
|
||||
KALDI_ERR << "Dim mismatch: cmvn "
|
||||
<< stats.NumRows() << 'x' << stats.NumCols()
|
||||
<< ", feats " << feats->NumRows() << 'x' << feats->NumCols();
|
||||
}
|
||||
if (stats.NumRows() == 1 && var_norm)
|
||||
KALDI_ERR << "You requested variance normalization but no variance stats "
|
||||
<< "are supplied.";
|
||||
|
||||
double count = stats(0, dim);
|
||||
// Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
|
||||
// computing an offset and representing it as stats, we use a count of one.
|
||||
if (count < 1.0)
|
||||
KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: "
|
||||
<< "count = " << count;
|
||||
|
||||
if (!var_norm) {
|
||||
Vector<BaseFloat> offset(dim);
|
||||
SubVector<double> mean_stats(stats.RowData(0), dim);
|
||||
offset.AddVec(-1.0 / count, mean_stats);
|
||||
feats->AddVecToRows(1.0, offset);
|
||||
return;
|
||||
}
|
||||
// norm(0, d) = mean offset;
|
||||
// norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
|
||||
Matrix<BaseFloat> norm(2, dim);
|
||||
for (int32 d = 0; d < dim; d++) {
|
||||
double mean, offset, scale;
|
||||
mean = stats(0, d)/count;
|
||||
double var = (stats(1, d)/count) - mean*mean,
|
||||
floor = 1.0e-20;
|
||||
if (var < floor) {
|
||||
KALDI_WARN << "Flooring cepstral variance from " << var << " to "
|
||||
<< floor;
|
||||
var = floor;
|
||||
}
|
||||
scale = 1.0 / sqrt(var);
|
||||
if (scale != scale || 1/scale == 0.0)
|
||||
KALDI_ERR << "NaN or infinity in cepstral mean/variance computation";
|
||||
offset = -(mean*scale);
|
||||
norm(0, d) = offset;
|
||||
norm(1, d) = scale;
|
||||
}
|
||||
// Apply the normalization.
|
||||
feats->MulColsVec(norm.Row(1));
|
||||
feats->AddVecToRows(1.0, norm.Row(0));
|
||||
}
|
||||
|
||||
void ApplyCmvnReverse(const MatrixBase<double> &stats,
|
||||
bool var_norm,
|
||||
MatrixBase<BaseFloat> *feats) {
|
||||
KALDI_ASSERT(feats != NULL);
|
||||
int32 dim = stats.NumCols() - 1;
|
||||
if (stats.NumRows() > 2 || stats.NumRows() < 1 || feats->NumCols() != dim) {
|
||||
KALDI_ERR << "Dim mismatch: cmvn "
|
||||
<< stats.NumRows() << 'x' << stats.NumCols()
|
||||
<< ", feats " << feats->NumRows() << 'x' << feats->NumCols();
|
||||
}
|
||||
if (stats.NumRows() == 1 && var_norm)
|
||||
KALDI_ERR << "You requested variance normalization but no variance stats "
|
||||
<< "are supplied.";
|
||||
|
||||
double count = stats(0, dim);
|
||||
// Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
|
||||
// computing an offset and representing it as stats, we use a count of one.
|
||||
if (count < 1.0)
|
||||
KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: "
|
||||
<< "count = " << count;
|
||||
|
||||
Matrix<BaseFloat> norm(2, dim); // norm(0, d) = mean offset
|
||||
// norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
|
||||
for (int32 d = 0; d < dim; d++) {
|
||||
double mean, offset, scale;
|
||||
mean = stats(0, d) / count;
|
||||
if (!var_norm) {
|
||||
scale = 1.0;
|
||||
offset = mean;
|
||||
} else {
|
||||
double var = (stats(1, d)/count) - mean*mean,
|
||||
floor = 1.0e-20;
|
||||
if (var < floor) {
|
||||
KALDI_WARN << "Flooring cepstral variance from " << var << " to "
|
||||
<< floor;
|
||||
var = floor;
|
||||
}
|
||||
// we aim to transform zero-mean, unit-variance input into data
|
||||
// with the given mean and variance.
|
||||
scale = sqrt(var);
|
||||
offset = mean;
|
||||
}
|
||||
norm(0, d) = offset;
|
||||
norm(1, d) = scale;
|
||||
}
|
||||
if (var_norm)
|
||||
feats->MulColsVec(norm.Row(1));
|
||||
feats->AddVecToRows(1.0, norm.Row(0));
|
||||
}
|
||||
|
||||
|
||||
void FakeStatsForSomeDims(const std::vector<int32> &dims,
|
||||
MatrixBase<double> *stats) {
|
||||
KALDI_ASSERT(stats->NumRows() == 2 && stats->NumCols() > 1);
|
||||
int32 dim = stats->NumCols() - 1;
|
||||
double count = (*stats)(0, dim);
|
||||
for (size_t i = 0; i < dims.size(); i++) {
|
||||
int32 d = dims[i];
|
||||
KALDI_ASSERT(d >= 0 && d < dim);
|
||||
(*stats)(0, d) = 0.0;
|
||||
(*stats)(1, d) = count;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // namespace kaldi
|
@ -1,75 +0,0 @@
|
||||
// transform/cmvn.h
|
||||
|
||||
// Copyright 2009-2013 Microsoft Corporation
|
||||
// Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#ifndef KALDI_TRANSFORM_CMVN_H_
|
||||
#define KALDI_TRANSFORM_CMVN_H_
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "matrix/matrix-lib.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/// This function initializes the matrix to dimension 2 by (dim+1);
|
||||
/// 1st "dim" elements of 1st row are mean stats, 1st "dim" elements
|
||||
/// of 2nd row are var stats, last element of 1st row is count,
|
||||
/// last element of 2nd row is zero.
|
||||
void InitCmvnStats(int32 dim, Matrix<double> *stats);
|
||||
|
||||
/// Accumulation from a single frame (weighted).
|
||||
void AccCmvnStats(const VectorBase<BaseFloat> &feat,
|
||||
BaseFloat weight,
|
||||
MatrixBase<double> *stats);
|
||||
|
||||
/// Accumulation from a feature file (possibly weighted-- useful in excluding silence).
|
||||
void AccCmvnStats(const MatrixBase<BaseFloat> &feats,
|
||||
const VectorBase<BaseFloat> *weights, // or NULL
|
||||
MatrixBase<double> *stats);
|
||||
|
||||
/// Apply cepstral mean and variance normalization to a matrix of features.
|
||||
/// If norm_vars == true, expects stats to be of dimension 2 by (dim+1), but
|
||||
/// if norm_vars == false, will accept stats of dimension 1 by (dim+1); these
|
||||
/// are produced by the balanced-cmvn code when it computes an offset and
|
||||
/// represents it as "fake stats".
|
||||
void ApplyCmvn(const MatrixBase<double> &stats,
|
||||
bool norm_vars,
|
||||
MatrixBase<BaseFloat> *feats);
|
||||
|
||||
/// This is as ApplyCmvn, but does so in the reverse sense, i.e. applies a transform
|
||||
/// that would take zero-mean, unit-variance input and turn it into output with the
|
||||
/// stats of "stats". This can be useful if you trained without CMVN but later want
|
||||
/// to correct a mismatch, so you would first apply CMVN and then do the "reverse"
|
||||
/// CMVN with the summed stats of your training data.
|
||||
void ApplyCmvnReverse(const MatrixBase<double> &stats,
|
||||
bool norm_vars,
|
||||
MatrixBase<BaseFloat> *feats);
|
||||
|
||||
|
||||
/// Modify the stats so that for some dimensions (specified in "dims"), we
|
||||
/// replace them with "fake" stats that have zero mean and unit variance; this
|
||||
/// is done to disable CMVN for those dimensions.
|
||||
void FakeStatsForSomeDims(const std::vector<int32> &dims,
|
||||
MatrixBase<double> *stats);
|
||||
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
#endif // KALDI_TRANSFORM_CMVN_H_
|
@ -1,99 +0,0 @@
|
||||
// feat/feature-common-inl.h
|
||||
|
||||
// Copyright 2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_FEATURE_COMMON_INL_H_
|
||||
#define KALDI_FEAT_FEATURE_COMMON_INL_H_
|
||||
|
||||
#include "feat/resample.h"
|
||||
// Do not include this file directly. It is included by feat/feature-common.h
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template <class F>
|
||||
void OfflineFeatureTpl<F>::ComputeFeatures(
|
||||
const VectorBase<BaseFloat> &wave,
|
||||
BaseFloat sample_freq,
|
||||
BaseFloat vtln_warp,
|
||||
Matrix<BaseFloat> *output) {
|
||||
KALDI_ASSERT(output != NULL);
|
||||
BaseFloat new_sample_freq = computer_.GetFrameOptions().samp_freq;
|
||||
if (sample_freq == new_sample_freq) {
|
||||
Compute(wave, vtln_warp, output);
|
||||
} else {
|
||||
if (new_sample_freq < sample_freq &&
|
||||
! computer_.GetFrameOptions().allow_downsample)
|
||||
KALDI_ERR << "Waveform and config sample Frequency mismatch: "
|
||||
<< sample_freq << " .vs " << new_sample_freq
|
||||
<< " (use --allow-downsample=true to allow "
|
||||
<< " downsampling the waveform).";
|
||||
else if (new_sample_freq > sample_freq &&
|
||||
! computer_.GetFrameOptions().allow_upsample)
|
||||
KALDI_ERR << "Waveform and config sample Frequency mismatch: "
|
||||
<< sample_freq << " .vs " << new_sample_freq
|
||||
<< " (use --allow-upsample=true option to allow "
|
||||
<< " upsampling the waveform).";
|
||||
// Resample the waveform.
|
||||
Vector<BaseFloat> resampled_wave(wave);
|
||||
ResampleWaveform(sample_freq, wave,
|
||||
new_sample_freq, &resampled_wave);
|
||||
Compute(resampled_wave, vtln_warp, output);
|
||||
}
|
||||
}
|
||||
|
||||
template <class F>
|
||||
void OfflineFeatureTpl<F>::Compute(
|
||||
const VectorBase<BaseFloat> &wave,
|
||||
BaseFloat vtln_warp,
|
||||
Matrix<BaseFloat> *output) {
|
||||
KALDI_ASSERT(output != NULL);
|
||||
int32 rows_out = NumFrames(wave.Dim(), computer_.GetFrameOptions()),
|
||||
cols_out = computer_.Dim();
|
||||
if (rows_out == 0) {
|
||||
output->Resize(0, 0);
|
||||
return;
|
||||
}
|
||||
output->Resize(rows_out, cols_out);
|
||||
Vector<BaseFloat> window; // windowed waveform.
|
||||
bool use_raw_log_energy = computer_.NeedRawLogEnergy();
|
||||
for (int32 r = 0; r < rows_out; r++) { // r is frame index.
|
||||
BaseFloat raw_log_energy = 0.0;
|
||||
ExtractWindow(0, wave, r, computer_.GetFrameOptions(),
|
||||
feature_window_function_, &window,
|
||||
(use_raw_log_energy ? &raw_log_energy : NULL));
|
||||
|
||||
SubVector<BaseFloat> output_row(*output, r);
|
||||
computer_.Compute(raw_log_energy, vtln_warp, &window, &output_row);
|
||||
}
|
||||
}
|
||||
|
||||
template <class F>
|
||||
void OfflineFeatureTpl<F>::Compute(
|
||||
const VectorBase<BaseFloat> &wave,
|
||||
BaseFloat vtln_warp,
|
||||
Matrix<BaseFloat> *output) const {
|
||||
OfflineFeatureTpl<F> temp(*this);
|
||||
// call the non-const version of Compute() on a temporary copy of this object.
|
||||
// This is a workaround for const-ness that may sometimes be useful in
|
||||
// multi-threaded code, although it's not optimally efficient.
|
||||
temp.Compute(wave, vtln_warp, output);
|
||||
}
|
||||
|
||||
} // end namespace kaldi
|
||||
|
||||
#endif
|
@ -1,176 +0,0 @@
|
||||
// feat/feature-common.h
|
||||
|
||||
// Copyright 2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_FEATURE_COMMON_H_
|
||||
#define KALDI_FEAT_FEATURE_COMMON_H_
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include "feat/feature-window.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
|
||||
|
||||
/// This class is only added for documentation, it is not intended to ever be
|
||||
/// used.
|
||||
struct ExampleFeatureComputerOptions {
|
||||
FrameExtractionOptions frame_opts;
|
||||
// .. more would go here.
|
||||
};
|
||||
|
||||
/// This class is only added for documentation, it is not intended to ever be
|
||||
/// used. It documents the interface of the *Computer classes which wrap the
|
||||
/// low-level feature extraction. The template argument F of OfflineFeatureTpl must
|
||||
/// follow this interface. This interface is intended for features such as
|
||||
/// MFCCs and PLPs which can be computed frame by frame.
|
||||
class ExampleFeatureComputer {
|
||||
public:
|
||||
typedef ExampleFeatureComputerOptions Options;
|
||||
|
||||
/// Returns a reference to the frame-extraction options class, which
|
||||
/// will be part of our own options class.
|
||||
const FrameExtractionOptions &GetFrameOptions() const {
|
||||
return opts_.frame_opts;
|
||||
}
|
||||
|
||||
/// Returns the feature dimension
|
||||
int32 Dim() const;
|
||||
|
||||
/// Returns true if this function may inspect the raw log-energy of the signal
|
||||
/// (before windowing and pre-emphasis); it's safe to always return true, but
|
||||
/// setting it to false enables an optimization.
|
||||
bool NeedRawLogEnergy() const { return true; }
|
||||
|
||||
/// constructor from options class; it should not store a reference or pointer
|
||||
/// to the options class but should copy it.
|
||||
explicit ExampleFeatureComputer(const ExampleFeatureComputerOptions &opts):
|
||||
opts_(opts) { }
|
||||
|
||||
/// Copy constructor; all of these classes must have one.
|
||||
ExampleFeatureComputer(const ExampleFeatureComputer &other);
|
||||
|
||||
/**
|
||||
Function that computes one frame of features from
|
||||
one frame of signal.
|
||||
|
||||
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
|
||||
prior to windowing and pre-emphasis, or
|
||||
log(numeric_limits<float>::min()), whichever is greater. Must be
|
||||
ignored by this function if this class returns false from
|
||||
this->NeedRawLogEnergy().
|
||||
@param [in] vtln_warp The VTLN warping factor that the user wants
|
||||
to be applied when computing features for this utterance. Will
|
||||
normally be 1.0, meaning no warping is to be done. The value will
|
||||
be ignored for feature types that don't support VLTN, such as
|
||||
spectrogram features.
|
||||
@param [in] signal_frame One frame of the signal,
|
||||
as extracted using the function ExtractWindow() using the options
|
||||
returned by this->GetFrameOptions(). The function will use the
|
||||
vector as a workspace, which is why it's a non-const pointer.
|
||||
@param [out] feature Pointer to a vector of size this->Dim(), to which
|
||||
the computed feature will be written.
|
||||
*/
|
||||
void Compute(BaseFloat signal_raw_log_energy,
|
||||
BaseFloat vtln_warp,
|
||||
VectorBase<BaseFloat> *signal_frame,
|
||||
VectorBase<BaseFloat> *feature);
|
||||
|
||||
private:
|
||||
// disallow assignment.
|
||||
ExampleFeatureComputer &operator = (const ExampleFeatureComputer &in);
|
||||
Options opts_;
|
||||
};
|
||||
|
||||
|
||||
/// This templated class is intended for offline feature extraction, i.e. where
|
||||
/// you have access to the entire signal at the start. It exists mainly to be
|
||||
/// drop-in replacement for the old (pre-2016) classes Mfcc, Plp and so on, for
|
||||
/// use in the offline case. In April 2016 we reorganized the online
|
||||
/// feature-computation code for greater modularity and to have correct support
|
||||
/// for the snip-edges=false option.
|
||||
template <class F>
|
||||
class OfflineFeatureTpl {
|
||||
public:
|
||||
typedef typename F::Options Options;
|
||||
|
||||
// Note: feature_window_function_ is the windowing function, which initialized
|
||||
// using the options class, that we cache at this level.
|
||||
OfflineFeatureTpl(const Options &opts):
|
||||
computer_(opts),
|
||||
feature_window_function_(computer_.GetFrameOptions()) { }
|
||||
|
||||
// Internal (and back-compatibility) interface for computing features, which
|
||||
// requires that the user has already checked that the sampling frequency
|
||||
// of the waveform is equal to the sampling frequency specified in
|
||||
// the frame-extraction options.
|
||||
void Compute(const VectorBase<BaseFloat> &wave,
|
||||
BaseFloat vtln_warp,
|
||||
Matrix<BaseFloat> *output);
|
||||
|
||||
// This const version of Compute() is a wrapper that
|
||||
// calls the non-const version on a temporary object.
|
||||
// It's less efficient than the non-const version.
|
||||
void Compute(const VectorBase<BaseFloat> &wave,
|
||||
BaseFloat vtln_warp,
|
||||
Matrix<BaseFloat> *output) const;
|
||||
|
||||
/**
|
||||
Computes the features for one file (one sequence of features).
|
||||
This is the newer interface where you specify the sample frequency
|
||||
of the input waveform.
|
||||
@param [in] wave The input waveform
|
||||
@param [in] sample_freq The sampling frequency with which
|
||||
'wave' was sampled.
|
||||
if sample_freq is higher than the frequency
|
||||
specified in the config, we will downsample
|
||||
the waveform, but if lower, it's an error.
|
||||
@param [in] vtln_warp The VTLN warping factor (will normally
|
||||
be 1.0)
|
||||
@param [out] output The matrix of features, where the row-index
|
||||
is the frame index.
|
||||
*/
|
||||
void ComputeFeatures(const VectorBase<BaseFloat> &wave,
|
||||
BaseFloat sample_freq,
|
||||
BaseFloat vtln_warp,
|
||||
Matrix<BaseFloat> *output);
|
||||
|
||||
int32 Dim() const { return computer_.Dim(); }
|
||||
|
||||
// Copy constructor.
|
||||
OfflineFeatureTpl(const OfflineFeatureTpl<F> &other):
|
||||
computer_(other.computer_),
|
||||
feature_window_function_(other.feature_window_function_) { }
|
||||
private:
|
||||
// Disallow assignment.
|
||||
OfflineFeatureTpl<F> &operator =(const OfflineFeatureTpl<F> &other);
|
||||
|
||||
F computer_;
|
||||
FeatureWindowFunction feature_window_function_;
|
||||
};
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
#include "feat/feature-common-inl.h"
|
||||
|
||||
#endif // KALDI_FEAT_FEATURE_COMMON_H_
|
@ -1,125 +0,0 @@
|
||||
// feat/feature-fbank.cc
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "feat/feature-fbank.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
// Constructs the computer from a copy of 'opts'.
//  - log_energy_floor_ starts at 0.0 so it always holds a definite value; it
//    becomes Log(opts.energy_floor) only when a positive floor is configured.
//    Previously it was left uninitialized in the no-floor case, and the copy
//    constructor then read that indeterminate value.
//  - A split-radix FFT object is allocated only when the padded window size
//    is a power of two; otherwise srfft_ stays NULL.
FbankComputer::FbankComputer(const FbankOptions &opts):
    opts_(opts), log_energy_floor_(0.0), srfft_(NULL) {
  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);

  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);

  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0);
}
|
||||
|
||||
// Copy constructor. The map of mel banks is first copied shallowly by the
// initializer list; each pointer is then replaced with a freshly allocated
// clone so the two objects never share a MelBanks or FFT instance.
FbankComputer::FbankComputer(const FbankComputer &other):
    opts_(other.opts_), log_energy_floor_(other.log_energy_floor_),
    mel_banks_(other.mel_banks_), srfft_(NULL) {
  typedef std::map<BaseFloat, MelBanks*>::iterator BankIter;
  for (BankIter it = mel_banks_.begin(); it != mel_banks_.end(); ++it) {
    const MelBanks *source_banks = it->second;
    it->second = new MelBanks(*source_banks);
  }
  if (other.srfft_ != NULL) {
    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
  }
}
|
||||
|
||||
// Frees every cached MelBanks object and the FFT helper (which may be NULL).
FbankComputer::~FbankComputer() {
  typedef std::map<BaseFloat, MelBanks*>::iterator BankIter;
  BankIter it = mel_banks_.begin();
  while (it != mel_banks_.end()) {
    delete it->second;
    ++it;
  }
  delete srfft_;
}
|
||||
|
||||
// Returns the mel banks for the given VTLN warp factor, building and caching
// them in mel_banks_ on first request. The returned object stays owned by
// this FbankComputer.
const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) {
  std::map<BaseFloat, MelBanks*>::iterator cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end())
    return cached->second;

  MelBanks *fresh_banks = new MelBanks(opts_.mel_opts,
                                       opts_.frame_opts,
                                       vtln_warp);
  mel_banks_[vtln_warp] = fresh_banks;
  return fresh_banks;
}
|
||||
|
||||
// Computes one frame of filterbank features. 'signal_frame' is used as
// workspace (it is overwritten by the FFT and the power spectrum); 'feature'
// receives the mel energies plus, if enabled, the energy term.
void FbankComputer::Compute(BaseFloat signal_raw_log_energy,
                            BaseFloat vtln_warp,
                            VectorBase<BaseFloat> *signal_frame,
                            VectorBase<BaseFloat> *feature) {
  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));

  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
               feature->Dim() == this->Dim());

  const bool with_energy = opts_.use_energy;

  // When raw energy is not requested, measure the energy of the frame as
  // passed in (after the window function) instead of using the raw value.
  if (with_energy && !opts_.raw_energy) {
    const BaseFloat frame_energy = VecVec(*signal_frame, *signal_frame);
    signal_raw_log_energy = Log(std::max<BaseFloat>(
        frame_energy, std::numeric_limits<float>::epsilon()));
  }

  // Without a split-radix object, fall back to the FFT that also handles
  // sizes that are not powers of two.
  if (srfft_ == NULL)
    RealFft(signal_frame, true);
  else
    srfft_->Compute(signal_frame->Data(), true);

  // Turn the FFT output into a power spectrum, in place.
  ComputePowerSpectrum(signal_frame);
  SubVector<BaseFloat> power_spectrum(*signal_frame, 0,
                                      signal_frame->Dim() / 2 + 1);

  // Magnitude spectrum instead of power, if requested.
  if (!opts_.use_power)
    power_spectrum.ApplyPow(0.5);

  // The energy occupies slot 0 unless HTK ordering puts it last.
  int32 mel_offset = 0;
  if (with_energy && !opts_.htk_compat)
    mel_offset = 1;
  SubVector<BaseFloat> mel_energies(*feature,
                                    mel_offset,
                                    opts_.mel_opts.num_bins);

  // Apply the mel filterbanks to the power spectrum.
  mel_banks.Compute(power_spectrum, &mel_energies);
  if (opts_.use_log_fbank) {
    // Floor first so the log never sees zero.
    mel_energies.ApplyFloor(std::numeric_limits<float>::epsilon());
    mel_energies.ApplyLog();
  }

  if (!with_energy)
    return;

  // Store the (possibly floored) energy first, or last if htk_compat.
  if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_)
    signal_raw_log_energy = log_energy_floor_;
  const int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
  (*feature)(energy_index) = signal_raw_log_energy;
}
|
||||
|
||||
} // namespace kaldi
|
@ -1,149 +0,0 @@
|
||||
// feat/feature-fbank.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_FEATURE_FBANK_H_
|
||||
#define KALDI_FEAT_FEATURE_FBANK_H_
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "feat/feature-common.h"
|
||||
#include "feat/feature-functions.h"
|
||||
#include "feat/feature-window.h"
|
||||
#include "feat/mel-computations.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
|
||||
/// FbankOptions contains basic options for computing filterbank features.
|
||||
/// It only includes things that can be done in a "stateless" way, i.e.
|
||||
/// it does not include energy max-normalization.
|
||||
/// It does not include delta computation.
|
||||
struct FbankOptions {
|
||||
FrameExtractionOptions frame_opts;
|
||||
MelBanksOptions mel_opts;
|
||||
bool use_energy; // append an extra dimension with energy to the filter banks
|
||||
BaseFloat energy_floor;
|
||||
bool raw_energy; // If true, compute energy before preemphasis and windowing
|
||||
bool htk_compat; // If true, put energy last (if using energy)
|
||||
bool use_log_fbank; // if true (default), produce log-filterbank, else linear
|
||||
bool use_power; // if true (default), use power in filterbank analysis, else magnitude.
|
||||
|
||||
FbankOptions(): mel_opts(23),
|
||||
// defaults the #mel-banks to 23 for the FBANK computations.
|
||||
// this seems to be common for 16khz-sampled data,
|
||||
// but for 8khz-sampled data, 15 may be better.
|
||||
use_energy(false),
|
||||
energy_floor(0.0),
|
||||
raw_energy(true),
|
||||
htk_compat(false),
|
||||
use_log_fbank(true),
|
||||
use_power(true) {}
|
||||
|
||||
void Register(OptionsItf *opts) {
|
||||
frame_opts.Register(opts);
|
||||
mel_opts.Register(opts);
|
||||
opts->Register("use-energy", &use_energy,
|
||||
"Add an extra dimension with energy to the FBANK output.");
|
||||
opts->Register("energy-floor", &energy_floor,
|
||||
"Floor on energy (absolute, not relative) in FBANK computation. "
|
||||
"Only makes a difference if --use-energy=true; only necessary if "
|
||||
"--dither=0.0. Suggested values: 0.1 or 1.0");
|
||||
opts->Register("raw-energy", &raw_energy,
|
||||
"If true, compute energy before preemphasis and windowing");
|
||||
opts->Register("htk-compat", &htk_compat, "If true, put energy last. "
|
||||
"Warning: not sufficient to get HTK compatible features (need "
|
||||
"to change other parameters).");
|
||||
opts->Register("use-log-fbank", &use_log_fbank,
|
||||
"If true, produce log-filterbank, else produce linear.");
|
||||
opts->Register("use-power", &use_power,
|
||||
"If true, use power, else use magnitude.");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Class for computing mel-filterbank features; see \ref feat_mfcc for more
|
||||
/// information.
|
||||
class FbankComputer {
|
||||
public:
|
||||
typedef FbankOptions Options;
|
||||
|
||||
explicit FbankComputer(const FbankOptions &opts);
|
||||
FbankComputer(const FbankComputer &other);
|
||||
|
||||
int32 Dim() const {
|
||||
return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
|
||||
}
|
||||
|
||||
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
|
||||
|
||||
const FrameExtractionOptions &GetFrameOptions() const {
|
||||
return opts_.frame_opts;
|
||||
}
|
||||
|
||||
/**
|
||||
Function that computes one frame of features from
|
||||
one frame of signal.
|
||||
|
||||
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
|
||||
prior to windowing and pre-emphasis, or
|
||||
log(numeric_limits<float>::min()), whichever is greater. Must be
|
||||
ignored by this function if this class returns false from
|
||||
this->NeedsRawLogEnergy().
|
||||
@param [in] vtln_warp The VTLN warping factor that the user wants
|
||||
to be applied when computing features for this utterance. Will
|
||||
normally be 1.0, meaning no warping is to be done. The value will
|
||||
be ignored for feature types that don't support VLTN, such as
|
||||
spectrogram features.
|
||||
@param [in] signal_frame One frame of the signal,
|
||||
as extracted using the function ExtractWindow() using the options
|
||||
returned by this->GetFrameOptions(). The function will use the
|
||||
vector as a workspace, which is why it's a non-const pointer.
|
||||
@param [out] feature Pointer to a vector of size this->Dim(), to which
|
||||
the computed feature will be written.
|
||||
*/
|
||||
void Compute(BaseFloat signal_raw_log_energy,
|
||||
BaseFloat vtln_warp,
|
||||
VectorBase<BaseFloat> *signal_frame,
|
||||
VectorBase<BaseFloat> *feature);
|
||||
|
||||
~FbankComputer();
|
||||
|
||||
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
|
||||
private:
|
||||
|
||||
|
||||
FbankOptions opts_;
|
||||
BaseFloat log_energy_floor_;
|
||||
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
|
||||
SplitRadixRealFft<BaseFloat> *srfft_;
|
||||
// Disallow assignment.
|
||||
FbankComputer &operator =(const FbankComputer &other);
|
||||
};
|
||||
|
||||
// Offline feature extractor instantiated with FbankComputer.
typedef OfflineFeatureTpl<FbankComputer> Fbank;
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
#endif // KALDI_FEAT_FEATURE_FBANK_H_
|
@ -1,362 +0,0 @@
|
||||
// feat/feature-functions.cc
|
||||
|
||||
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "feat/feature-functions.h"
|
||||
#include "matrix/matrix-functions.h"
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
// Converts, in place, a packed half complex spectrum into squared magnitudes.
// On entry *waveform is laid out as [real0, realN/2, real1, im1, real2, im2,
// ...]; on exit elements 0 .. dim/2 hold re^2 + im^2 for each bin. The
// dimension is not required to be a power of two.
void ComputePowerSpectrum(VectorBase<BaseFloat> *waveform) {
  VectorBase<BaseFloat> &spec = *waveform;
  const int32 dim = spec.Dim();
  const int32 top_bin = dim / 2;

  // Elements 0 and 1 hold the two purely real terms. Square them up front,
  // because the loop below overwrites element 1.
  const BaseFloat first_power = spec(0) * spec(0);
  const BaseFloat last_power = spec(1) * spec(1);

  // Ascending order is required: bin b is written only after the pair at
  // (2b, 2b+1) has been read, and every later pair sits at a higher index.
  for (int32 bin = 1; bin < top_bin; bin++) {
    const BaseFloat re = spec(bin * 2);
    const BaseFloat im = spec(bin * 2 + 1);
    spec(bin) = re * re + im * im;
  }

  spec(0) = first_power;
  spec(top_bin) = last_power;
}
|
||||
|
||||
|
||||
// Precomputes the filter taps for every delta order. scales_[n] holds the
// taps of the n-th order delta, obtained by combining the taps of order n-1
// with the ramp -window..window and dividing by the sum of squared ramp
// values.
DeltaFeatures::DeltaFeatures(const DeltaFeaturesOptions &opts): opts_(opts) {
  // Basic sanity checks, to catch junk option values.
  KALDI_ASSERT(opts.order >= 0 && opts.order < 1000);
  KALDI_ASSERT(opts.window > 0 && opts.window < 1000);

  scales_.resize(opts.order+1);
  // Order 0 is the trivial one-tap window, i.e. the baseline features.
  scales_[0].Resize(1);
  scales_[0](0) = 1.0;

  for (int32 n = 1; n <= opts.order; n++) {
    Vector<BaseFloat> &lower_taps = scales_[n - 1];
    Vector<BaseFloat> &taps = scales_[n];
    // 'window' is the half-width; the full delta window is 2*window + 1.
    int32 window = opts.window;
    KALDI_ASSERT(window != 0);
    const int32 lower_half = (static_cast<int32>(lower_taps.Dim()-1))/2;
    taps.Resize(lower_taps.Dim() + 2*window);  // Resize also zeroes.

    BaseFloat normalizer = 0.0;
    for (int32 j = -window; j <= window; j++) {
      normalizer += j*j;
      const BaseFloat ramp = static_cast<BaseFloat>(j);
      // 'src' indexes the lower-order taps from 0; the matching output tap is
      // shifted by j and re-centred by 'window'.
      for (int32 src = 0; src <= 2 * lower_half; src++) {
        taps(src + j + window) += ramp * lower_taps(src);
      }
    }
    taps.Scale(1.0 / normalizer);
  }
}
|
||||
|
||||
void DeltaFeatures::Process(const MatrixBase<BaseFloat> &input_feats,
|
||||
int32 frame,
|
||||
VectorBase<BaseFloat> *output_frame) const {
|
||||
KALDI_ASSERT(frame < input_feats.NumRows());
|
||||
int32 num_frames = input_feats.NumRows(),
|
||||
feat_dim = input_feats.NumCols();
|
||||
KALDI_ASSERT(static_cast<int32>(output_frame->Dim()) == feat_dim * (opts_.order+1));
|
||||
output_frame->SetZero();
|
||||
for (int32 i = 0; i <= opts_.order; i++) {
|
||||
const Vector<BaseFloat> &scales = scales_[i];
|
||||
int32 max_offset = (scales.Dim() - 1) / 2;
|
||||
SubVector<BaseFloat> output(*output_frame, i*feat_dim, feat_dim);
|
||||
for (int32 j = -max_offset; j <= max_offset; j++) {
|
||||
// if asked to read
|
||||
int32 offset_frame = frame + j;
|
||||
if (offset_frame < 0) offset_frame = 0;
|
||||
else if (offset_frame >= num_frames)
|
||||
offset_frame = num_frames - 1;
|
||||
BaseFloat scale = scales(j + max_offset);
|
||||
if (scale != 0.0)
|
||||
output.AddVec(scale, input_feats.Row(offset_frame));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Builds the single set of delta taps used for every block: a ramp from
// -window to +window divided by the sum of the squared ramp values.
ShiftedDeltaFeatures::ShiftedDeltaFeatures(
    const ShiftedDeltaFeaturesOptions &opts): opts_(opts) {
  KALDI_ASSERT(opts.window > 0 && opts.window < 1000);

  int32 window = opts.window;
  KALDI_ASSERT(window != 0);
  scales_.Resize(1 + 2*window);  // Resize also zeroes.

  BaseFloat normalizer = 0.0;
  for (int32 tap = 0; tap <= 2 * window; tap++) {
    const int32 j = tap - window;  // runs from -window to +window
    normalizer += j*j;
    scales_(tap) += static_cast<BaseFloat>(j);
  }
  scales_.Scale(1.0 / normalizer);
}
|
||||
|
||||
void ShiftedDeltaFeatures::Process(const MatrixBase<BaseFloat> &input_feats,
|
||||
int32 frame,
|
||||
SubVector<BaseFloat> *output_frame) const {
|
||||
KALDI_ASSERT(frame < input_feats.NumRows());
|
||||
int32 num_frames = input_feats.NumRows(),
|
||||
feat_dim = input_feats.NumCols();
|
||||
KALDI_ASSERT(static_cast<int32>(output_frame->Dim())
|
||||
== feat_dim * (opts_.num_blocks + 1));
|
||||
output_frame->SetZero();
|
||||
|
||||
// The original features
|
||||
SubVector<BaseFloat> output(*output_frame, 0, feat_dim);
|
||||
output.AddVec(1.0, input_feats.Row(frame));
|
||||
|
||||
// Concatenate the delta-blocks. Each block is block_shift
|
||||
// (usually 3) frames apart.
|
||||
for (int32 i = 0; i < opts_.num_blocks; i++) {
|
||||
int32 max_offset = (scales_.Dim() - 1) / 2;
|
||||
SubVector<BaseFloat> output(*output_frame, (i + 1) * feat_dim, feat_dim);
|
||||
for (int32 j = -max_offset; j <= max_offset; j++) {
|
||||
int32 offset_frame = frame + j + i * opts_.block_shift;
|
||||
if (offset_frame < 0) offset_frame = 0;
|
||||
else if (offset_frame >= num_frames)
|
||||
offset_frame = num_frames - 1;
|
||||
BaseFloat scale = scales_(j + max_offset);
|
||||
if (scale != 0.0)
|
||||
output.AddVec(scale, input_feats.Row(offset_frame));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
|
||||
const MatrixBase<BaseFloat> &input_features,
|
||||
Matrix<BaseFloat> *output_features) {
|
||||
output_features->Resize(input_features.NumRows(),
|
||||
input_features.NumCols()
|
||||
*(delta_opts.order + 1));
|
||||
DeltaFeatures delta(delta_opts);
|
||||
for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
|
||||
SubVector<BaseFloat> row(*output_features, r);
|
||||
delta.Process(input_features, r, &row);
|
||||
}
|
||||
}
|
||||
|
||||
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts,
|
||||
const MatrixBase<BaseFloat> &input_features,
|
||||
Matrix<BaseFloat> *output_features) {
|
||||
output_features->Resize(input_features.NumRows(),
|
||||
input_features.NumCols()
|
||||
* (delta_opts.num_blocks + 1));
|
||||
ShiftedDeltaFeatures delta(delta_opts);
|
||||
|
||||
for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
|
||||
SubVector<BaseFloat> row(*output_features, r);
|
||||
delta.Process(input_features, r, &row);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Fills *mat_out (resized to n_bases x dimension) with cosine bases. With
// angle = pi / (dimension-1) and scale = 1 / (2*(dimension-1)), row i holds
//   column 0:              scale
//   columns 1..dimension-2: 2 * scale * cos(angle * i * j)
//   column dimension-1:    scale * cos(angle * i * (dimension-1))
void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out) {
  const BaseFloat last_col_fl = static_cast<BaseFloat>(dimension - 1);
  BaseFloat angle = M_PI / last_col_fl;
  BaseFloat scale = 1.0f / (2.0 * last_col_fl);
  mat_out->Resize(n_bases, dimension);

  Matrix<BaseFloat> &bases = *mat_out;
  for (int32 row = 0; row < n_bases; row++) {
    const BaseFloat row_fl = static_cast<BaseFloat>(row);
    bases(row, 0) = 1.0 * scale;
    for (int32 col = 1; col < dimension - 1; col++) {
      const BaseFloat col_fl = static_cast<BaseFloat>(col);
      bases(row, col) = 2.0 * scale * cos(angle * row_fl * col_fl);
    }
    bases(row, dimension - 1) = scale * cos(angle * row_fl * last_col_fl);
  }
}
|
||||
|
||||
void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
|
||||
int32 left_context,
|
||||
int32 right_context,
|
||||
Matrix<BaseFloat> *output_features) {
|
||||
int32 T = input_features.NumRows(), D = input_features.NumCols();
|
||||
if (T == 0 || D == 0)
|
||||
KALDI_ERR << "SpliceFrames: empty input";
|
||||
KALDI_ASSERT(left_context >= 0 && right_context >= 0);
|
||||
int32 N = 1 + left_context + right_context;
|
||||
output_features->Resize(T, D*N);
|
||||
for (int32 t = 0; t < T; t++) {
|
||||
SubVector<BaseFloat> dst_row(*output_features, t);
|
||||
for (int32 j = 0; j < N; j++) {
|
||||
int32 t2 = t + j - left_context;
|
||||
if (t2 < 0) t2 = 0;
|
||||
if (t2 >= T) t2 = T-1;
|
||||
SubVector<BaseFloat> dst(dst_row, j*D, D),
|
||||
src(input_features, t2);
|
||||
dst.CopyFromVec(src);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
|
||||
Matrix<BaseFloat> *output_features) {
|
||||
int32 T = input_features.NumRows(), D = input_features.NumCols();
|
||||
if (T == 0 || D == 0)
|
||||
KALDI_ERR << "ReverseFrames: empty input";
|
||||
output_features->Resize(T, D);
|
||||
for (int32 t = 0; t < T; t++) {
|
||||
SubVector<BaseFloat> dst_row(*output_features, t);
|
||||
SubVector<BaseFloat> src_row(input_features, T-1-t);
|
||||
dst_row.CopyFromVec(src_row);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void SlidingWindowCmnOptions::Check() const {
|
||||
KALDI_ASSERT(cmn_window > 0);
|
||||
if (center)
|
||||
KALDI_ASSERT(min_window > 0 && min_window <= cmn_window);
|
||||
// else ignored so value doesn't matter.
|
||||
}
|
||||
|
||||
// Internal version of SlidingWindowCmn with double-precision arguments.
|
||||
void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts,
|
||||
const MatrixBase<double> &input,
|
||||
MatrixBase<double> *output) {
|
||||
opts.Check();
|
||||
int32 num_frames = input.NumRows(), dim = input.NumCols(),
|
||||
last_window_start = -1, last_window_end = -1,
|
||||
warning_count = 0;
|
||||
Vector<double> cur_sum(dim), cur_sumsq(dim);
|
||||
|
||||
for (int32 t = 0; t < num_frames; t++) {
|
||||
int32 window_start, window_end; // note: window_end will be one
|
||||
// past the end of the window we use for normalization.
|
||||
if (opts.center) {
|
||||
window_start = t - (opts.cmn_window / 2);
|
||||
window_end = window_start + opts.cmn_window;
|
||||
} else {
|
||||
window_start = t - opts.cmn_window;
|
||||
window_end = t + 1;
|
||||
}
|
||||
if (window_start < 0) { // shift window right if starts <0.
|
||||
window_end -= window_start;
|
||||
window_start = 0; // or: window_start -= window_start
|
||||
}
|
||||
if (!opts.center) {
|
||||
if (window_end > t)
|
||||
window_end = std::max(t + 1, opts.min_window);
|
||||
}
|
||||
if (window_end > num_frames) {
|
||||
window_start -= (window_end - num_frames);
|
||||
window_end = num_frames;
|
||||
if (window_start < 0) window_start = 0;
|
||||
}
|
||||
if (last_window_start == -1) {
|
||||
SubMatrix<double> input_part(input,
|
||||
window_start, window_end - window_start,
|
||||
0, dim);
|
||||
cur_sum.AddRowSumMat(1.0, input_part , 0.0);
|
||||
if (opts.normalize_variance)
|
||||
cur_sumsq.AddDiagMat2(1.0, input_part, kTrans, 0.0);
|
||||
} else {
|
||||
if (window_start > last_window_start) {
|
||||
KALDI_ASSERT(window_start == last_window_start + 1);
|
||||
SubVector<double> frame_to_remove(input, last_window_start);
|
||||
cur_sum.AddVec(-1.0, frame_to_remove);
|
||||
if (opts.normalize_variance)
|
||||
cur_sumsq.AddVec2(-1.0, frame_to_remove);
|
||||
}
|
||||
if (window_end > last_window_end) {
|
||||
KALDI_ASSERT(window_end == last_window_end + 1);
|
||||
SubVector<double> frame_to_add(input, last_window_end);
|
||||
cur_sum.AddVec(1.0, frame_to_add);
|
||||
if (opts.normalize_variance)
|
||||
cur_sumsq.AddVec2(1.0, frame_to_add);
|
||||
}
|
||||
}
|
||||
int32 window_frames = window_end - window_start;
|
||||
last_window_start = window_start;
|
||||
last_window_end = window_end;
|
||||
|
||||
KALDI_ASSERT(window_frames > 0);
|
||||
SubVector<double> input_frame(input, t),
|
||||
output_frame(*output, t);
|
||||
output_frame.CopyFromVec(input_frame);
|
||||
output_frame.AddVec(-1.0 / window_frames, cur_sum);
|
||||
|
||||
if (opts.normalize_variance) {
|
||||
if (window_frames == 1) {
|
||||
output_frame.Set(0.0);
|
||||
} else {
|
||||
Vector<double> variance(cur_sumsq);
|
||||
variance.Scale(1.0 / window_frames);
|
||||
variance.AddVec2(-1.0 / (window_frames * window_frames), cur_sum);
|
||||
// now "variance" is the variance of the features in the window,
|
||||
// around their own mean.
|
||||
int32 num_floored;
|
||||
variance.ApplyFloor(1.0e-10, &num_floored);
|
||||
if (num_floored > 0 && num_frames > 1) {
|
||||
if (opts.max_warnings == warning_count) {
|
||||
KALDI_WARN << "Suppressing the remaining variance flooring "
|
||||
<< "warnings. Run program with --max-warnings=-1 to "
|
||||
<< "see all warnings.";
|
||||
}
|
||||
// If opts.max_warnings is a negative number, we won't restrict the
|
||||
// number of times that the warning is printed out.
|
||||
else if (opts.max_warnings < 0
|
||||
|| opts.max_warnings > warning_count) {
|
||||
KALDI_WARN << "Flooring when normalizing variance, floored "
|
||||
<< num_floored << " elements; num-frames was "
|
||||
<< window_frames;
|
||||
}
|
||||
warning_count++;
|
||||
}
|
||||
variance.ApplyPow(-0.5); // get inverse standard deviation.
|
||||
output_frame.MulElements(variance);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts,
|
||||
const MatrixBase<BaseFloat> &input,
|
||||
MatrixBase<BaseFloat> *output) {
|
||||
KALDI_ASSERT(SameDim(input, *output) && input.NumRows() > 0);
|
||||
Matrix<double> input_dbl(input), output_dbl(input.NumRows(), input.NumCols());
|
||||
// call double-precision version
|
||||
SlidingWindowCmnInternal(opts, input_dbl, &output_dbl);
|
||||
output->CopyFromMat(output_dbl);
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // namespace kaldi
|
@ -1,204 +0,0 @@
|
||||
// feat/feature-functions.h
|
||||
|
||||
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
|
||||
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
|
||||
// 2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#ifndef KALDI_FEAT_FEATURE_FUNCTIONS_H_
|
||||
#define KALDI_FEAT_FEATURE_FUNCTIONS_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "matrix/matrix-lib.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "base/kaldi-error.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
|
||||
// ComputePowerSpectrum converts a complex FFT (as produced by the FFT
|
||||
// functions in matrix/matrix-functions.h) into
|
||||
// a power spectrum. If the complex FFT is a vector of size n (representing
|
||||
// half the complex FFT of a real signal of size n, as described there),
|
||||
// this function computes in the first (n/2) + 1 elements of it, the
|
||||
// energies of the fft bins from zero to the Nyquist frequency. Contents of the
|
||||
// remaining (n/2) - 1 elements are undefined at output.
|
||||
void ComputePowerSpectrum(VectorBase<BaseFloat> *complex_fft);
|
||||
|
||||
|
||||
struct DeltaFeaturesOptions {
|
||||
int32 order;
|
||||
int32 window; // e.g. 2; controls window size (window size is 2*window + 1)
|
||||
// the behavior at the edges is to replicate the first or last frame.
|
||||
// this is not configurable.
|
||||
|
||||
DeltaFeaturesOptions(int32 order = 2, int32 window = 2):
|
||||
order(order), window(window) { }
|
||||
void Register(OptionsItf *opts) {
|
||||
opts->Register("delta-order", &order, "Order of delta computation");
|
||||
opts->Register("delta-window", &window,
|
||||
"Parameter controlling window for delta computation (actual window"
|
||||
" size for each delta order is 1 + 2*delta-window-size)");
|
||||
}
|
||||
};
|
||||
|
||||
class DeltaFeatures {
|
||||
public:
|
||||
// This class provides a low-level function to compute delta features.
|
||||
// The function takes as input a matrix of features and a frame index
|
||||
// that it should compute the deltas on. It puts its output in an object
|
||||
// of type VectorBase, of size (original-feature-dimension) * (opts.order+1).
|
||||
// This is not the most efficient way to do the computation, but it's
|
||||
// state-free and thus easier to understand
|
||||
|
||||
explicit DeltaFeatures(const DeltaFeaturesOptions &opts);
|
||||
|
||||
void Process(const MatrixBase<BaseFloat> &input_feats,
|
||||
int32 frame,
|
||||
VectorBase<BaseFloat> *output_frame) const;
|
||||
private:
|
||||
DeltaFeaturesOptions opts_;
|
||||
std::vector<Vector<BaseFloat> > scales_; // a scaling window for each
|
||||
// of the orders, including zero: multiply the features for each
|
||||
// dimension by this window.
|
||||
};
|
||||
|
||||
struct ShiftedDeltaFeaturesOptions {
|
||||
int32 window, // The time delay and advance
|
||||
num_blocks,
|
||||
block_shift; // Distance between consecutive blocks
|
||||
|
||||
ShiftedDeltaFeaturesOptions():
|
||||
window(1), num_blocks(7), block_shift(3) { }
|
||||
void Register(OptionsItf *opts) {
|
||||
opts->Register("delta-window", &window, "Size of delta advance and delay.");
|
||||
opts->Register("num-blocks", &num_blocks, "Number of delta blocks in advance"
|
||||
" of each frame to be concatenated");
|
||||
opts->Register("block-shift", &block_shift, "Distance between each block");
|
||||
}
|
||||
};
|
||||
|
||||
class ShiftedDeltaFeatures {
|
||||
public:
|
||||
// This class provides a low-level function to compute shifted
|
||||
// delta cesptra (SDC).
|
||||
// The function takes as input a matrix of features and a frame index
|
||||
// that it should compute the deltas on. It puts its output in an object
|
||||
// of type VectorBase, of size original-feature-dimension + (1 * num_blocks).
|
||||
|
||||
explicit ShiftedDeltaFeatures(const ShiftedDeltaFeaturesOptions &opts);
|
||||
|
||||
void Process(const MatrixBase<BaseFloat> &input_feats,
|
||||
int32 frame,
|
||||
SubVector<BaseFloat> *output_frame) const;
|
||||
private:
|
||||
ShiftedDeltaFeaturesOptions opts_;
|
||||
Vector<BaseFloat> scales_; // a scaling window for each
|
||||
|
||||
};
|
||||
|
||||
// ComputeDeltas is a convenience function that computes deltas on a feature
|
||||
// file. If you want to deal with features coming in bit by bit you would have
|
||||
// to use the DeltaFeatures class directly, and do the computation frame by
|
||||
// frame. Later we will have to come up with a nice mechanism to do this for
|
||||
// features coming in.
|
||||
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
|
||||
const MatrixBase<BaseFloat> &input_features,
|
||||
Matrix<BaseFloat> *output_features);
|
||||
|
||||
// ComputeShiftedDeltas computes deltas from a feature file by applying
|
||||
// ShiftedDeltaFeatures over the frames. This function is provided for
|
||||
// convenience, however, ShiftedDeltaFeatures can be used directly.
|
||||
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts,
|
||||
const MatrixBase<BaseFloat> &input_features,
|
||||
Matrix<BaseFloat> *output_features);
|
||||
|
||||
// SpliceFrames will normally be used together with LDA.
|
||||
// It splices frames together to make a window. At the
|
||||
// start and end of an utterance, it duplicates the first
|
||||
// and last frames.
|
||||
// Will throw if input features are empty.
|
||||
// left_context and right_context must be nonnegative.
|
||||
// these both represent a number of frames (e.g. 4, 4 is
|
||||
// a good choice).
|
||||
void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
|
||||
int32 left_context,
|
||||
int32 right_context,
|
||||
Matrix<BaseFloat> *output_features);
|
||||
|
||||
// ReverseFrames reverses the frames in time (used for backwards decoding)
|
||||
void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
|
||||
Matrix<BaseFloat> *output_features);
|
||||
|
||||
|
||||
void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out);
|
||||
|
||||
|
||||
// This is used for speaker-id. Also see OnlineCmnOptions in ../online2/, which
|
||||
// is online CMN with no latency, for online speech recognition.
|
||||
struct SlidingWindowCmnOptions {
|
||||
int32 cmn_window;
|
||||
int32 min_window;
|
||||
int32 max_warnings;
|
||||
bool normalize_variance;
|
||||
bool center;
|
||||
|
||||
SlidingWindowCmnOptions():
|
||||
cmn_window(600),
|
||||
min_window(100),
|
||||
max_warnings(5),
|
||||
normalize_variance(false),
|
||||
center(false) { }
|
||||
|
||||
void Register(OptionsItf *opts) {
|
||||
opts->Register("cmn-window", &cmn_window, "Window in frames for running "
|
||||
"average CMN computation");
|
||||
opts->Register("min-cmn-window", &min_window, "Minimum CMN window "
|
||||
"used at start of decoding (adds latency only at start). "
|
||||
"Only applicable if center == false, ignored if center==true");
|
||||
opts->Register("max-warnings", &max_warnings, "Maximum warnings to report "
|
||||
"per utterance. 0 to disable, -1 to show all.");
|
||||
opts->Register("norm-vars", &normalize_variance, "If true, normalize "
|
||||
"variance to one."); // naming this as in apply-cmvn.cc
|
||||
opts->Register("center", ¢er, "If true, use a window centered on the "
|
||||
"current frame (to the extent possible, modulo end effects). "
|
||||
"If false, window is to the left.");
|
||||
}
|
||||
void Check() const;
|
||||
};
|
||||
|
||||
|
||||
/// Applies sliding-window cepstral mean and/or variance normalization. See the
|
||||
/// strings registering the options in the options class for information on how
|
||||
/// this works and what the options are. input and output must have the same
|
||||
/// dimension.
|
||||
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts,
|
||||
const MatrixBase<BaseFloat> &input,
|
||||
MatrixBase<BaseFloat> *output);
|
||||
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
|
||||
#endif // KALDI_FEAT_FEATURE_FUNCTIONS_H_
|
@ -1,157 +0,0 @@
|
||||
// feat/feature-mfcc.cc
|
||||
|
||||
// Copyright 2009-2011 Karel Vesely; Petr Motlicek
|
||||
// 2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "feat/feature-mfcc.h"
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
// Computes one frame of MFCC features from one (padded) frame of signal.
// *signal_frame is used as workspace and is overwritten; *feature must have
// dimension this->Dim().
void MfccComputer::Compute(BaseFloat signal_raw_log_energy,
                           BaseFloat vtln_warp,
                           VectorBase<BaseFloat> *signal_frame,
                           VectorBase<BaseFloat> *feature) {
  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
               feature->Dim() == this->Dim());

  const MelBanks &filterbank = *(GetMelBanks(vtln_warp));

  // When the energy is not the "raw" one, derive it from the frame we were
  // given, floored so that we never take the log of zero.
  if (opts_.use_energy && !opts_.raw_energy) {
    const BaseFloat frame_energy = VecVec(*signal_frame, *signal_frame);
    signal_raw_log_energy = Log(std::max<BaseFloat>(
        frame_energy, std::numeric_limits<float>::epsilon()));
  }

  if (srfft_ == NULL) {
    // An alternative algorithm that works for non-powers-of-two.
    RealFft(signal_frame, true);
  } else {
    // Compute FFT using the split-radix algorithm.
    srfft_->Compute(signal_frame->Data(), true);
  }

  // Convert the FFT into a power spectrum, held in the first Dim()/2 + 1
  // elements of *signal_frame.
  ComputePowerSpectrum(signal_frame);
  const int32 num_spectrum_bins = signal_frame->Dim() / 2 + 1;
  SubVector<BaseFloat> power_spectrum(*signal_frame, 0, num_spectrum_bins);

  filterbank.Compute(power_spectrum, &mel_energies_);

  // Avoid log of zero (which should be prevented anyway by dithering), then
  // take the log.
  mel_energies_.ApplyFloor(std::numeric_limits<float>::epsilon());
  mel_energies_.ApplyLog();

  // feature = dct_matrix_ * (log mel energies).  Zero first in case the
  // output held NaNs.
  feature->SetZero();
  feature->AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0);

  if (opts_.cepstral_lifter != 0.0) {
    feature->MulElements(lifter_coeffs_);
  }

  if (opts_.use_energy) {
    // Replace C0 by the (optionally floored) log energy.
    const bool below_floor = opts_.energy_floor > 0.0 &&
                             signal_raw_log_energy < log_energy_floor_;
    if (below_floor) {
      signal_raw_log_energy = log_energy_floor_;
    }
    (*feature)(0) = signal_raw_log_energy;
  }

  if (opts_.htk_compat) {
    // Move energy/C0 from the first position to the last one.
    const int32 last = opts_.num_ceps - 1;
    BaseFloat first_coeff = (*feature)(0);
    for (int32 dst = 0; dst < last; ++dst) {
      (*feature)(dst) = (*feature)(dst + 1);
    }
    if (!opts_.use_energy) {
      // Scale on C0 (actually removing a scale we previously added that's
      // part of one common definition of the cosine transform).
      first_coeff *= M_SQRT2;
    }
    (*feature)(last) = first_coeff;
  }
}
|
||||
|
||||
// Builds an MFCC computer from the given options: sets up the truncated DCT
// matrix, the optional liftering coefficients and energy floor, the optional
// split-radix FFT, and caches the mel banks for VTLN warp factor 1.0.
MfccComputer::MfccComputer(const MfccOptions &opts):
    opts_(opts),
    // Give the floor a defined value even when it is unused
    // (energy_floor <= 0): the copy constructor reads this member
    // unconditionally, and reading an uninitialized float is undefined.
    log_energy_floor_(0.0),
    srfft_(NULL),
    mel_energies_(opts.mel_opts.num_bins) {

  int32 num_bins = opts.mel_opts.num_bins;
  if (opts.num_ceps > num_bins)
    KALDI_ERR << "num-ceps cannot be larger than num-mel-bins."
              << " It should be smaller or equal. You provided num-ceps: "
              << opts.num_ceps << " and num-mel-bins: "
              << num_bins;

  Matrix<BaseFloat> dct_matrix(num_bins, num_bins);
  ComputeDctMatrix(&dct_matrix);
  // Note that we include zeroth dct in either case.  If using the
  // energy we replace this with the energy.  This means a different
  // ordering of features than HTK.
  SubMatrix<BaseFloat> dct_rows(dct_matrix, 0, opts.num_ceps, 0, num_bins);
  dct_matrix_.Resize(opts.num_ceps, num_bins);
  dct_matrix_.CopyFromMat(dct_rows);  // subset of rows.
  if (opts.cepstral_lifter != 0.0) {
    lifter_coeffs_.Resize(opts.num_ceps);
    ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
  }
  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);

  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);

  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0);
}
|
||||
|
||||
// Copy constructor.  The map of mel banks is first copied pointer-for-pointer
// and then every entry is replaced by a freshly allocated copy, so the two
// objects never share a MelBanks instance; the FFT object is likewise cloned.
// mel_energies_ is only workspace, so just its size is reproduced.
MfccComputer::MfccComputer(const MfccComputer &other)
    : opts_(other.opts_),
      lifter_coeffs_(other.lifter_coeffs_),
      dct_matrix_(other.dct_matrix_),
      log_energy_floor_(other.log_energy_floor_),
      mel_banks_(other.mel_banks_),
      srfft_(NULL),
      mel_energies_(other.mel_energies_.Dim(), kUndefined) {
  typedef std::map<BaseFloat, MelBanks*>::iterator BanksIter;
  for (BanksIter it = mel_banks_.begin(); it != mel_banks_.end(); ++it) {
    const MelBanks *shared = it->second;
    it->second = new MelBanks(*shared);
  }

  if (other.srfft_ != NULL) {
    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
  }
}
|
||||
|
||||
|
||||
|
||||
// Releases every cached MelBanks object and the FFT object (if any).
MfccComputer::~MfccComputer() {
  typedef std::map<BaseFloat, MelBanks*>::iterator BanksIter;
  BanksIter it = mel_banks_.begin();
  while (it != mel_banks_.end()) {
    delete it->second;
    ++it;
  }
  delete srfft_;
}
|
||||
|
||||
// Returns the mel banks for the given VTLN warp factor, creating and caching
// them in mel_banks_ on first use.  The returned object is owned by this
// class.
const MelBanks *MfccComputer::GetMelBanks(BaseFloat vtln_warp) {
  std::map<BaseFloat, MelBanks*>::iterator cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end()) {
    return cached->second;
  }

  // Not cached yet: build the banks for this warp factor and remember them.
  MelBanks *created = new MelBanks(opts_.mel_opts,
                                   opts_.frame_opts,
                                   vtln_warp);
  mel_banks_[vtln_warp] = created;
  return created;
}
|
||||
|
||||
|
||||
|
||||
} // namespace kaldi
|
@ -1,154 +0,0 @@
|
||||
// feat/feature-mfcc.h
|
||||
|
||||
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University
|
||||
// 2014-2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_FEATURE_MFCC_H_
|
||||
#define KALDI_FEAT_FEATURE_MFCC_H_
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "feat/feature-common.h"
|
||||
#include "feat/feature-functions.h"
|
||||
#include "feat/feature-window.h"
|
||||
#include "feat/mel-computations.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
|
||||
/// MfccOptions contains basic options for computing MFCC features.
|
||||
struct MfccOptions {
|
||||
FrameExtractionOptions frame_opts;
|
||||
MelBanksOptions mel_opts;
|
||||
int32 num_ceps; // e.g. 13: num cepstral coeffs, counting zero.
|
||||
bool use_energy; // use energy; else C0
|
||||
BaseFloat energy_floor; // 0 by default; set to a value like 1.0 or 0.1 if
|
||||
// you disable dithering.
|
||||
bool raw_energy; // If true, compute energy before preemphasis and windowing
|
||||
BaseFloat cepstral_lifter; // Scaling factor on cepstra for HTK compatibility.
|
||||
// if 0.0, no liftering is done.
|
||||
bool htk_compat; // if true, put energy/C0 last and introduce a factor of
|
||||
// sqrt(2) on C0 to be the same as HTK.
|
||||
|
||||
MfccOptions() : mel_opts(23),
|
||||
// defaults the #mel-banks to 23 for the MFCC computations.
|
||||
// this seems to be common for 16khz-sampled data,
|
||||
// but for 8khz-sampled data, 15 may be better.
|
||||
num_ceps(13),
|
||||
use_energy(true),
|
||||
energy_floor(0.0),
|
||||
raw_energy(true),
|
||||
cepstral_lifter(22.0),
|
||||
htk_compat(false) {}
|
||||
|
||||
void Register(OptionsItf *opts) {
|
||||
frame_opts.Register(opts);
|
||||
mel_opts.Register(opts);
|
||||
opts->Register("num-ceps", &num_ceps,
|
||||
"Number of cepstra in MFCC computation (including C0)");
|
||||
opts->Register("use-energy", &use_energy,
|
||||
"Use energy (not C0) in MFCC computation");
|
||||
opts->Register("energy-floor", &energy_floor,
|
||||
"Floor on energy (absolute, not relative) in MFCC computation. "
|
||||
"Only makes a difference if --use-energy=true; only necessary if "
|
||||
"--dither=0.0. Suggested values: 0.1 or 1.0");
|
||||
opts->Register("raw-energy", &raw_energy,
|
||||
"If true, compute energy before preemphasis and windowing");
|
||||
opts->Register("cepstral-lifter", &cepstral_lifter,
|
||||
"Constant that controls scaling of MFCCs");
|
||||
opts->Register("htk-compat", &htk_compat,
|
||||
"If true, put energy or C0 last and use a factor of sqrt(2) on "
|
||||
"C0. Warning: not sufficient to get HTK compatible features "
|
||||
"(need to change other parameters).");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
// This is the new-style interface to the MFCC computation.
|
||||
class MfccComputer {
|
||||
public:
|
||||
typedef MfccOptions Options;
|
||||
explicit MfccComputer(const MfccOptions &opts);
|
||||
MfccComputer(const MfccComputer &other);
|
||||
|
||||
const FrameExtractionOptions &GetFrameOptions() const {
|
||||
return opts_.frame_opts;
|
||||
}
|
||||
|
||||
int32 Dim() const { return opts_.num_ceps; }
|
||||
|
||||
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
|
||||
|
||||
/**
|
||||
Function that computes one frame of features from
|
||||
one frame of signal.
|
||||
|
||||
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
|
||||
prior to windowing and pre-emphasis, or
|
||||
log(numeric_limits<float>::min()), whichever is greater. Must be
|
||||
ignored by this function if this class returns false from
|
||||
this->NeedsRawLogEnergy().
|
||||
@param [in] vtln_warp The VTLN warping factor that the user wants
|
||||
to be applied when computing features for this utterance. Will
|
||||
normally be 1.0, meaning no warping is to be done. The value will
|
||||
be ignored for feature types that don't support VLTN, such as
|
||||
spectrogram features.
|
||||
@param [in] signal_frame One frame of the signal,
|
||||
as extracted using the function ExtractWindow() using the options
|
||||
returned by this->GetFrameOptions(). The function will use the
|
||||
vector as a workspace, which is why it's a non-const pointer.
|
||||
@param [out] feature Pointer to a vector of size this->Dim(), to which
|
||||
the computed feature will be written.
|
||||
*/
|
||||
void Compute(BaseFloat signal_raw_log_energy,
|
||||
BaseFloat vtln_warp,
|
||||
VectorBase<BaseFloat> *signal_frame,
|
||||
VectorBase<BaseFloat> *feature);
|
||||
|
||||
~MfccComputer();
|
||||
private:
|
||||
// disallow assignment.
|
||||
MfccComputer &operator = (const MfccComputer &in);
|
||||
|
||||
protected:
|
||||
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
|
||||
|
||||
MfccOptions opts_;
|
||||
Vector<BaseFloat> lifter_coeffs_;
|
||||
Matrix<BaseFloat> dct_matrix_; // matrix we left-multiply by to perform DCT.
|
||||
BaseFloat log_energy_floor_;
|
||||
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
|
||||
SplitRadixRealFft<BaseFloat> *srfft_;
|
||||
|
||||
// note: mel_energies_ is specific to the frame we're processing, it's
|
||||
// just a temporary workspace.
|
||||
Vector<BaseFloat> mel_energies_;
|
||||
};
|
||||
|
||||
typedef OfflineFeatureTpl<MfccComputer> Mfcc;
|
||||
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
#endif // KALDI_FEAT_FEATURE_MFCC_H_
|
@ -1,191 +0,0 @@
|
||||
// feat/feature-plp.cc
|
||||
|
||||
// Copyright 2009-2011 Petr Motlicek; Karel Vesely
|
||||
// 2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "feat/feature-plp.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
// Builds a PLP computer from the given options: sizes the per-frame
// workspaces, sets up the optional liftering coefficients, the IDFT bases,
// the optional energy floor and split-radix FFT, and caches the mel banks
// for VTLN warp factor 1.0.
PlpComputer::PlpComputer(const PlpOptions &opts):
    opts_(opts),
    // Give the floor a defined value even when it is unused
    // (energy_floor <= 0): the copy constructor reads this member
    // unconditionally, and reading an uninitialized float is undefined.
    log_energy_floor_(0.0),
    srfft_(NULL),
    mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
    autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
    lpc_coeffs_(opts_.lpc_order, kUndefined),
    raw_cepstrum_(opts_.lpc_order, kUndefined) {

  if (opts.cepstral_lifter != 0.0) {
    lifter_coeffs_.Resize(opts.num_ceps);
    ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
  }
  InitIdftBases(opts_.lpc_order + 1, opts_.mel_opts.num_bins + 2,
                &idft_bases_);

  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);

  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);

  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0);
}
|
||||
|
||||
// Copy constructor.  The two caches (mel banks and equal-loudness vectors)
// are first copied pointer-for-pointer and then every entry is replaced by a
// freshly allocated copy, so the two objects never share a cached instance;
// the FFT object is likewise cloned.  The per-frame workspaces are only
// re-sized, not copied.
PlpComputer::PlpComputer(const PlpComputer &other)
    : opts_(other.opts_),
      lifter_coeffs_(other.lifter_coeffs_),
      idft_bases_(other.idft_bases_),
      log_energy_floor_(other.log_energy_floor_),
      mel_banks_(other.mel_banks_),
      equal_loudness_(other.equal_loudness_),
      srfft_(NULL),
      mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
      autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
      lpc_coeffs_(opts_.lpc_order, kUndefined),
      raw_cepstrum_(opts_.lpc_order, kUndefined) {
  typedef std::map<BaseFloat, MelBanks*>::iterator BanksIter;
  for (BanksIter it = mel_banks_.begin(); it != mel_banks_.end(); ++it) {
    const MelBanks *shared = it->second;
    it->second = new MelBanks(*shared);
  }

  typedef std::map<BaseFloat, Vector<BaseFloat>*>::iterator LoudnessIter;
  for (LoudnessIter it = equal_loudness_.begin();
       it != equal_loudness_.end(); ++it) {
    const Vector<BaseFloat> *shared = it->second;
    it->second = new Vector<BaseFloat>(*shared);
  }

  if (other.srfft_ != NULL) {
    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
  }
}
|
||||
|
||||
// Releases every cached MelBanks object and equal-loudness vector, and the
// FFT object (if any).
PlpComputer::~PlpComputer() {
  typedef std::map<BaseFloat, MelBanks*>::iterator BanksIter;
  BanksIter banks_it = mel_banks_.begin();
  while (banks_it != mel_banks_.end()) {
    delete banks_it->second;
    ++banks_it;
  }

  typedef std::map<BaseFloat, Vector<BaseFloat>* >::iterator LoudnessIter;
  LoudnessIter loudness_it = equal_loudness_.begin();
  while (loudness_it != equal_loudness_.end()) {
    delete loudness_it->second;
    ++loudness_it;
  }

  delete srfft_;
}
|
||||
|
||||
// Returns the mel banks for the given VTLN warp factor, creating and caching
// them in mel_banks_ on first use.  The returned object is owned by this
// class.
const MelBanks *PlpComputer::GetMelBanks(BaseFloat vtln_warp) {
  std::map<BaseFloat, MelBanks*>::iterator cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end()) {
    return cached->second;
  }

  // Not cached yet: build the banks for this warp factor and remember them.
  MelBanks *created = new MelBanks(opts_.mel_opts,
                                   opts_.frame_opts,
                                   vtln_warp);
  mel_banks_[vtln_warp] = created;
  return created;
}
|
||||
|
||||
// Returns the equal-loudness vector for the given VTLN warp factor, computing
// it from the corresponding mel banks and caching it in equal_loudness_ on
// first use.  The returned object is owned by this class.
const Vector<BaseFloat> *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) {
  // Fetched unconditionally, which also makes sure the banks are cached.
  const MelBanks *banks = GetMelBanks(vtln_warp);

  std::map<BaseFloat, Vector<BaseFloat>*>::iterator cached =
      equal_loudness_.find(vtln_warp);
  if (cached != equal_loudness_.end()) {
    return cached->second;
  }

  Vector<BaseFloat> *loudness = new Vector<BaseFloat>;
  GetEqualLoudnessVector(*banks, loudness);
  equal_loudness_[vtln_warp] = loudness;
  return loudness;
}
|
||||
|
||||
// Computes one frame of PLP features from one (padded) frame of signal.
// *signal_frame is used as workspace and is overwritten; *feature must have
// dimension this->Dim().
void PlpComputer::Compute(BaseFloat signal_raw_log_energy,
                          BaseFloat vtln_warp,
                          VectorBase<BaseFloat> *signal_frame,
                          VectorBase<BaseFloat> *feature) {
  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
               feature->Dim() == this->Dim());

  const MelBanks &filterbank = *GetMelBanks(vtln_warp);
  const Vector<BaseFloat> &loudness_weights = *GetEqualLoudness(vtln_warp);

  // Our num-ceps includes C0.
  KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1);

  // When the energy is not the "raw" one, derive it from the frame we were
  // given, floored so that we never take the log of zero.
  if (opts_.use_energy && !opts_.raw_energy) {
    const BaseFloat frame_energy = VecVec(*signal_frame, *signal_frame);
    signal_raw_log_energy = Log(std::max<BaseFloat>(
        frame_energy, std::numeric_limits<float>::min()));
  }

  if (srfft_ == NULL) {
    // An alternative algorithm that works for non-powers-of-two.
    RealFft(signal_frame, true);
  } else {
    // Compute FFT using split-radix algorithm.
    srfft_->Compute(signal_frame->Data(), true);
  }

  // Convert the FFT into a power spectrum, held in elements
  // 0 ... signal_frame->Dim()/2.
  ComputePowerSpectrum(signal_frame);
  const int32 num_spectrum_bins = signal_frame->Dim() / 2 + 1;
  SubVector<BaseFloat> power_spectrum(*signal_frame, 0, num_spectrum_bins);

  const int32 num_mel_bins = opts_.mel_opts.num_bins;

  // The mel energies live in the middle of mel_energies_duplicated_, leaving
  // one spare element on each side.
  SubVector<BaseFloat> mel_energies(mel_energies_duplicated_, 1, num_mel_bins);
  filterbank.Compute(power_spectrum, &mel_energies);
  mel_energies.MulElements(loudness_weights);
  mel_energies.ApplyPow(opts_.compress_factor);

  // Duplicate first and last elements into the spare slots.
  mel_energies_duplicated_(0) = mel_energies_duplicated_(1);
  mel_energies_duplicated_(num_mel_bins + 1) =
      mel_energies_duplicated_(num_mel_bins);

  // Zero first, in case of NaNs or infs.
  autocorr_coeffs_.SetZero();
  autocorr_coeffs_.AddMatVec(1.0, idft_bases_, kNoTrans,
                             mel_energies_duplicated_, 0.0);

  // NOTE(review): the value is named as a *log* energy yet it is floored at
  // the smallest positive normalized float; confirm this is intended.
  const BaseFloat residual_log_energy = std::max<BaseFloat>(
      ComputeLpc(autocorr_coeffs_, &lpc_coeffs_),
      std::numeric_limits<float>::min());

  Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs_.Data(), raw_cepstrum_.Data());
  feature->Range(1, opts_.num_ceps - 1).CopyFromVec(
      raw_cepstrum_.Range(0, opts_.num_ceps - 1));
  (*feature)(0) = residual_log_energy;

  if (opts_.cepstral_lifter != 0.0) {
    feature->MulElements(lifter_coeffs_);
  }

  if (opts_.cepstral_scale != 1.0) {
    feature->Scale(opts_.cepstral_scale);
  }

  if (opts_.use_energy) {
    // Replace the zeroth feature by the (optionally floored) log energy.
    const bool below_floor = opts_.energy_floor > 0.0 &&
                             signal_raw_log_energy < log_energy_floor_;
    if (below_floor) {
      signal_raw_log_energy = log_energy_floor_;
    }
    (*feature)(0) = signal_raw_log_energy;
  }

  if (opts_.htk_compat) {
    // Reorder the features: move the zeroth one to the end.
    const int32 last = opts_.num_ceps - 1;
    const BaseFloat first_coeff = (*feature)(0);
    for (int32 dst = 0; dst < last; ++dst) {
      (*feature)(dst) = (*feature)(dst + 1);
    }
    (*feature)(last) = first_coeff;
  }
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
@ -1,176 +0,0 @@
|
||||
// feat/feature-plp.h
|
||||
|
||||
// Copyright 2009-2011 Petr Motlicek; Karel Vesely
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_FEATURE_PLP_H_
|
||||
#define KALDI_FEAT_FEATURE_PLP_H_
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "feat/feature-common.h"
|
||||
#include "feat/feature-functions.h"
|
||||
#include "feat/feature-window.h"
|
||||
#include "feat/mel-computations.h"
|
||||
#include "util/options-itf.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
|
||||
|
||||
/// PlpOptions contains basic options for computing PLP features.
|
||||
/// It only includes things that can be done in a "stateless" way, i.e.
|
||||
/// it does not include energy max-normalization.
|
||||
/// It does not include delta computation.
|
||||
struct PlpOptions {
|
||||
FrameExtractionOptions frame_opts;
|
||||
MelBanksOptions mel_opts;
|
||||
int32 lpc_order;
|
||||
int32 num_ceps; // num cepstra including zero
|
||||
bool use_energy; // use energy; else C0
|
||||
BaseFloat energy_floor;
|
||||
bool raw_energy; // If true, compute energy before preemphasis and windowing
|
||||
BaseFloat compress_factor;
|
||||
int32 cepstral_lifter;
|
||||
BaseFloat cepstral_scale;
|
||||
|
||||
bool htk_compat; // if true, put energy/C0 last and introduce a factor of
|
||||
// sqrt(2) on C0 to be the same as HTK.
|
||||
|
||||
PlpOptions() : mel_opts(23),
|
||||
// default number of mel-banks for the PLP computation; this
|
||||
// seems to be common for 16kHz-sampled data. For 8kHz-sampled
|
||||
// data, 15 may be better.
|
||||
lpc_order(12),
|
||||
num_ceps(13),
|
||||
use_energy(true),
|
||||
energy_floor(0.0),
|
||||
raw_energy(true),
|
||||
compress_factor(0.33333),
|
||||
cepstral_lifter(22),
|
||||
cepstral_scale(1.0),
|
||||
htk_compat(false) {}
|
||||
|
||||
void Register(OptionsItf *opts) {
|
||||
frame_opts.Register(opts);
|
||||
mel_opts.Register(opts);
|
||||
opts->Register("lpc-order", &lpc_order,
|
||||
"Order of LPC analysis in PLP computation");
|
||||
opts->Register("num-ceps", &num_ceps,
|
||||
"Number of cepstra in PLP computation (including C0)");
|
||||
opts->Register("use-energy", &use_energy,
|
||||
"Use energy (not C0) for zeroth PLP feature");
|
||||
opts->Register("energy-floor", &energy_floor,
|
||||
"Floor on energy (absolute, not relative) in PLP computation. "
|
||||
"Only makes a difference if --use-energy=true; only necessary if "
|
||||
"--dither=0.0. Suggested values: 0.1 or 1.0");
|
||||
opts->Register("raw-energy", &raw_energy,
|
||||
"If true, compute energy before preemphasis and windowing");
|
||||
opts->Register("compress-factor", &compress_factor,
|
||||
"Compression factor in PLP computation");
|
||||
opts->Register("cepstral-lifter", &cepstral_lifter,
|
||||
"Constant that controls scaling of PLPs");
|
||||
opts->Register("cepstral-scale", &cepstral_scale,
|
||||
"Scaling constant in PLP computation");
|
||||
opts->Register("htk-compat", &htk_compat,
|
||||
"If true, put energy or C0 last. Warning: not sufficient "
|
||||
"to get HTK compatible features (need to change other "
|
||||
"parameters).");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// This is the new-style interface to the PLP computation.
|
||||
class PlpComputer {
|
||||
public:
|
||||
typedef PlpOptions Options;
|
||||
explicit PlpComputer(const PlpOptions &opts);
|
||||
PlpComputer(const PlpComputer &other);
|
||||
|
||||
const FrameExtractionOptions &GetFrameOptions() const {
|
||||
return opts_.frame_opts;
|
||||
}
|
||||
|
||||
int32 Dim() const { return opts_.num_ceps; }
|
||||
|
||||
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
|
||||
|
||||
/**
|
||||
Function that computes one frame of features from
|
||||
one frame of signal.
|
||||
|
||||
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
|
||||
prior to windowing and pre-emphasis, or
|
||||
log(numeric_limits<float>::min()), whichever is greater. Must be
|
||||
ignored by this function if this class returns false from
|
||||
this->NeedsRawLogEnergy().
|
||||
@param [in] vtln_warp The VTLN warping factor that the user wants
|
||||
to be applied when computing features for this utterance. Will
|
||||
normally be 1.0, meaning no warping is to be done. The value will
|
||||
be ignored for feature types that don't support VLTN, such as
|
||||
spectrogram features.
|
||||
@param [in] signal_frame One frame of the signal,
|
||||
as extracted using the function ExtractWindow() using the options
|
||||
returned by this->GetFrameOptions(). The function will use the
|
||||
vector as a workspace, which is why it's a non-const pointer.
|
||||
@param [out] feature Pointer to a vector of size this->Dim(), to which
|
||||
the computed feature will be written.
|
||||
*/
|
||||
void Compute(BaseFloat signal_raw_log_energy,
|
||||
BaseFloat vtln_warp,
|
||||
VectorBase<BaseFloat> *signal_frame,
|
||||
VectorBase<BaseFloat> *feature);
|
||||
|
||||
~PlpComputer();
|
||||
private:
|
||||
|
||||
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
|
||||
|
||||
const Vector<BaseFloat> *GetEqualLoudness(BaseFloat vtln_warp);
|
||||
|
||||
PlpOptions opts_;
|
||||
Vector<BaseFloat> lifter_coeffs_;
|
||||
Matrix<BaseFloat> idft_bases_;
|
||||
BaseFloat log_energy_floor_;
|
||||
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
|
||||
std::map<BaseFloat, Vector<BaseFloat>* > equal_loudness_;
|
||||
SplitRadixRealFft<BaseFloat> *srfft_;
|
||||
|
||||
// temporary vector used inside Compute; size is opts_.mel_opts.num_bins + 2
|
||||
Vector<BaseFloat> mel_energies_duplicated_;
|
||||
// temporary vector used inside Compute; size is opts_.lpc_order + 1
|
||||
Vector<BaseFloat> autocorr_coeffs_;
|
||||
// temporary vector used inside Compute; size is opts_.lpc_order
|
||||
Vector<BaseFloat> lpc_coeffs_;
|
||||
// temporary vector used inside Compute; size is opts_.lpc_order
|
||||
Vector<BaseFloat> raw_cepstrum_;
|
||||
|
||||
// Disallow assignment.
|
||||
PlpComputer &operator =(const PlpComputer &other);
|
||||
};
|
||||
|
||||
typedef OfflineFeatureTpl<PlpComputer> Plp;
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
#endif // KALDI_FEAT_FEATURE_PLP_H_
|
@ -1,82 +0,0 @@
|
||||
// feat/feature-spectrogram.cc
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// Copyright 2012 Navdeep Jaitly
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "feat/feature-spectrogram.h"
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
// Builds a spectrogram computer from 'opts'.
//
// log_energy_floor_ always receives a defined value.  Previously it was only
// assigned when opts.energy_floor > 0.0, so copy-constructing a computer built
// without a floor read an indeterminate float.  Minus infinity means "no
// floor"; Compute() only consults the member when opts_.energy_floor > 0.0.
//
// The split-radix FFT helper is allocated only when the padded window size
// passes the power-of-two bit test below; otherwise srfft_ stays NULL.
SpectrogramComputer::SpectrogramComputer(const SpectrogramOptions &opts)
    : opts_(opts),
      log_energy_floor_(-std::numeric_limits<BaseFloat>::infinity()),
      srfft_(NULL) {
  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);

  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size - 1)) == 0)  // Is a power of two
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
}
|
||||
|
||||
// Copy constructor: duplicates the options and the log-energy floor, and gives
// the copy its own FFT helper (copy-constructed from other's) when 'other' has
// one, so the two objects never share the pointer.
SpectrogramComputer::SpectrogramComputer(const SpectrogramComputer &other)
    : opts_(other.opts_),
      log_energy_floor_(other.log_energy_floor_),
      srfft_(other.srfft_ == NULL
                 ? static_cast<SplitRadixRealFft<BaseFloat>*>(NULL)
                 : new SplitRadixRealFft<BaseFloat>(*other.srfft_)) {}
|
||||
|
||||
// Frees the FFT helper owned by this object; deleting a NULL pointer is a
// no-op, so no check is needed.
SpectrogramComputer::~SpectrogramComputer() { delete srfft_; }
|
||||
|
||||
// Computes the log power spectrum of one frame and writes it to 'feature',
// with element 0 replaced by the (possibly floored) log-energy of the frame.
// 'signal_frame' is overwritten: it is transformed in place.  'vtln_warp' is
// not referenced by this function.
void SpectrogramComputer::Compute(BaseFloat signal_raw_log_energy,
                                  BaseFloat vtln_warp,
                                  VectorBase<BaseFloat> *signal_frame,
                                  VectorBase<BaseFloat> *feature) {
  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
               feature->Dim() == this->Dim());

  const float min_value = std::numeric_limits<float>::epsilon();

  // Unless the raw energy was requested, replace the value passed in by the
  // energy of the frame as received here (i.e. after the window function).
  if (!opts_.raw_energy) {
    signal_raw_log_energy = Log(
        std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame), min_value));
  }

  if (srfft_ == NULL) {
    // An alternative algorithm that works for non-powers-of-two.
    RealFft(signal_frame, true);
  } else {
    // Split-radix FFT.
    srfft_->Compute(signal_frame->Data(), true);
  }

  // Turn the FFT output into a power spectrum, then view its first
  // Dim()/2 + 1 elements.
  ComputePowerSpectrum(signal_frame);
  SubVector<BaseFloat> power_spectrum(*signal_frame,
                                      0, signal_frame->Dim() / 2 + 1);

  power_spectrum.ApplyFloor(min_value);
  power_spectrum.ApplyLog();

  feature->CopyFromVec(power_spectrum);

  // The zeroth spectrogram component is always set to the signal energy,
  // instead of the square of the constant component of the signal.  The floor
  // is applied only when one was configured.
  const bool below_floor = opts_.energy_floor > 0.0 &&
                           signal_raw_log_energy < log_energy_floor_;
  (*feature)(0) = below_floor ? log_energy_floor_ : signal_raw_log_energy;
}
|
||||
|
||||
} // namespace kaldi
|
@ -1,117 +0,0 @@
|
||||
// feat/feature-spectrogram.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// Copyright 2012 Navdeep Jaitly
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_FEATURE_SPECTROGRAM_H_
|
||||
#define KALDI_FEAT_FEATURE_SPECTROGRAM_H_
|
||||
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "feat/feature-common.h"
|
||||
#include "feat/feature-functions.h"
|
||||
#include "feat/feature-window.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
|
||||
/// SpectrogramOptions contains basic options for computing spectrogram
|
||||
/// features.
|
||||
struct SpectrogramOptions {
|
||||
FrameExtractionOptions frame_opts;
|
||||
BaseFloat energy_floor;
|
||||
bool raw_energy; // If true, compute energy before preemphasis and windowing
|
||||
|
||||
SpectrogramOptions() :
|
||||
energy_floor(0.0),
|
||||
raw_energy(true) {}
|
||||
|
||||
void Register(OptionsItf *opts) {
|
||||
frame_opts.Register(opts);
|
||||
opts->Register("energy-floor", &energy_floor,
|
||||
"Floor on energy (absolute, not relative) in Spectrogram "
|
||||
"computation. Caution: this floor is applied to the zeroth "
|
||||
"component, representing the total signal energy. The "
|
||||
"floor on the individual spectrogram elements is fixed at "
|
||||
"std::numeric_limits<float>::epsilon().");
|
||||
opts->Register("raw-energy", &raw_energy,
|
||||
"If true, compute energy before preemphasis and windowing");
|
||||
}
|
||||
};
|
||||
|
||||
/// Class for computing spectrogram features.
|
||||
class SpectrogramComputer {
|
||||
public:
|
||||
typedef SpectrogramOptions Options;
|
||||
explicit SpectrogramComputer(const SpectrogramOptions &opts);
|
||||
SpectrogramComputer(const SpectrogramComputer &other);
|
||||
|
||||
const FrameExtractionOptions& GetFrameOptions() const {
|
||||
return opts_.frame_opts;
|
||||
}
|
||||
|
||||
int32 Dim() const { return opts_.frame_opts.PaddedWindowSize() / 2 + 1; }
|
||||
|
||||
bool NeedRawLogEnergy() const { return opts_.raw_energy; }
|
||||
|
||||
|
||||
/**
|
||||
Function that computes one frame of spectrogram features from
|
||||
one frame of signal.
|
||||
|
||||
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
|
||||
prior to windowing and pre-emphasis, or
|
||||
log(numeric_limits<float>::min()), whichever is greater. Must be
|
||||
ignored by this function if this class returns false from
|
||||
this->NeedsRawLogEnergy().
|
||||
@param [in] vtln_warp This is ignored by this function, it's only
|
||||
needed for interface compatibility.
|
||||
@param [in] signal_frame One frame of the signal,
|
||||
as extracted using the function ExtractWindow() using the options
|
||||
returned by this->GetFrameOptions(). The function will use the
|
||||
vector as a workspace, which is why it's a non-const pointer.
|
||||
@param [out] feature Pointer to a vector of size this->Dim(), to which
|
||||
the computed feature will be written.
|
||||
*/
|
||||
void Compute(BaseFloat signal_raw_log_energy,
|
||||
BaseFloat vtln_warp,
|
||||
VectorBase<BaseFloat> *signal_frame,
|
||||
VectorBase<BaseFloat> *feature);
|
||||
|
||||
~SpectrogramComputer();
|
||||
|
||||
private:
|
||||
SpectrogramOptions opts_;
|
||||
BaseFloat log_energy_floor_;
|
||||
SplitRadixRealFft<BaseFloat> *srfft_;
|
||||
|
||||
// Disallow assignment.
|
||||
SpectrogramComputer &operator=(const SpectrogramComputer &other);
|
||||
};
|
||||
|
||||
typedef OfflineFeatureTpl<SpectrogramComputer> Spectrogram;
|
||||
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
#endif // KALDI_FEAT_FEATURE_SPECTROGRAM_H_
|
@ -1,222 +0,0 @@
|
||||
// feat/feature-window.cc
|
||||
|
||||
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
|
||||
// 2013-2016 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "feat/feature-window.h"
|
||||
#include "matrix/matrix-functions.h"
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
// Returns the index of the first sample of frame 'frame'.  With snip_edges the
// frames simply start at multiples of the window shift.  Otherwise each frame
// is centred on (shift * frame + shift / 2), so the start is half a window
// before that midpoint and can be negative.
int64 FirstSampleOfFrame(int32 frame,
                         const FrameExtractionOptions &opts) {
  const int64 shift = opts.WindowShift();
  if (!opts.snip_edges) {
    const int64 frame_midpoint = shift * frame + shift / 2;
    return frame_midpoint - opts.WindowSize() / 2;
  }
  return frame * shift;
}
|
||||
|
||||
// Returns how many frames can be extracted from 'num_samples' samples.
// 'flush' only matters when opts.snip_edges is false: if it is false, frames
// that would extend past the end of the data seen so far are not counted.
int32 NumFrames(int64 num_samples,
                const FrameExtractionOptions &opts,
                bool flush) {
  const int64 shift = opts.WindowShift();
  const int64 length = opts.WindowSize();

  if (opts.snip_edges) {
    // HTK-like approach (the default): every frame has to fit completely into
    // the waveform, and the first frame begins at sample zero.
    if (num_samples < length)
      return 0;
    // 'num_samples - length' is the room available for shifting the frame
    // within the waveform, and integer division by the shift (which rounds
    // down) says how many times it can be shifted.
    return (1 + ((num_samples - length) / shift));
  }

  // snip_edges == false: the frame count is (file-length / frame-shift)
  // rounded to the nearest integer, which makes it an obvious and predictable
  // function of the frame shift and signal length.  Integer division in C++
  // rounds toward zero, hence half the shift is added before dividing.
  int32 frame_count = (num_samples + (shift / 2)) / shift;
  if (flush)
    return frame_count;

  // 'end' means one past the last sample of the last frame.
  int64 last_frame_end = FirstSampleOfFrame(frame_count - 1, opts) + length;

  // Without flushing, frames extending past the end of the signal cannot be
  // output yet: drop them one at a time (written for clarity, not speed).
  for (; frame_count > 0 && last_frame_end > num_samples; frame_count--)
    last_frame_end -= shift;
  return frame_count;
}
|
||||
|
||||
|
||||
// Adds Gaussian noise, scaled by 'dither_value', to every sample of
// 'waveform'.  A dither value of exactly 0.0 leaves the waveform untouched.
void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value) {
  if (dither_value != 0.0) {
    BaseFloat *sample = waveform->Data();
    BaseFloat *const end = sample + waveform->Dim();
    RandomState rstate;
    for (; sample != end; ++sample)
      *sample += RandGauss(&rstate) * dither_value;
  }
}
|
||||
|
||||
|
||||
// Applies preemphasis in place: each sample has preemph_coeff times its
// predecessor subtracted from it.  The first sample has no predecessor, so it
// uses itself.  A coefficient of exactly 0.0 is a no-op; any other value must
// lie in [0, 1].
void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff) {
  if (preemph_coeff == 0.0) return;
  KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
  const int32 dim = waveform->Dim();
  // Nothing to filter.  Without this guard the final statement would index
  // element 0 of an empty vector.
  if (dim == 0) return;
  // Walk backwards so each update still sees the unmodified predecessor.
  for (int32 i = dim - 1; i > 0; i--)
    (*waveform)(i) -= preemph_coeff * (*waveform)(i-1);
  (*waveform)(0) -= preemph_coeff * (*waveform)(0);
}
|
||||
|
||||
// Fills 'window' with the windowing function selected by opts.window_type,
// sized to opts.WindowSize().  An unrecognized window type is reported through
// KALDI_ERR.
//
// The window type is resolved once, before the loop, instead of running up to
// five string comparisons per sample.  The per-sample arithmetic is unchanged.
//
// NOTE(review): with frame_length == 1 the divisor (frame_length - 1) is zero;
// that case is left as it was -- confirm whether it needs handling.
FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) {
  int32 frame_length = opts.WindowSize();
  KALDI_ASSERT(frame_length > 0);
  window.Resize(frame_length);

  enum WindowShape {
    kShapeHanning, kShapeHamming, kShapePovey, kShapeRectangular, kShapeBlackman
  };
  // Initialized only to keep compilers quiet; every path below either assigns
  // it or reports an error.
  WindowShape shape = kShapeRectangular;
  if (opts.window_type == "hanning") {
    shape = kShapeHanning;
  } else if (opts.window_type == "hamming") {
    shape = kShapeHamming;
  } else if (opts.window_type == "povey") {  // like hamming but goes to zero at edges.
    shape = kShapePovey;
  } else if (opts.window_type == "rectangular") {
    shape = kShapeRectangular;
  } else if (opts.window_type == "blackman") {
    shape = kShapeBlackman;
  } else {
    KALDI_ERR << "Invalid window type " << opts.window_type;
  }

  double a = M_2PI / (frame_length-1);
  for (int32 i = 0; i < frame_length; i++) {
    double i_fl = static_cast<double>(i);
    switch (shape) {
      case kShapeHanning:
        window(i) = 0.5 - 0.5*cos(a * i_fl);
        break;
      case kShapeHamming:
        window(i) = 0.54 - 0.46*cos(a * i_fl);
        break;
      case kShapePovey:
        window(i) = pow(0.5 - 0.5*cos(a * i_fl), 0.85);
        break;
      case kShapeRectangular:
        window(i) = 1.0;
        break;
      case kShapeBlackman:
        window(i) = opts.blackman_coeff - 0.5*cos(a * i_fl) +
            (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
        break;
    }
  }
}
|
||||
|
||||
void ProcessWindow(const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
VectorBase<BaseFloat> *window,
|
||||
BaseFloat *log_energy_pre_window) {
|
||||
int32 frame_length = opts.WindowSize();
|
||||
KALDI_ASSERT(window->Dim() == frame_length);
|
||||
|
||||
if (opts.dither != 0.0)
|
||||
Dither(window, opts.dither);
|
||||
|
||||
if (opts.remove_dc_offset)
|
||||
window->Add(-window->Sum() / frame_length);
|
||||
|
||||
if (log_energy_pre_window != NULL) {
|
||||
BaseFloat energy = std::max<BaseFloat>(VecVec(*window, *window),
|
||||
std::numeric_limits<float>::epsilon());
|
||||
*log_energy_pre_window = Log(energy);
|
||||
}
|
||||
|
||||
if (opts.preemph_coeff != 0.0)
|
||||
Preemphasize(window, opts.preemph_coeff);
|
||||
|
||||
window->MulElements(window_function.window);
|
||||
}
|
||||
|
||||
|
||||
// ExtractWindow extracts a windowed frame of waveform with a power-of-two,
|
||||
// padded size. It does mean subtraction, pre-emphasis and dithering as
|
||||
// requested.
|
||||
void ExtractWindow(int64 sample_offset,
|
||||
const VectorBase<BaseFloat> &wave,
|
||||
int32 f, // with 0 <= f < NumFrames(feats, opts)
|
||||
const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
Vector<BaseFloat> *window,
|
||||
BaseFloat *log_energy_pre_window) {
|
||||
KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0);
|
||||
int32 frame_length = opts.WindowSize(),
|
||||
frame_length_padded = opts.PaddedWindowSize();
|
||||
int64 num_samples = sample_offset + wave.Dim(),
|
||||
start_sample = FirstSampleOfFrame(f, opts),
|
||||
end_sample = start_sample + frame_length;
|
||||
|
||||
if (opts.snip_edges) {
|
||||
KALDI_ASSERT(start_sample >= sample_offset &&
|
||||
end_sample <= num_samples);
|
||||
} else {
|
||||
KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset);
|
||||
}
|
||||
|
||||
if (window->Dim() != frame_length_padded)
|
||||
window->Resize(frame_length_padded, kUndefined);
|
||||
|
||||
// wave_start and wave_end are start and end indexes into 'wave', for the
|
||||
// piece of wave that we're trying to extract.
|
||||
int32 wave_start = int32(start_sample - sample_offset),
|
||||
wave_end = wave_start + frame_length;
|
||||
if (wave_start >= 0 && wave_end <= wave.Dim()) {
|
||||
// the normal case-- no edge effects to consider.
|
||||
window->Range(0, frame_length).CopyFromVec(
|
||||
wave.Range(wave_start, frame_length));
|
||||
} else {
|
||||
// Deal with any end effects by reflection, if needed. This code will only
|
||||
// be reached for about two frames per utterance, so we don't concern
|
||||
// ourselves excessively with efficiency.
|
||||
int32 wave_dim = wave.Dim();
|
||||
for (int32 s = 0; s < frame_length; s++) {
|
||||
int32 s_in_wave = s + wave_start;
|
||||
while (s_in_wave < 0 || s_in_wave >= wave_dim) {
|
||||
// reflect around the beginning or end of the wave.
|
||||
// e.g. -1 -> 0, -2 -> 1.
|
||||
// dim -> dim - 1, dim + 1 -> dim - 2.
|
||||
// the code supports repeated reflections, although this
|
||||
// would only be needed in pathological cases.
|
||||
if (s_in_wave < 0) s_in_wave = - s_in_wave - 1;
|
||||
else s_in_wave = 2 * wave_dim - 1 - s_in_wave;
|
||||
}
|
||||
(*window)(s) = wave(s_in_wave);
|
||||
}
|
||||
}
|
||||
|
||||
if (frame_length_padded > frame_length)
|
||||
window->Range(frame_length, frame_length_padded - frame_length).SetZero();
|
||||
|
||||
SubVector<BaseFloat> frame(*window, 0, frame_length);
|
||||
|
||||
ProcessWindow(opts, window_function, &frame, log_energy_pre_window);
|
||||
}
|
||||
|
||||
} // namespace kaldi
|
@ -1,223 +0,0 @@
|
||||
// feat/feature-window.h
|
||||
|
||||
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University
|
||||
// 2014-2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_FEATURE_WINDOW_H_
|
||||
#define KALDI_FEAT_FEATURE_WINDOW_H_
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "matrix/matrix-lib.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "base/kaldi-error.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
struct FrameExtractionOptions {
|
||||
BaseFloat samp_freq;
|
||||
BaseFloat frame_shift_ms; // in milliseconds.
|
||||
BaseFloat frame_length_ms; // in milliseconds.
|
||||
BaseFloat dither; // Amount of dithering, 0.0 means no dither.
|
||||
BaseFloat preemph_coeff; // Preemphasis coefficient.
|
||||
bool remove_dc_offset; // Subtract mean of wave before FFT.
|
||||
std::string window_type; // e.g. Hamming window
|
||||
// May be "hamming", "rectangular", "povey", "hanning", "blackman"
|
||||
// "povey" is a window I made to be similar to Hamming but to go to zero at the
|
||||
// edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85)
|
||||
// I just don't think the Hamming window makes sense as a windowing function.
|
||||
bool round_to_power_of_two;
|
||||
BaseFloat blackman_coeff;
|
||||
bool snip_edges;
|
||||
bool allow_downsample;
|
||||
bool allow_upsample;
|
||||
int max_feature_vectors;
|
||||
FrameExtractionOptions():
|
||||
samp_freq(16000),
|
||||
frame_shift_ms(10.0),
|
||||
frame_length_ms(25.0),
|
||||
dither(1.0),
|
||||
preemph_coeff(0.97),
|
||||
remove_dc_offset(true),
|
||||
window_type("povey"),
|
||||
round_to_power_of_two(true),
|
||||
blackman_coeff(0.42),
|
||||
snip_edges(true),
|
||||
allow_downsample(false),
|
||||
allow_upsample(false),
|
||||
max_feature_vectors(-1)
|
||||
{ }
|
||||
|
||||
void Register(OptionsItf *opts) {
|
||||
opts->Register("sample-frequency", &samp_freq,
|
||||
"Waveform data sample frequency (must match the waveform file, "
|
||||
"if specified there)");
|
||||
opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds");
|
||||
opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds");
|
||||
opts->Register("preemphasis-coefficient", &preemph_coeff,
|
||||
"Coefficient for use in signal preemphasis");
|
||||
opts->Register("remove-dc-offset", &remove_dc_offset,
|
||||
"Subtract mean from waveform on each frame");
|
||||
opts->Register("dither", &dither, "Dithering constant (0.0 means no dither). "
|
||||
"If you turn this off, you should set the --energy-floor "
|
||||
"option, e.g. to 1.0 or 0.1");
|
||||
opts->Register("window-type", &window_type, "Type of window "
|
||||
"(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\""
|
||||
"|\"blackmann\")");
|
||||
opts->Register("blackman-coeff", &blackman_coeff,
|
||||
"Constant coefficient for generalized Blackman window.");
|
||||
opts->Register("round-to-power-of-two", &round_to_power_of_two,
|
||||
"If true, round window size to power of two by zero-padding "
|
||||
"input to FFT.");
|
||||
opts->Register("snip-edges", &snip_edges,
|
||||
"If true, end effects will be handled by outputting only frames that "
|
||||
"completely fit in the file, and the number of frames depends on the "
|
||||
"frame-length. If false, the number of frames depends only on the "
|
||||
"frame-shift, and we reflect the data at the ends.");
|
||||
opts->Register("allow-downsample", &allow_downsample,
|
||||
"If true, allow the input waveform to have a higher frequency than "
|
||||
"the specified --sample-frequency (and we'll downsample).");
|
||||
opts->Register("max-feature-vectors", &max_feature_vectors,
|
||||
"Memory optimization. If larger than 0, periodically remove feature "
|
||||
"vectors so that only this number of the latest feature vectors is "
|
||||
"retained.");
|
||||
opts->Register("allow-upsample", &allow_upsample,
|
||||
"If true, allow the input waveform to have a lower frequency than "
|
||||
"the specified --sample-frequency (and we'll upsample).");
|
||||
}
|
||||
int32 WindowShift() const {
|
||||
return static_cast<int32>(samp_freq * 0.001 * frame_shift_ms);
|
||||
}
|
||||
int32 WindowSize() const {
|
||||
return static_cast<int32>(samp_freq * 0.001 * frame_length_ms);
|
||||
}
|
||||
int32 PaddedWindowSize() const {
|
||||
return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize()) :
|
||||
WindowSize());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct FeatureWindowFunction {
|
||||
FeatureWindowFunction() {}
|
||||
explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
|
||||
FeatureWindowFunction(const FeatureWindowFunction &other):
|
||||
window(other.window) { }
|
||||
Vector<BaseFloat> window;
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
This function returns the number of frames that we can extract from a wave
|
||||
file with the given number of samples in it (assumed to have the same
|
||||
sampling rate as specified in 'opts').
|
||||
|
||||
@param [in] num_samples The number of samples in the wave file.
|
||||
@param [in] opts The frame-extraction options class
|
||||
|
||||
@param [in] flush True if we are asserting that this number of samples is
|
||||
'all there is', false if we are expecting more data to possibly come
|
||||
in.  This only makes a difference to the answer if opts.snip_edges
|
||||
== false. For offline feature extraction you always want flush ==
|
||||
true. In an online-decoding context, once you know (or decide) that
|
||||
no more data is coming in, you'd call it with flush == true at the
|
||||
end to flush out any remaining data.
|
||||
*/
|
||||
int32 NumFrames(int64 num_samples,
|
||||
const FrameExtractionOptions &opts,
|
||||
bool flush = true);
|
||||
|
||||
/*
|
||||
This function returns the index of the first sample of the frame indexed
|
||||
'frame'. If snip-edges=true, it just returns frame * opts.WindowShift(); if
|
||||
snip-edges=false, the formula is a little more complicated and the result may
|
||||
be negative.
|
||||
*/
|
||||
int64 FirstSampleOfFrame(int32 frame,
|
||||
const FrameExtractionOptions &opts);
|
||||
|
||||
|
||||
|
||||
void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value);
|
||||
|
||||
void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff);
|
||||
|
||||
/**
|
||||
This function does all the windowing steps after actually
|
||||
extracting the windowed signal: depending on the
|
||||
configuration, it does dithering, dc offset removal,
|
||||
preemphasis, and multiplication by the windowing function.
|
||||
@param [in] opts The options class to be used
|
||||
@param [in] window_function The windowing function-- should have
|
||||
been initialized using 'opts'.
|
||||
@param [in,out] window A vector of size opts.WindowSize(). Note:
|
||||
it will typically be a sub-vector of a larger vector of size
|
||||
opts.PaddedWindowSize(), with the remaining samples zero,
|
||||
as the FFT code is more efficient if it operates on data with
|
||||
power-of-two size.
|
||||
@param [out] log_energy_pre_window If non-NULL, then after dithering and
|
||||
DC offset removal, this function will write to this pointer the log of
|
||||
the total energy (i.e. sum-squared) of the frame.
|
||||
*/
|
||||
void ProcessWindow(const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
VectorBase<BaseFloat> *window,
|
||||
BaseFloat *log_energy_pre_window = NULL);
|
||||
|
||||
|
||||
/*
|
||||
ExtractWindow() extracts a windowed frame of waveform (possibly with a
|
||||
power-of-two, padded size, depending on the config), including all the
|
||||
processing done by ProcessWindow().
|
||||
|
||||
@param [in] sample_offset If 'wave' is not the entire waveform, but
|
||||
part of it to the left has been discarded, then the
|
||||
number of samples prior to 'wave' that we have
|
||||
already discarded. Set this to zero if you are
|
||||
processing the entire waveform in one piece, or
|
||||
if you get 'no matching function' compilation
|
||||
errors when updating the code.
|
||||
@param [in] wave The waveform
|
||||
@param [in] f The frame index to be extracted, with
|
||||
0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true)
|
||||
@param [in] opts The options class to be used
|
||||
@param [in] window_function The windowing function, as derived from the
|
||||
options class.
|
||||
@param [out] window The windowed, possibly-padded waveform to be
|
||||
extracted. Will be resized as needed.
|
||||
@param [out] log_energy_pre_window If non-NULL, the log-energy of
|
||||
the signal prior to pre-emphasis and multiplying by
|
||||
the windowing function will be written to here.
|
||||
*/
|
||||
void ExtractWindow(int64 sample_offset,
|
||||
const VectorBase<BaseFloat> &wave,
|
||||
int32 f,
|
||||
const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
Vector<BaseFloat> *window,
|
||||
BaseFloat *log_energy_pre_window = NULL);
|
||||
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
#endif // KALDI_FEAT_FEATURE_WINDOW_H_
|
@ -1,340 +0,0 @@
|
||||
// feat/mel-computations.cc
|
||||
|
||||
// Copyright 2009-2011 Phonexia s.r.o.; Karel Vesely; Microsoft Corporation
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <float.h>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
|
||||
#include "feat/feature-functions.h"
|
||||
#include "feat/feature-window.h"
|
||||
#include "feat/mel-computations.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
// Builds the triangular mel filterbank.
//
//  opts              number of bins, low/high cutoffs, VTLN cutoffs, debug and
//                    HTK-compatibility flags.
//  frame_opts        supplies the sampling frequency and the padded window
//                    size (which must be even); half of the padded window
//                    size is the number of FFT bins that are considered.
//  vtln_warp_factor  VTLN warp factor; 1.0 disables the warping.
//
// For every bin we store the index of the first FFT bin with a nonzero weight
// together with the vector of weights of the consecutive FFT bins it covers.
// Calls KALDI_ERR on inconsistent frequency options.
MelBanks::MelBanks(const MelBanksOptions &opts,
                   const FrameExtractionOptions &frame_opts,
                   BaseFloat vtln_warp_factor):
    htk_mode_(opts.htk_mode) {
  int32 num_bins = opts.num_bins;
  if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins";
  BaseFloat sample_freq = frame_opts.samp_freq;
  int32 window_length_padded = frame_opts.PaddedWindowSize();
  KALDI_ASSERT(window_length_padded % 2 == 0);
  int32 num_fft_bins = window_length_padded / 2;
  BaseFloat nyquist = 0.5 * sample_freq;

  // A non-positive high_freq option is an offset from the Nyquist frequency.
  BaseFloat low_freq = opts.low_freq, high_freq;
  if (opts.high_freq > 0.0)
    high_freq = opts.high_freq;
  else
    high_freq = nyquist + opts.high_freq;

  if (low_freq < 0.0 || low_freq >= nyquist
      || high_freq <= 0.0 || high_freq > nyquist
      || high_freq <= low_freq)
    KALDI_ERR << "Bad values in options: low-freq " << low_freq
              << " and high-freq " << high_freq << " vs. nyquist "
              << nyquist;

  // fft-bin width [think of it as Nyquist-freq / half-window-length]
  BaseFloat fft_bin_width = sample_freq / window_length_padded;

  BaseFloat mel_low_freq = MelScale(low_freq);
  BaseFloat mel_high_freq = MelScale(high_freq);

  debug_ = opts.debug_mel;

  // divide by num_bins+1 in next line because of end-effects where the bins
  // spread out to the sides.
  BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1);

  // A negative vtln_high option is an offset from the Nyquist frequency.
  BaseFloat vtln_low = opts.vtln_low,
      vtln_high = opts.vtln_high;
  if (vtln_high < 0.0) {
    vtln_high += nyquist;
  }

  // The VTLN cutoffs only matter when warping is actually requested.
  if (vtln_warp_factor != 1.0 &&
      (vtln_low < 0.0 || vtln_low <= low_freq
       || vtln_low >= high_freq
       || vtln_high <= 0.0 || vtln_high >= high_freq
       || vtln_high <= vtln_low))
    KALDI_ERR << "Bad values in options: vtln-low " << vtln_low
              << " and vtln-high " << vtln_high << ", versus "
              << "low-freq " << low_freq << " and high-freq "
              << high_freq;

  bins_.resize(num_bins);
  center_freqs_.Resize(num_bins);

  for (int32 bin = 0; bin < num_bins; bin++) {
    // Neighbouring triangles overlap by half: each one spans two deltas.
    BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta,
        center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
        right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;

    if (vtln_warp_factor != 1.0) {
      left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                 vtln_warp_factor, left_mel);
      center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                   vtln_warp_factor, center_mel);
      right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                  vtln_warp_factor, right_mel);
    }
    center_freqs_(bin) = InverseMelScale(center_mel);
    // this_bin will be a vector of coefficients that is only
    // nonzero where this mel bin is active.
    Vector<BaseFloat> this_bin(num_fft_bins);
    int32 first_index = -1, last_index = -1;
    for (int32 i = 0; i < num_fft_bins; i++) {
      BaseFloat freq = (fft_bin_width * i);  // Center frequency of this fft
                                             // bin.
      BaseFloat mel = MelScale(freq);
      if (mel > left_mel && mel < right_mel) {
        BaseFloat weight;
        if (mel <= center_mel)
          weight = (mel - left_mel) / (center_mel - left_mel);
        else
          weight = (right_mel-mel) / (right_mel-center_mel);
        this_bin(i) = weight;
        if (first_index == -1)
          first_index = i;
        last_index = i;
      }
    }

    if (first_index == -1) {
      // No FFT bin lies strictly inside this triangle (typically because
      // --num-mel-bins is too large for the FFT resolution).  Without this
      // guard the code below would compute size == 1 and read
      // this_bin.Range(-1, 1), i.e. one element before the start of the
      // buffer.  Store an empty bin instead, so that Compute() yields zero
      // energy for it.
      KALDI_WARN << "Mel bin " << bin << " covers no FFT bin. "
                 << "You may have set --num-mel-bins too large.";
      bins_[bin].first = 0;
      bins_[bin].second.Resize(0);
      continue;
    }

    bins_[bin].first = first_index;
    int32 size = last_index + 1 - first_index;
    bins_[bin].second.Resize(size);
    bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size));

    // Replicate a bug in HTK, for testing purposes.
    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0)
      bins_[bin].second(0) = 0.0;
  }
  if (debug_) {
    for (size_t i = 0; i < bins_.size(); i++) {
      KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first
                << ", vec = " << bins_[i].second;
    }
  }
}
|
||||
|
||||
// Copy constructor: duplicates the centre frequencies, the per-bin
// (offset, weights) pairs and both flags of 'other'.
MelBanks::MelBanks(const MelBanks &other)
    : center_freqs_(other.center_freqs_),
      bins_(other.bins_),
      debug_(other.debug_),
      htk_mode_(other.htk_mode_) {}
|
||||
|
||||
// Piecewise-linear VTLN warp F(freq) with three segments, defined on
// [low_freq, high_freq]:
//
//   * F(low_freq) == low_freq and F(high_freq) == high_freq;
//   * between the two knees l and h the warp is F(f) = f / vtln_warp_factor;
//   * the knees are placed so that min(l, F(l)) == vtln_low_cutoff and
//     max(h, F(h)) == vtln_high_cutoff, which gives
//         l = vtln_low_cutoff  * max(1, vtln_warp_factor)
//         h = vtln_high_cutoff * min(1, vtln_warp_factor);
//   * the outer segments join (low_freq, low_freq) to (l, F(l)) and
//     (h, F(h)) to (high_freq, high_freq), so F is continuous.
//
// This is not HTK's warping function, although it takes similar inputs.
// Frequencies outside [low_freq, high_freq] are returned unchanged.
BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff,   // VTLN cutoffs
                                 BaseFloat vtln_high_cutoff,
                                 BaseFloat low_freq,   // mel-computation cutoffs
                                 BaseFloat high_freq,
                                 BaseFloat vtln_warp_factor,
                                 BaseFloat freq) {
  const bool outside_band = (freq < low_freq || freq > high_freq);
  if (outside_band) return freq;

  KALDI_ASSERT(vtln_low_cutoff > low_freq &&
               "be sure to set the --vtln-low option higher than --low-freq");
  KALDI_ASSERT(vtln_high_cutoff < high_freq &&
               "be sure to set the --vtln-high option lower than --high-freq [or negative]");

  const BaseFloat unity = 1.0;
  const BaseFloat l = vtln_low_cutoff * std::max(unity, vtln_warp_factor);
  const BaseFloat h = vtln_high_cutoff * std::min(unity, vtln_warp_factor);
  const BaseFloat centre_slope = 1.0 / vtln_warp_factor;
  const BaseFloat warped_l = centre_slope * l;  // F(l)
  const BaseFloat warped_h = centre_slope * h;  // F(h)
  KALDI_ASSERT(l > low_freq && h < high_freq);

  if (freq < l) {
    // Left segment: from (low_freq, low_freq) to (l, F(l)).
    const BaseFloat left_slope = (warped_l - low_freq) / (l - low_freq);
    return low_freq + left_slope * (freq - low_freq);
  }
  if (freq < h) {
    // Centre segment: plain scaling.
    return centre_slope * freq;
  }
  // Right segment (freq >= h): from (h, F(h)) to (high_freq, high_freq).
  const BaseFloat right_slope = (high_freq - warped_h) / (high_freq - h);
  return high_freq + right_slope * (freq - high_freq);
}
|
||||
|
||||
// Mel-domain wrapper around VtlnWarpFreq(): converts 'mel_freq' back to a
// linear frequency, warps it, and converts the result to the mel scale again.
BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff,   // VTLN cutoffs
                                    BaseFloat vtln_high_cutoff,
                                    BaseFloat low_freq,   // mel-computation cutoffs
                                    BaseFloat high_freq,
                                    BaseFloat vtln_warp_factor,
                                    BaseFloat mel_freq) {
  const BaseFloat linear_freq = InverseMelScale(mel_freq);
  const BaseFloat warped_freq = VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
                                             low_freq, high_freq,
                                             vtln_warp_factor, linear_freq);
  return MelScale(warped_freq);
}
|
||||
|
||||
|
||||
// "power_spectrum" contains fft energies.
|
||||
void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
|
||||
VectorBase<BaseFloat> *mel_energies_out) const {
|
||||
int32 num_bins = bins_.size();
|
||||
KALDI_ASSERT(mel_energies_out->Dim() == num_bins);
|
||||
|
||||
for (int32 i = 0; i < num_bins; i++) {
|
||||
int32 offset = bins_[i].first;
|
||||
const Vector<BaseFloat> &v(bins_[i].second);
|
||||
BaseFloat energy = VecVec(v, power_spectrum.Range(offset, v.Dim()));
|
||||
// HTK-like flooring- for testing purposes (we prefer dither)
|
||||
if (htk_mode_ && energy < 1.0) energy = 1.0;
|
||||
(*mel_energies_out)(i) = energy;
|
||||
|
||||
// The following assert was added due to a problem with OpenBlas that
|
||||
// we had at one point (it was a bug in that library). Just to detect
|
||||
// it early.
|
||||
KALDI_ASSERT(!KALDI_ISNAN((*mel_energies_out)(i)));
|
||||
}
|
||||
|
||||
if (debug_) {
|
||||
fprintf(stderr, "MEL BANKS:\n");
|
||||
for (int32 i = 0; i < num_bins; i++)
|
||||
fprintf(stderr, " %f", (*mel_energies_out)(i));
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Fills 'coeffs' with the liftering weights 1 + (Q/2) * sin(pi * i / Q).
// Index 0 corresponds to C0, whose weight is therefore 1 (numbering differs
// slightly from HTK).
void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs) {
  const int32 dim = coeffs->Dim();
  for (int32 i = 0; i < dim; i++) {
    const double phase = M_PI * i / Q;
    (*coeffs)(i) = 1.0 + 0.5 * Q * sin(phase);
  }
}
|
||||
|
||||
|
||||
// Durbin's recursion - converts autocorrelation coefficients to the LPC
|
||||
// pTmp - temporary storage [n]
|
||||
// pAC - autocorrelation coefficients [n + 1]
|
||||
// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}})
|
||||
// F(z) = 1 / (1 - A(z)), 1 is not stored in the denominator
|
||||
// Durbin's recursion: turns the autocorrelation coefficients pAC[0..n] into
// the n linear-prediction coefficients pLP[0..n-1], using pTmp[0..n-1] as
// scratch space.  Returns the residual energy left after the last step.
BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp) {
  BaseFloat residual = pAC[0];

  for (int order = 0; order < n; ++order) {
    // Reflection coefficient for this order.
    BaseFloat reflection = pAC[order + 1];
    for (int k = 0; k < order; ++k)
      reflection += pLP[k] * pAC[order - k];
    reflection = reflection / residual;

    // Update the residual energy; the floor keeps a constant signal from
    // producing NaNs.
    BaseFloat shrink = 1 - reflection * reflection;
    if (shrink < 1.0e-5)
      shrink = 1.0e-5;
    residual *= shrink;

    // Coefficients of the next order go to the scratch buffer first ...
    for (int k = 0; k < order; ++k)
      pTmp[k] = pLP[k] - reflection * pLP[order - k - 1];
    pTmp[order] = -reflection;

    // ... and are then copied back.
    for (int k = 0; k <= order; ++k)
      pLP[k] = pTmp[k];
  }

  return residual;
}
|
||||
|
||||
|
||||
void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst) {
|
||||
for (int32 i = 0; i < n; i++) {
|
||||
double sum = 0.0;
|
||||
int j;
|
||||
for (j = 0; j < i; j++) {
|
||||
sum += static_cast<BaseFloat>(i - j) * pLPC[j] * pCepst[i - j - 1];
|
||||
}
|
||||
pCepst[i] = -pLPC[i] - sum / static_cast<BaseFloat>(i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
void GetEqualLoudnessVector(const MelBanks &mel_banks,
|
||||
Vector<BaseFloat> *ans) {
|
||||
int32 n = mel_banks.NumBins();
|
||||
// Central frequency of each mel bin.
|
||||
const Vector<BaseFloat> &f0 = mel_banks.GetCenterFreqs();
|
||||
ans->Resize(n);
|
||||
for (int32 i = 0; i < n; i++) {
|
||||
BaseFloat fsq = f0(i) * f0(i);
|
||||
BaseFloat fsub = fsq / (fsq + 1.6e5);
|
||||
(*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Compute LP coefficients from autocorrelation coefficients.
|
||||
// Runs Durbin's recursion on 'autocorr_in' (n + 1 values) and writes the n LP
// coefficients to 'lpc_out', which must already have dimension n.
// Returns -Log(1 / E), with E the residual energy returned by Durbin(); this
// is used as the C0 value.  A warning is logged when E is not positive.
BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
                     Vector<BaseFloat> *lpc_out) {
  int32 n = autocorr_in.Dim() - 1;
  KALDI_ASSERT(lpc_out->Dim() == n);

  Vector<BaseFloat> scratch(n);
  BaseFloat residual_energy =
      Durbin(n, autocorr_in.Data(), lpc_out->Data(), scratch.Data());

  if (residual_energy <= 0.0) {
    KALDI_WARN << "Zero energy in LPC computation";
  }
  return -Log(1.0 / residual_energy);  // forms the C0 value
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
@ -1,171 +0,0 @@
|
||||
// feat/mel-computations.h
|
||||
|
||||
// Copyright 2009-2011 Phonexia s.r.o.; Microsoft Corporation
|
||||
// 2016 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_MEL_COMPUTATIONS_H_
|
||||
#define KALDI_FEAT_MEL_COMPUTATIONS_H_
|
||||
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <complex>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "matrix/matrix-lib.h"
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
struct FrameExtractionOptions; // defined in feature-window.h
|
||||
|
||||
|
||||
struct MelBanksOptions {
|
||||
int32 num_bins; // e.g. 25; number of triangular bins
|
||||
BaseFloat low_freq; // e.g. 20; lower frequency cutoff
|
||||
BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative
|
||||
// ->added to the Nyquist frequency to get the cutoff.
|
||||
BaseFloat vtln_low; // vtln lower cutoff of warping function.
|
||||
BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added
|
||||
// to the Nyquist frequency to get the cutoff.
|
||||
bool debug_mel;
|
||||
// htk_mode is a "hidden" config, it does not show up on command line.
|
||||
// Enables more exact compatibility with HTK, for testing purposes. Affects
|
||||
// mel-energy flooring and reproduces a bug in HTK.
|
||||
bool htk_mode;
|
||||
explicit MelBanksOptions(int num_bins = 25)
|
||||
: num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100),
|
||||
vtln_high(-500), debug_mel(false), htk_mode(false) {}
|
||||
|
||||
void Register(OptionsItf *opts) {
|
||||
opts->Register("num-mel-bins", &num_bins,
|
||||
"Number of triangular mel-frequency bins");
|
||||
opts->Register("low-freq", &low_freq,
|
||||
"Low cutoff frequency for mel bins");
|
||||
opts->Register("high-freq", &high_freq,
|
||||
"High cutoff frequency for mel bins (if <= 0, offset from Nyquist)");
|
||||
opts->Register("vtln-low", &vtln_low,
|
||||
"Low inflection point in piecewise linear VTLN warping function");
|
||||
opts->Register("vtln-high", &vtln_high,
|
||||
"High inflection point in piecewise linear VTLN warping function"
|
||||
" (if negative, offset from high-mel-freq");
|
||||
opts->Register("debug-mel", &debug_mel,
|
||||
"Print out debugging information for mel bin computation");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class MelBanks {
|
||||
public:
|
||||
|
||||
static inline BaseFloat InverseMelScale(BaseFloat mel_freq) {
|
||||
return 700.0f * (expf (mel_freq / 1127.0f) - 1.0f);
|
||||
}
|
||||
|
||||
static inline BaseFloat MelScale(BaseFloat freq) {
|
||||
return 1127.0f * logf (1.0f + freq / 700.0f);
|
||||
}
|
||||
|
||||
static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff,
|
||||
BaseFloat vtln_high_cutoff, // discontinuities in warp func
|
||||
BaseFloat low_freq,
|
||||
BaseFloat high_freq, // upper+lower frequency cutoffs in
|
||||
// the mel computation
|
||||
BaseFloat vtln_warp_factor,
|
||||
BaseFloat freq);
|
||||
|
||||
static BaseFloat VtlnWarpMelFreq(BaseFloat vtln_low_cutoff,
|
||||
BaseFloat vtln_high_cutoff,
|
||||
BaseFloat low_freq,
|
||||
BaseFloat high_freq,
|
||||
BaseFloat vtln_warp_factor,
|
||||
BaseFloat mel_freq);
|
||||
|
||||
|
||||
MelBanks(const MelBanksOptions &opts,
|
||||
const FrameExtractionOptions &frame_opts,
|
||||
BaseFloat vtln_warp_factor);
|
||||
|
||||
/// Compute Mel energies (note: not log enerties).
|
||||
/// At input, "fft_energies" contains the FFT energies (not log).
|
||||
void Compute(const VectorBase<BaseFloat> &fft_energies,
|
||||
VectorBase<BaseFloat> *mel_energies_out) const;
|
||||
|
||||
int32 NumBins() const { return bins_.size(); }
|
||||
|
||||
// returns vector of central freq of each bin; needed by plp code.
|
||||
const Vector<BaseFloat> &GetCenterFreqs() const { return center_freqs_; }
|
||||
|
||||
const std::vector<std::pair<int32, Vector<BaseFloat> > >& GetBins() const {
|
||||
return bins_;
|
||||
}
|
||||
|
||||
// Copy constructor
|
||||
MelBanks(const MelBanks &other);
|
||||
private:
|
||||
// Disallow assignment
|
||||
MelBanks &operator = (const MelBanks &other);
|
||||
|
||||
// center frequencies of bins, numbered from 0 ... num_bins-1.
|
||||
// Needed by GetCenterFreqs().
|
||||
Vector<BaseFloat> center_freqs_;
|
||||
|
||||
// the "bins_" vector is a vector, one for each bin, of a pair:
|
||||
// (the first nonzero fft-bin), (the vector of weights).
|
||||
std::vector<std::pair<int32, Vector<BaseFloat> > > bins_;
|
||||
|
||||
bool debug_;
|
||||
bool htk_mode_;
|
||||
};
|
||||
|
||||
|
||||
// Compute liftering coefficients (scaling on cepstral coeffs)
|
||||
// coeffs are numbered slightly differently from HTK: the zeroth
|
||||
// index is C0, which is not affected.
|
||||
void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs);
|
||||
|
||||
|
||||
// Durbin's recursion - converts autocorrelation coefficients to the LPC
|
||||
// pTmp - temporary storage [n]
|
||||
// pAC - autocorrelation coefficients [n + 1]
|
||||
// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}})
|
||||
// F(z) = 1 / (1 - A(z)), 1 is not stored in the denominator
|
||||
// Returns the energy of the residual (the energy itself, not its logarithm)
|
||||
BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp);
|
||||
|
||||
// Compute LP coefficients from autocorrelation coefficients.
|
||||
// Returns the log of the residual energy computed by Durbin()
|
||||
BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
|
||||
Vector<BaseFloat> *lpc_out);
|
||||
|
||||
void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst);
|
||||
|
||||
|
||||
|
||||
void GetEqualLoudnessVector(const MelBanks &mel_banks,
|
||||
Vector<BaseFloat> *ans);
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
} // namespace kaldi
|
||||
|
||||
#endif // KALDI_FEAT_MEL_COMPUTATIONS_H_
|
@ -1,125 +0,0 @@
|
||||
// feat/online-feature-itf.h
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_ONLINE_FEATURE_ITF_H_
|
||||
#define KALDI_FEAT_ONLINE_FEATURE_ITF_H_ 1
|
||||
#include "base/kaldi-common.h"
|
||||
#include "matrix/matrix-lib.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @ingroup Interfaces
|
||||
/// @{
|
||||
|
||||
/**
|
||||
OnlineFeatureInterface is an interface for online feature processing (it is
|
||||
also usable in the offline setting, but currently we're not using it for
|
||||
that). This is for use in the online2/ directory, and it supersedes the
|
||||
interface in ../online/online-feat-input.h. We have a slightly different
|
||||
model that puts more control in the hands of the calling thread, and won't
|
||||
involve waiting on semaphores in the decoding thread.
|
||||
|
||||
This interface only specifies how the object *outputs* the features.
|
||||
How it obtains the features, e.g. from a previous object or objects of type
|
||||
OnlineFeatureInterface, is not specified in the interface and you will
|
||||
likely define new constructors or methods in the derived type to do that.
|
||||
|
||||
You should appreciate that this interface is designed to allow random
|
||||
access to features, as long as they are ready. That is, the user
|
||||
can call GetFrame for any frame less than NumFramesReady(), and when
|
||||
implementing a child class you must not make assumptions about the
|
||||
order in which the user makes these calls.
|
||||
*/
|
||||
|
||||
class OnlineFeatureInterface {
|
||||
public:
|
||||
virtual int32 Dim() const = 0; /// returns the feature dimension.
|
||||
|
||||
/// Returns the total number of frames, since the start of the utterance, that
|
||||
/// are now available. In an online-decoding context, this will likely
|
||||
/// increase with time as more data becomes available.
|
||||
virtual int32 NumFramesReady() const = 0;
|
||||
|
||||
/// Returns true if this is the last frame. Frame indices are zero-based, so the
|
||||
/// first frame is zero. IsLastFrame(-1) will return false, unless the file
|
||||
/// is empty (which is a case that I'm not sure all the code will handle, so
|
||||
/// be careful). This function may return false for some frame if
|
||||
/// we haven't yet decided to terminate decoding, but later true if we decide
|
||||
/// to terminate decoding. This function exists mainly to correctly handle
|
||||
/// end effects in feature extraction, and is not a mechanism to determine how
|
||||
/// many frames are in the decodable object (as it used to be, and for backward
|
||||
/// compatibility, still is, in the Decodable interface).
|
||||
virtual bool IsLastFrame(int32 frame) const = 0;
|
||||
|
||||
/// Gets the feature vector for this frame. Before calling this for a given
|
||||
/// frame, it is assumed that you called NumFramesReady() and it returned a
|
||||
/// number greater than "frame". Otherwise this call will likely crash with
|
||||
/// an assert failure. This function is not declared const, in case there is
|
||||
/// some kind of caching going on, but most of the time it shouldn't modify
|
||||
/// the class.
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) = 0;
|
||||
|
||||
|
||||
/// This is like GetFrame() but for a collection of frames. There is a
|
||||
/// default implementation that just gets the frames one by one, but it
|
||||
/// may be overridden for efficiency by child classes (since sometimes
|
||||
/// it's more efficient to do things in a batch).
|
||||
virtual void GetFrames(const std::vector<int32> &frames,
|
||||
MatrixBase<BaseFloat> *feats) {
|
||||
KALDI_ASSERT(static_cast<int32>(frames.size()) == feats->NumRows());
|
||||
for (size_t i = 0; i < frames.size(); i++) {
|
||||
SubVector<BaseFloat> feat(*feats, i);
|
||||
GetFrame(frames[i], &feat);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Returns frame shift in seconds. Helps to estimate duration from frame
|
||||
// counts.
|
||||
virtual BaseFloat FrameShiftInSeconds() const = 0;
|
||||
|
||||
/// Virtual destructor. Note: constructors that take another member of
|
||||
/// type OnlineFeatureInterface are not expected to take ownership of
|
||||
/// that pointer; the caller needs to keep track of that manually.
|
||||
virtual ~OnlineFeatureInterface() { }
|
||||
|
||||
};
|
||||
|
||||
|
||||
/// Add a virtual class for "source" features such as MFCC or PLP or pitch
|
||||
/// features.
|
||||
class OnlineBaseFeature: public OnlineFeatureInterface {
|
||||
public:
|
||||
/// This would be called from the application, when you get more wave data.
|
||||
/// Note: the sampling_rate is typically only provided so the code can assert
|
||||
/// that it matches the sampling rate expected in the options.
|
||||
virtual void AcceptWaveform(BaseFloat sampling_rate,
|
||||
const VectorBase<BaseFloat> &waveform) = 0;
|
||||
|
||||
/// InputFinished() tells the class you won't be providing any
|
||||
/// more waveform. This will help flush out the last few frames
|
||||
/// of delta or LDA features (it will typically affect the return value
|
||||
/// of IsLastFrame.
|
||||
virtual void InputFinished() = 0;
|
||||
};
|
||||
|
||||
|
||||
/// @}
|
||||
} // namespace Kaldi
|
||||
|
||||
#endif // KALDI_ITF_ONLINE_FEATURE_ITF_H_
|
@ -1,679 +0,0 @@
|
||||
// feat/online-feature.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2014 Yanqing Sun, Junjie Wang,
|
||||
// Daniel Povey, Korbinian Riedhammer
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "feat/online-feature.h"
|
||||
#include "transform/cmvn.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
// 'items_to_hold' is the capacity; a value of 0 is stored as -1.
RecyclingVector::RecyclingVector(int items_to_hold)
    : items_to_hold_(items_to_hold != 0 ? items_to_hold : -1),
      first_available_index_(0) {}
|
||||
|
||||
// Frees every vector that is still held.
RecyclingVector::~RecyclingVector() {
  for (auto it = items_.begin(); it != items_.end(); ++it)
    delete *it;
}
|
||||
|
||||
// Returns the vector stored under the global index 'index'.  Raises KALDI_ERR
// if that entry was already recycled; indices past the end are caught by the
// bounds check of at().
Vector<BaseFloat> *RecyclingVector::At(int index) const {
  const bool already_recycled = (index < first_available_index_);
  if (already_recycled) {
    KALDI_ERR << "Attempted to retrieve feature vector that was "
                 "already removed by the RecyclingVector (index = "
              << index << "; "
              << "first_available_index = " << first_available_index_ << "; "
              << "size = " << Size() << ")";
  }
  // Translate the global index into a position in the retained items.
  const int position = index - first_available_index_;
  return items_.at(position);
}
|
||||
|
||||
// Appends 'item'.  When the number of held items equals items_to_hold_, the
// oldest one is deleted first and the first available index moves on by one.
void RecyclingVector::PushBack(Vector<BaseFloat> *item) {
  const bool at_capacity = (items_.size() == items_to_hold_);
  if (at_capacity) {
    Vector<BaseFloat> *oldest = items_.front();
    items_.pop_front();
    delete oldest;
    ++first_available_index_;
  }
  items_.push_back(item);
}
|
||||
|
||||
int RecyclingVector::Size() const {
|
||||
return first_available_index_ + items_.size();
|
||||
}
|
||||
|
||||
template <class C>
|
||||
void OnlineGenericBaseFeature<C>::GetFrame(int32 frame,
|
||||
VectorBase<BaseFloat> *feat) {
|
||||
feat->CopyFromVec(*(features_.At(frame)));
|
||||
};
|
||||
|
||||
template <class C>
|
||||
OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
|
||||
const typename C::Options &opts):
|
||||
computer_(opts), window_function_(computer_.GetFrameOptions()),
|
||||
features_(opts.frame_opts.max_feature_vectors),
|
||||
input_finished_(false), waveform_offset_(0) {
|
||||
// RE the following assert: search for ONLINE_IVECTOR_LIMIT in
|
||||
// online-ivector-feature.cc.
|
||||
// Casting to uint32, an unsigned type, means that -1 would be treated
|
||||
// as `very large`.
|
||||
KALDI_ASSERT(static_cast<uint32>(opts.frame_opts.max_feature_vectors) > 200);
|
||||
}
|
||||
|
||||
|
||||
template <class C>
|
||||
void OnlineGenericBaseFeature<C>::MaybeCreateResampler(
|
||||
BaseFloat sampling_rate) {
|
||||
BaseFloat expected_sampling_rate = computer_.GetFrameOptions().samp_freq;
|
||||
|
||||
if (resampler_ != nullptr) {
|
||||
KALDI_ASSERT(resampler_->GetInputSamplingRate() == sampling_rate);
|
||||
KALDI_ASSERT(resampler_->GetOutputSamplingRate() == expected_sampling_rate);
|
||||
} else if (((sampling_rate < expected_sampling_rate) &&
|
||||
computer_.GetFrameOptions().allow_downsample) ||
|
||||
((sampling_rate > expected_sampling_rate) &&
|
||||
computer_.GetFrameOptions().allow_upsample)) {
|
||||
resampler_.reset(new LinearResample(
|
||||
sampling_rate, expected_sampling_rate,
|
||||
std::min(sampling_rate / 2, expected_sampling_rate / 2), 6));
|
||||
} else if (sampling_rate != expected_sampling_rate) {
|
||||
KALDI_ERR << "Sampling frequency mismatch, expected "
|
||||
<< expected_sampling_rate << ", got " << sampling_rate
|
||||
<< "\nPerhaps you want to use the options "
|
||||
"--allow_{upsample,downsample}";
|
||||
}
|
||||
}
|
||||
|
||||
template <class C>
|
||||
void OnlineGenericBaseFeature<C>::InputFinished() {
|
||||
if (resampler_ != nullptr) {
|
||||
// There may be a few samples left once we flush the resampler_ object, telling it
|
||||
// that the file has finished. This should rarely make any difference.
|
||||
Vector<BaseFloat> appended_wave;
|
||||
Vector<BaseFloat> resampled_wave;
|
||||
resampler_->Resample(appended_wave, true, &resampled_wave);
|
||||
|
||||
if (resampled_wave.Dim() != 0) {
|
||||
appended_wave.Resize(waveform_remainder_.Dim() +
|
||||
resampled_wave.Dim());
|
||||
if (waveform_remainder_.Dim() != 0)
|
||||
appended_wave.Range(0, waveform_remainder_.Dim())
|
||||
.CopyFromVec(waveform_remainder_);
|
||||
appended_wave.Range(waveform_remainder_.Dim(), resampled_wave.Dim())
|
||||
.CopyFromVec(resampled_wave);
|
||||
waveform_remainder_.Swap(&appended_wave);
|
||||
}
|
||||
}
|
||||
input_finished_ = true;
|
||||
ComputeFeatures();
|
||||
}
|
||||
|
||||
template <class C>
|
||||
void OnlineGenericBaseFeature<C>::AcceptWaveform(
|
||||
BaseFloat sampling_rate, const VectorBase<BaseFloat> &original_waveform) {
|
||||
if (original_waveform.Dim() == 0)
|
||||
return; // Nothing to do.
|
||||
if (input_finished_)
|
||||
KALDI_ERR << "AcceptWaveform called after InputFinished() was called.";
|
||||
|
||||
Vector<BaseFloat> appended_wave;
|
||||
Vector<BaseFloat> resampled_wave;
|
||||
|
||||
const VectorBase<BaseFloat> *waveform;
|
||||
|
||||
MaybeCreateResampler(sampling_rate);
|
||||
if (resampler_ == nullptr) {
|
||||
waveform = &original_waveform;
|
||||
} else {
|
||||
resampler_->Resample(original_waveform, false, &resampled_wave);
|
||||
waveform = &resampled_wave;
|
||||
}
|
||||
|
||||
appended_wave.Resize(waveform_remainder_.Dim() + waveform->Dim());
|
||||
if (waveform_remainder_.Dim() != 0)
|
||||
appended_wave.Range(0, waveform_remainder_.Dim())
|
||||
.CopyFromVec(waveform_remainder_);
|
||||
appended_wave.Range(waveform_remainder_.Dim(), waveform->Dim())
|
||||
.CopyFromVec(*waveform);
|
||||
waveform_remainder_.Swap(&appended_wave);
|
||||
ComputeFeatures();
|
||||
}
|
||||
|
||||
template <class C>
|
||||
void OnlineGenericBaseFeature<C>::ComputeFeatures() {
|
||||
const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions();
|
||||
int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim();
|
||||
int32 num_frames_old = features_.Size(),
|
||||
num_frames_new = NumFrames(num_samples_total, frame_opts,
|
||||
input_finished_);
|
||||
KALDI_ASSERT(num_frames_new >= num_frames_old);
|
||||
|
||||
Vector<BaseFloat> window;
|
||||
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
|
||||
for (int32 frame = num_frames_old; frame < num_frames_new; frame++) {
|
||||
BaseFloat raw_log_energy = 0.0;
|
||||
ExtractWindow(waveform_offset_, waveform_remainder_, frame,
|
||||
frame_opts, window_function_, &window,
|
||||
need_raw_log_energy ? &raw_log_energy : NULL);
|
||||
Vector<BaseFloat> *this_feature = new Vector<BaseFloat>(computer_.Dim(),
|
||||
kUndefined);
|
||||
// note: this online feature-extraction code does not support VTLN.
|
||||
BaseFloat vtln_warp = 1.0;
|
||||
computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature);
|
||||
features_.PushBack(this_feature);
|
||||
}
|
||||
// OK, we will now discard any portion of the signal that will not be
|
||||
// necessary to compute frames in the future.
|
||||
int64 first_sample_of_next_frame = FirstSampleOfFrame(num_frames_new,
|
||||
frame_opts);
|
||||
int32 samples_to_discard = first_sample_of_next_frame - waveform_offset_;
|
||||
if (samples_to_discard > 0) {
|
||||
// discard the leftmost part of the waveform that we no longer need.
|
||||
int32 new_num_samples = waveform_remainder_.Dim() - samples_to_discard;
|
||||
if (new_num_samples <= 0) {
|
||||
// odd, but we'll try to handle it.
|
||||
waveform_offset_ += waveform_remainder_.Dim();
|
||||
waveform_remainder_.Resize(0);
|
||||
} else {
|
||||
Vector<BaseFloat> new_remainder(new_num_samples);
|
||||
new_remainder.CopyFromVec(waveform_remainder_.Range(samples_to_discard,
|
||||
new_num_samples));
|
||||
waveform_offset_ += samples_to_discard;
|
||||
waveform_remainder_.Swap(&new_remainder);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// instantiate the templates defined here for MFCC, PLP and filterbank classes.
|
||||
template class OnlineGenericBaseFeature<MfccComputer>;
|
||||
template class OnlineGenericBaseFeature<PlpComputer>;
|
||||
template class OnlineGenericBaseFeature<FbankComputer>;
|
||||
|
||||
// Copy constructor: copies the speaker stats, the global stats and the frozen
// state from 'other'.
OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other)
    : speaker_cmvn_stats(other.speaker_cmvn_stats),
      global_cmvn_stats(other.global_cmvn_stats),
      frozen_state(other.frozen_state) {}
|
||||
|
||||
// Serializes the state to 'os' (binary or text, according to 'binary').
// The layout is an opening token, three (token, matrix) pairs, and a closing
// token; Read() expects exactly the same sequence.
void OnlineCmvnState::Write(std::ostream &os, bool binary) const {
  // Opening marker (acts as a magic string).
  WriteToken(os, binary, "<OnlineCmvnState>");

  WriteToken(os, binary, "<SpeakerCmvnStats>");
  speaker_cmvn_stats.Write(os, binary);

  WriteToken(os, binary, "<GlobalCmvnStats>");
  global_cmvn_stats.Write(os, binary);

  WriteToken(os, binary, "<FrozenState>");
  frozen_state.Write(os, binary);

  // Closing marker.
  WriteToken(os, binary, "</OnlineCmvnState>");
}
|
||||
|
||||
// Deserializes the state from 'is' (binary or text, according to 'binary').
// Expects the token/matrix sequence that Write() produces.
void OnlineCmvnState::Read(std::istream &is, bool binary) {
  // Opening marker (acts as a magic string).
  ExpectToken(is, binary, "<OnlineCmvnState>");

  ExpectToken(is, binary, "<SpeakerCmvnStats>");
  speaker_cmvn_stats.Read(is, binary);

  ExpectToken(is, binary, "<GlobalCmvnStats>");
  global_cmvn_stats.Read(is, binary);

  ExpectToken(is, binary, "<FrozenState>");
  frozen_state.Read(is, binary);

  // Closing marker.
  ExpectToken(is, binary, "</OnlineCmvnState>");
}
|
||||
|
||||
// Constructs the CMVN stage on top of 'src', starting from a previously
// obtained 'cmvn_state'. The scratch buffers are sized from src->Dim(), and
// opts.skip_dims is parsed into skip_dims_ (a malformed list is an error).
OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts,
                       const OnlineCmvnState &cmvn_state,
                       OnlineFeatureInterface *src)
    : opts_(opts),
      temp_stats_(2, src->Dim() + 1),
      temp_feats_(src->Dim()),
      temp_feats_dbl_(src->Dim()),
      src_(src) {
  SetState(cmvn_state);
  const bool parsed_ok =
      SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_);
  if (!parsed_ok) {
    KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of "
              << "integers)";
  }
}
|
||||
|
||||
// Constructs the CMVN stage on top of 'src' without any initial state.
// The scratch buffers are sized from src->Dim(), and opts.skip_dims is parsed
// into skip_dims_ (a malformed list is an error).
OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts,
                       OnlineFeatureInterface *src)
    : opts_(opts),
      temp_stats_(2, src->Dim() + 1),
      temp_feats_(src->Dim()),
      temp_feats_dbl_(src->Dim()),
      src_(src) {
  const bool parsed_ok =
      SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_);
  if (!parsed_ok) {
    KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of "
              << "integers)";
  }
}
|
||||
|
||||
|
||||
// Finds the most recent frame <= 'frame' whose stats are cached and copies
// those stats into 'stats'.
//   frame         frame index being requested; must be >= 0.
//   cached_frame  output: index of the cached frame found, or -1 if nothing
//                 is cached yet (in which case 'stats' is zeroed).
//   stats         output: the cached stats.
// The ring buffer is searched first, going backwards from 'frame' until a
// frame that is a multiple of opts_.modulus is reached; such frames live in
// cached_stats_modulo_, which is consulted afterwards.
void OnlineCmvn::GetMostRecentCachedFrame(int32 frame,
                                          int32 *cached_frame,
                                          MatrixBase<double> *stats) {
  KALDI_ASSERT(frame >= 0);
  InitRingBufferIfNeeded();
  // The ring stays empty when opts_.ring_buffer_size <= 0; skip the lookup in
  // that case (as CacheFrame() does) so we never compute 't % 0' below.
  if (!cached_stats_ring_.empty()) {
    // look for a cached frame on a previous frame as close as possible in
    // time to "frame".  Return if we get one.
    for (int32 t = frame; t >= 0 && t >= frame - opts_.ring_buffer_size; t--) {
      if (t % opts_.modulus == 0) {
        // if this frame should be cached in cached_stats_modulo_, then
        // we'll look there, and we won't go back any further in time.
        break;
      }
      int32 index = t % opts_.ring_buffer_size;
      if (cached_stats_ring_[index].first == t) {
        *cached_frame = t;
        stats->CopyFromMat(cached_stats_ring_[index].second);
        return;
      }
    }
  }
  int32 n = frame / opts_.modulus;  // non-negative, since frame >= 0.
  if (static_cast<size_t>(n) >= cached_stats_modulo_.size()) {
    if (cached_stats_modulo_.empty()) {
      // Nothing cached at all: report "before the first frame".
      *cached_frame = -1;
      stats->SetZero();
      return;
    } else {
      // Fall back to the latest entry that does exist.
      n = static_cast<int32>(cached_stats_modulo_.size() - 1);
    }
  }
  *cached_frame = n * opts_.modulus;
  KALDI_ASSERT(cached_stats_modulo_[n] != NULL);
  stats->CopyFromMat(*(cached_stats_modulo_[n]));
}
|
||||
|
||||
// Initialize ring buffer for caching stats.
|
||||
void OnlineCmvn::InitRingBufferIfNeeded() {
|
||||
if (cached_stats_ring_.empty() && opts_.ring_buffer_size > 0) {
|
||||
Matrix<double> temp(2, this->Dim() + 1);
|
||||
cached_stats_ring_.resize(opts_.ring_buffer_size,
|
||||
std::pair<int32, Matrix<double> >(-1, temp));
|
||||
}
|
||||
}
|
||||
|
||||
// Stores 'stats' as the cached stats for 'frame' (which must be >= 0).
// Frames that are a multiple of opts_.modulus go into cached_stats_modulo_;
// all other frames go into the ring buffer, if there is one.
void OnlineCmvn::CacheFrame(int32 frame, const MatrixBase<double> &stats) {
  KALDI_ASSERT(frame >= 0);

  if (frame % opts_.modulus != 0) {
    // Ring-buffer case.
    InitRingBufferIfNeeded();
    if (cached_stats_ring_.empty()) {
      return;  // no ring buffer configured, so nothing to store.
    }
    int32 index = frame % cached_stats_ring_.size();
    cached_stats_ring_[index].first = frame;
    cached_stats_ring_[index].second.CopyFromMat(stats);
    return;
  }

  // Modulo case.
  int32 n = frame / opts_.modulus;
  if (n < cached_stats_modulo_.size()) {
    KALDI_WARN << "Did not expect to reach this part of code.";
    // do what seems right, but we shouldn't get here.
    cached_stats_modulo_[n]->CopyFromMat(stats);
  } else {
    // Entries can only be appended one after another. The calling code
    // caches frames in sequence (each frame's stats are derived from the
    // previous frame's), so this restriction is satisfied in practice.
    KALDI_ASSERT(n == cached_stats_modulo_.size());
    cached_stats_modulo_.push_back(new Matrix<double>(stats));
  }
}
|
||||
|
||||
// Frees every matrix held in cached_stats_modulo_ (they are allocated with
// 'new' in CacheFrame()) and empties the container.
OnlineCmvn::~OnlineCmvn() {
  for (auto *cached : cached_stats_modulo_) {
    delete cached;
  }
  cached_stats_modulo_.clear();
}
|
||||
|
||||
// Computes the raw sliding-window CMVN stats for 'frame' into 'stats_out'.
// Starts from the most recent cached stats and advances one frame at a time,
// adding the frame that enters the window and subtracting the one that leaves
// it (the frame opts_.cmn_window positions back), caching each result.
// 'frame' must be in [0, src_->NumFramesReady()).
void OnlineCmvn::ComputeStatsForFrame(int32 frame,
                                      MatrixBase<double> *stats_out) {
  KALDI_ASSERT(frame >= 0 && frame < src_->NumFramesReady());

  const int32 dim = this->Dim();
  int32 cur_frame;
  GetMostRecentCachedFrame(frame, &cur_frame, stats_out);

  Vector<BaseFloat> &feats(temp_feats_);
  Vector<double> &feats_dbl(temp_feats_dbl_);

  // Adds (sign = +1) or removes (sign = -1) the contribution of frame 't':
  // row 0 holds the sums and, in its last column, the count; row 1 holds the
  // sums of squares and is only maintained when variance is normalized.
  auto accumulate = [&](int32 t, double sign) {
    src_->GetFrame(t, &feats);
    feats_dbl.CopyFromVec(feats);
    stats_out->Row(0).Range(0, dim).AddVec(sign, feats_dbl);
    if (opts_.normalize_variance)
      stats_out->Row(1).Range(0, dim).AddVec2(sign, feats_dbl);
    (*stats_out)(0, dim) += sign;
  };

  for (++cur_frame; cur_frame <= frame; ++cur_frame) {
    accumulate(cur_frame, 1.0);
    // it's a sliding buffer; a frame at the back may be leaving the buffer,
    // in which case it has to be subtracted.
    const int32 prev_frame = cur_frame - opts_.cmn_window;
    if (prev_frame >= 0) {
      accumulate(prev_frame, -1.0);
    }
    CacheFrame(cur_frame, (*stats_out));
  }
}
|
||||
|
||||
|
||||
// static
|
||||
void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
|
||||
const MatrixBase<double> &global_stats,
|
||||
const OnlineCmvnOptions &opts,
|
||||
MatrixBase<double> *stats) {
|
||||
if (speaker_stats.NumRows() == 2 && !opts.normalize_variance) {
|
||||
// this is just for efficiency: don't operate on the variance if it's not
|
||||
// needed.
|
||||
int32 cols = speaker_stats.NumCols(); // dim + 1
|
||||
SubMatrix<double> stats_temp(*stats, 0, 1, 0, cols);
|
||||
SmoothOnlineCmvnStats(speaker_stats.RowRange(0, 1),
|
||||
global_stats.RowRange(0, 1),
|
||||
opts, &stats_temp);
|
||||
return;
|
||||
}
|
||||
int32 dim = stats->NumCols() - 1;
|
||||
double cur_count = (*stats)(0, dim);
|
||||
// If count exceeded cmn_window it would be an error in how "window_stats"
|
||||
// was accumulated.
|
||||
KALDI_ASSERT(cur_count <= 1.001 * opts.cmn_window);
|
||||
if (cur_count >= opts.cmn_window)
|
||||
return;
|
||||
if (speaker_stats.NumRows() != 0) { // if we have speaker stats..
|
||||
double count_from_speaker = opts.cmn_window - cur_count,
|
||||
speaker_count = speaker_stats(0, dim);
|
||||
if (count_from_speaker > opts.speaker_frames)
|
||||
count_from_speaker = opts.speaker_frames;
|
||||
if (count_from_speaker > speaker_count)
|
||||
count_from_speaker = speaker_count;
|
||||
if (count_from_speaker > 0.0)
|
||||
stats->AddMat(count_from_speaker / speaker_count,
|
||||
speaker_stats);
|
||||
cur_count = (*stats)(0, dim);
|
||||
}
|
||||
if (cur_count >= opts.cmn_window)
|
||||
return;
|
||||
if (global_stats.NumRows() != 0) {
|
||||
double count_from_global = opts.cmn_window - cur_count,
|
||||
global_count = global_stats(0, dim);
|
||||
KALDI_ASSERT(global_count > 0.0);
|
||||
if (count_from_global > opts.global_frames)
|
||||
count_from_global = opts.global_frames;
|
||||
if (count_from_global > 0.0)
|
||||
stats->AddMat(count_from_global / global_count,
|
||||
global_stats);
|
||||
} else {
|
||||
KALDI_ERR << "Global CMN stats are required";
|
||||
}
|
||||
}
|
||||
|
||||
// Fetches frame 'frame' from the source into 'feat' and normalizes it in
// place. The stats used are the frozen state if there is one, otherwise the
// sliding-window stats for this frame after smoothing with the speaker and
// global stats. Dimensions listed in skip_dims_ get faked stats.
void OnlineCmvn::GetFrame(int32 frame,
                          VectorBase<BaseFloat> *feat) {
  src_->GetFrame(frame, feat);
  KALDI_ASSERT(feat->Dim() == this->Dim());
  const int32 dim = feat->Dim();

  Matrix<double> &stats(temp_stats_);
  stats.Resize(2, dim + 1, kUndefined);  // Will do nothing if size was correct.

  const bool is_frozen = (frozen_state_.NumRows() != 0);
  if (!is_frozen) {
    // first get the raw CMVN stats (this involves caching..)
    this->ComputeStatsForFrame(frame, &stats);
    // now smooth them.
    SmoothOnlineCmvnStats(orig_state_.speaker_cmvn_stats,
                          orig_state_.global_cmvn_stats,
                          opts_,
                          &stats);
  } else {
    // the CMVN state has been frozen.
    stats.CopyFromMat(frozen_state_);
  }

  if (!skip_dims_.empty()) {
    FakeStatsForSomeDims(skip_dims_, &stats);
  }

  // ApplyCmvn (declared in ../transform/cmvn.h) takes a matrix, so view the
  // output vector as a matrix with 1 row, 'dim' columns and stride 'dim'.
  SubMatrix<BaseFloat> feat_mat(feat->Data(), 1, dim, dim);
  if (opts_.normalize_mean) {
    ApplyCmvn(stats, opts_.normalize_variance, &feat_mat);
  } else {
    KALDI_ASSERT(!opts_.normalize_variance);
  }
}
|
||||
|
||||
// Freezes the CMVN state: the smoothed stats for 'cur_frame' are computed
// once and stored in frozen_state_, which GetFrame() uses from then on
// instead of per-frame stats.
void OnlineCmvn::Freeze(int32 cur_frame) {
  const int32 dim = this->Dim();
  Matrix<double> smoothed(2, dim + 1);
  // Raw sliding-window stats first ...
  this->ComputeStatsForFrame(cur_frame, &smoothed);
  // ... then smooth them with the speaker and global stats.
  SmoothOnlineCmvnStats(orig_state_.speaker_cmvn_stats,
                        orig_state_.global_cmvn_stats,
                        opts_,
                        &smoothed);
  this->frozen_state_ = smoothed;
}
|
||||
|
||||
// Writes to '*state_out' a copy of the original state whose speaker stats
// additionally include frames 0..cur_frame of the source (count, sums and
// sums of squares), and whose frozen state is the current frozen_state_.
void OnlineCmvn::GetState(int32 cur_frame,
                          OnlineCmvnState *state_out) {
  *state_out = this->orig_state_;

  // Update state_out->speaker_cmvn_stats with the frames seen so far.
  const int32 dim = this->Dim();
  Matrix<double> &speaker_stats = state_out->speaker_cmvn_stats;
  if (speaker_stats.NumRows() == 0) {
    speaker_stats.Resize(2, dim + 1);
  }
  Vector<BaseFloat> feat(dim);
  Vector<double> feat_dbl(dim);
  for (int32 t = 0; t <= cur_frame; ++t) {
    src_->GetFrame(t, &feat);
    feat_dbl.CopyFromVec(feat);
    speaker_stats(0, dim) += 1.0;
    speaker_stats.Row(0).Range(0, dim).AddVec(1.0, feat_dbl);
    speaker_stats.Row(1).Range(0, dim).AddVec2(1.0, feat_dbl);
  }

  // Carry over any frozen state (set if the user called Freeze()).
  state_out->frozen_state = frozen_state_;
}
|
||||
|
||||
// Installs 'cmvn_state' as the starting state (orig_state_) and adopts its
// frozen state. Only allowed before any data has been processed, i.e. while
// cached_stats_modulo_ is still empty.
void OnlineCmvn::SetState(const OnlineCmvnState &cmvn_state) {
  KALDI_ASSERT(cached_stats_modulo_.empty() &&
               "You cannot call SetState() after processing data.");
  orig_state_ = cmvn_state;
  frozen_state_ = cmvn_state.frozen_state;
}
|
||||
|
||||
// Number of spliced frames available. Once the source has produced its last
// frame every source frame can be output; until then the newest
// right_context_ frames are withheld (never going below zero).
int32 OnlineSpliceFrames::NumFramesReady() const {
  const int32 num_frames = src_->NumFramesReady();
  const bool source_done =
      num_frames > 0 && src_->IsLastFrame(num_frames - 1);
  if (source_done) {
    return num_frames;
  }
  return std::max<int32>(0, num_frames - right_context_);
}
|
||||
|
||||
// Fills 'feat' with source frames frame-left_context_ .. frame+right_context_
// laid out one after another. Frame indices that fall outside the frames the
// source has ready are clamped to the nearest valid index.
// 'feat' must have dimension src_->Dim() * (1 + left_context_ + right_context_).
void OnlineSpliceFrames::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
  KALDI_ASSERT(left_context_ >= 0 && right_context_ >= 0);
  KALDI_ASSERT(frame >= 0 && frame < NumFramesReady());
  int32 dim_in = src_->Dim();
  KALDI_ASSERT(feat->Dim() == dim_in * (1 + left_context_ + right_context_));
  const int32 T = src_->NumFramesReady();

  // 'slot' is 0 for the left-most frame of the window and grows to the right.
  const int32 first = frame - left_context_;
  const int32 num_slots = left_context_ + right_context_ + 1;
  for (int32 slot = 0; slot < num_slots; ++slot) {
    const int32 wanted = first + slot;
    // Clamp from below first, then from above (same order as two
    // successive range checks).
    const int32 clamped = std::min<int32>(std::max<int32>(wanted, 0), T - 1);
    SubVector<BaseFloat> part(*feat, slot * dim_in, dim_in);
    src_->GetFrame(clamped, &part);
  }
}
|
||||
|
||||
// Builds a linear or affine transform stage on top of 'src'.
// If 'transform' has as many columns as the source dimension it is used as a
// purely linear transform (zero offset); if it has one extra column, that
// last column is the offset. Any other width is an error.
OnlineTransform::OnlineTransform(const MatrixBase<BaseFloat> &transform,
                                 OnlineFeatureInterface *src)
    : src_(src) {
  const int32 src_dim = src_->Dim();
  const int32 num_cols = transform.NumCols();
  const int32 num_rows = transform.NumRows();

  if (num_cols == src_dim) {
    // Linear transform.
    linear_term_ = transform;
    offset_.Resize(num_rows);  // Resize() will zero it.
  } else if (num_cols == src_dim + 1) {
    // Affine transform: split off the last column as the offset.
    linear_term_ = transform.Range(0, num_rows, 0, src_dim);
    offset_.Resize(num_rows);
    offset_.CopyColFromMat(transform, src_dim);
  } else {
    KALDI_ERR << "Dimension mismatch: source features have dimension "
              << src_dim << " and LDA #cols is " << transform.NumCols();
  }
}
|
||||
|
||||
// Computes feat = offset_ + linear_term_ * (source frame 'frame').
void OnlineTransform::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
  const int32 input_dim = linear_term_.NumCols();
  Vector<BaseFloat> source_frame(input_dim);
  src_->GetFrame(frame, &source_frame);

  // Start from the offset, then add the linear part on top of it.
  feat->CopyFromVec(offset_);
  feat->AddMatVec(1.0, linear_term_, kNoTrans, source_frame, 1.0);
}
|
||||
|
||||
void OnlineTransform::GetFrames(
|
||||
const std::vector<int32> &frames, MatrixBase<BaseFloat> *feats) {
|
||||
KALDI_ASSERT(static_cast<int32>(frames.size()) == feats->NumRows());
|
||||
int32 num_frames = feats->NumRows(),
|
||||
input_dim = linear_term_.NumCols();
|
||||
Matrix<BaseFloat> input_feats(num_frames, input_dim, kUndefined);
|
||||
src_->GetFrames(frames, &input_feats);
|
||||
feats->CopyRowsFromVec(offset_);
|
||||
feats->AddMatMat(1.0, input_feats, kNoTrans, linear_term_, kTrans, 1.0);
|
||||
}
|
||||
|
||||
|
||||
// Output dimension: the source dimension times (1 + opts_.order).
int32 OnlineDeltaFeature::Dim() const {
  const int32 blocks = 1 + opts_.order;
  return src_->Dim() * blocks;
}
|
||||
|
||||
// Number of delta-feature frames available. Once the source has produced its
// last frame every source frame can be output; until then the newest
// 'context' frames are withheld (never going below zero).
int32 OnlineDeltaFeature::NumFramesReady() const {
  const int32 num_frames = src_->NumFramesReady();
  // "context" is the number of frames on the left or (more relevant here)
  // right which are needed in order to produce the output.
  const int32 context = opts_.order * opts_.window;

  const bool source_done =
      num_frames > 0 && src_->IsLastFrame(num_frames - 1);
  if (source_done) {
    return num_frames;
  }
  return std::max<int32>(0, num_frames - context);
}
|
||||
|
||||
// Produces the delta features for 'frame' into 'feat' (dimension Dim()).
// The source frames within 'context' frames of 'frame' (truncated to what
// the source has ready) are gathered into a temporary matrix, which is then
// handed to delta_features_.
void OnlineDeltaFeature::GetFrame(int32 frame,
                                  VectorBase<BaseFloat> *feat) {
  KALDI_ASSERT(frame >= 0 && frame < NumFramesReady());
  KALDI_ASSERT(feat->Dim() == Dim());

  const int32 context = opts_.order * opts_.window;
  const int32 src_frames_ready = src_->NumFramesReady();

  // Window of source frames, clipped to [0, src_frames_ready - 1].
  const int32 left_frame = std::max<int32>(0, frame - context);
  const int32 right_frame =
      std::min<int32>(frame + context, src_frames_ready - 1);
  KALDI_ASSERT(right_frame >= left_frame);

  const int32 temp_num_frames = right_frame + 1 - left_frame;
  const int32 src_dim = src_->Dim();
  Matrix<BaseFloat> temp_src(temp_num_frames, src_dim);
  for (int32 row = 0; row < temp_num_frames; ++row) {
    SubVector<BaseFloat> temp_row(temp_src, row);
    src_->GetFrame(left_frame + row, &temp_row);
  }

  // Offset of frame "frame" within temp_src.
  const int32 temp_t = frame - left_frame;
  delta_features_.Process(temp_src, temp_t, feat);
}
|
||||
|
||||
|
||||
// Stores the source and options, and sets up delta_features_ from the same
// options.
OnlineDeltaFeature::OnlineDeltaFeature(const DeltaFeaturesOptions &opts,
                                       OnlineFeatureInterface *src)
    : src_(src),
      opts_(opts),
      delta_features_(opts) {}
|
||||
|
||||
// Returns frame 'frame' in 'feat', serving it from cache_ when possible and
// otherwise fetching it from the source and caching it.
// 'frame' must be >= 0.
void OnlineCacheFeature::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
  KALDI_ASSERT(frame >= 0);
  const size_t index = static_cast<size_t>(frame);
  if (index < cache_.size() && cache_[index] != NULL) {
    feat->CopyFromVec(*(cache_[index]));
    return;
  }

  // Fill a temporary first and only publish it to cache_ once the source
  // call has succeeded. Storing it beforehand would leave a zero-filled
  // entry behind if the call below throws, and later calls would return
  // that entry as if it were valid.
  std::unique_ptr<Vector<BaseFloat> > fresh(
      new Vector<BaseFloat>(this->Dim()));
  // The following call will crash if frame "frame" is not ready.
  src_->GetFrame(frame, fresh.get());

  if (index >= cache_.size())
    cache_.resize(index + 1, NULL);
  cache_[index] = fresh.release();  // cache_ owns the vector from here on.
  feat->CopyFromVec(*(cache_[index]));
}
|
||||
|
||||
void OnlineCacheFeature::GetFrames(
|
||||
const std::vector<int32> &frames, MatrixBase<BaseFloat> *feats) {
|
||||
int32 num_frames = frames.size();
|
||||
// non_cached_frames will be the subset of 't' values in 'frames' which were
|
||||
// not previously cached, which we therefore need to get from src_.
|
||||
std::vector<int32> non_cached_frames;
|
||||
// 'non_cached_indexes' stores the indexes 'i' into 'frames' corresponding to
|
||||
// the corresponding frames in 'non_cached_frames'.
|
||||
std::vector<int32> non_cached_indexes;
|
||||
non_cached_frames.reserve(frames.size());
|
||||
non_cached_indexes.reserve(frames.size());
|
||||
for (int32 i = 0; i < num_frames; i++) {
|
||||
int32 t = frames[i];
|
||||
if (static_cast<size_t>(t) < cache_.size() && cache_[t] != NULL) {
|
||||
feats->Row(i).CopyFromVec(*(cache_[t]));
|
||||
} else {
|
||||
non_cached_frames.push_back(t);
|
||||
non_cached_indexes.push_back(i);
|
||||
}
|
||||
}
|
||||
if (non_cached_frames.empty())
|
||||
return;
|
||||
int32 num_non_cached_frames = non_cached_frames.size(),
|
||||
dim = this->Dim();
|
||||
Matrix<BaseFloat> non_cached_feats(num_non_cached_frames, dim,
|
||||
kUndefined);
|
||||
src_->GetFrames(non_cached_frames, &non_cached_feats);
|
||||
for (int32 i = 0; i < num_non_cached_frames; i++) {
|
||||
int32 t = non_cached_frames[i];
|
||||
if (static_cast<size_t>(t) < cache_.size() && cache_[t] != NULL) {
|
||||
// We can reach this point due to repeat indexes in 'non_cached_frames'.
|
||||
feats->Row(non_cached_indexes[i]).CopyFromVec(*(cache_[t]));
|
||||
} else {
|
||||
SubVector<BaseFloat> this_feat(non_cached_feats, i);
|
||||
feats->Row(non_cached_indexes[i]).CopyFromVec(this_feat);
|
||||
if (static_cast<size_t>(t) >= cache_.size())
|
||||
cache_.resize(t + 1, NULL);
|
||||
cache_[t] = new Vector<BaseFloat>(this_feat);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void OnlineCacheFeature::ClearCache() {
|
||||
for (size_t i = 0; i < cache_.size(); i++)
|
||||
delete cache_[i];
|
||||
cache_.resize(0);
|
||||
}
|
||||
|
||||
|
||||
// Fills 'feat' with frame 'frame' of src1_ followed by frame 'frame' of
// src2_. 'feat' must have dimension Dim().
void OnlineAppendFeature::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
  KALDI_ASSERT(feat->Dim() == Dim());

  // The two sources write directly into adjacent sub-ranges of 'feat'.
  SubVector<BaseFloat> feat1(*feat, 0, src1_->Dim());
  SubVector<BaseFloat> feat2(*feat, src1_->Dim(), src2_->Dim());
  src1_->GetFrame(frame, &feat1);
  src2_->GetFrame(frame, &feat2);
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
@ -1,632 +0,0 @@
|
||||
// feat/online-feature.h
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2014 Yanqing Sun, Junjie Wang,
|
||||
// Daniel Povey, Korbinian Riedhammer
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#ifndef KALDI_FEAT_ONLINE_FEATURE_H_
|
||||
#define KALDI_FEAT_ONLINE_FEATURE_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <deque>
|
||||
|
||||
#include "matrix/matrix-lib.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "base/kaldi-error.h"
|
||||
#include "feat/feature-functions.h"
|
||||
#include "feat/feature-mfcc.h"
|
||||
#include "feat/feature-plp.h"
|
||||
#include "feat/feature-fbank.h"
|
||||
#include "feat/online-feature-itf.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup onlinefeat OnlineFeatureExtraction
|
||||
/// @{
|
||||
|
||||
|
||||
/// This class serves as a storage for feature vectors with an option to limit
|
||||
/// the memory usage by removing old elements. The deleted frames indices are
|
||||
/// "remembered" so that regardless of the MAX_ITEMS setting, the user always
|
||||
/// provides the indices as if no deletion was being performed.
|
||||
/// This is useful when processing very long recordings which would otherwise
|
||||
/// cause the memory to eventually blow up when the features are not being removed.
|
||||
class RecyclingVector {
|
||||
public:
|
||||
/// By default it does not remove any elements.
|
||||
RecyclingVector(int items_to_hold = -1);
|
||||
|
||||
/// The ownership is being retained by this collection - do not delete the item.
|
||||
Vector<BaseFloat> *At(int index) const;
|
||||
|
||||
/// The ownership of the item is passed to this collection - do not delete the item.
|
||||
void PushBack(Vector<BaseFloat> *item);
|
||||
|
||||
/// This method returns the size as if no "recycling" had happened,
|
||||
/// i.e. equivalent to the number of times the PushBack method has been called.
|
||||
int Size() const;
|
||||
|
||||
~RecyclingVector();
|
||||
|
||||
private:
|
||||
std::deque<Vector<BaseFloat>*> items_;
|
||||
int items_to_hold_;
|
||||
int first_available_index_;
|
||||
};
|
||||
|
||||
|
||||
/// This is a templated class for online feature extraction;
|
||||
/// it's templated on a class like MfccComputer or PlpComputer
|
||||
/// that does the basic feature extraction.
|
||||
template<class C>
|
||||
class OnlineGenericBaseFeature: public OnlineBaseFeature {
|
||||
public:
|
||||
//
|
||||
// First, functions that are present in the interface:
|
||||
//
|
||||
virtual int32 Dim() const { return computer_.Dim(); }
|
||||
|
||||
// Note: IsLastFrame() will only ever return true if you have called
|
||||
// InputFinished() (and this frame is the last frame).
|
||||
virtual bool IsLastFrame(int32 frame) const {
|
||||
return input_finished_ && frame == NumFramesReady() - 1;
|
||||
}
|
||||
virtual BaseFloat FrameShiftInSeconds() const {
|
||||
return computer_.GetFrameOptions().frame_shift_ms / 1000.0f;
|
||||
}
|
||||
|
||||
virtual int32 NumFramesReady() const { return features_.Size(); }
|
||||
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
|
||||
|
||||
// Next, functions that are not in the interface.
|
||||
|
||||
|
||||
// Constructor from options class
|
||||
explicit OnlineGenericBaseFeature(const typename C::Options &opts);
|
||||
|
||||
// This would be called from the application, when you get
|
||||
// more wave data. Note: the sampling_rate is only provided so
|
||||
// the code can assert that it matches the sampling rate
|
||||
// expected in the options.
|
||||
virtual void AcceptWaveform(BaseFloat sampling_rate,
|
||||
const VectorBase<BaseFloat> &waveform);
|
||||
|
||||
|
||||
// InputFinished() tells the class you won't be providing any
|
||||
// more waveform. This will help flush out the last frame or two
|
||||
// of features, in the case where snip-edges == false; it also
|
||||
// affects the return value of IsLastFrame().
|
||||
virtual void InputFinished();
|
||||
|
||||
private:
|
||||
// This function computes any additional feature frames that it is possible to
|
||||
// compute from 'waveform_remainder_', which at this point may contain more
|
||||
// than just a remainder-sized quantity (because AcceptWaveform() appends to
|
||||
// waveform_remainder_ before calling this function). It adds these feature
|
||||
// frames to features_, and shifts off any now-unneeded samples of input from
|
||||
// waveform_remainder_ while incrementing waveform_offset_ by the same amount.
|
||||
void ComputeFeatures();
|
||||
|
||||
void MaybeCreateResampler(BaseFloat sampling_rate);
|
||||
|
||||
C computer_; // class that does the MFCC or PLP or filterbank computation
|
||||
|
||||
// resampler in cases when the input sampling frequency is not equal to
|
||||
// the expected sampling rate
|
||||
std::unique_ptr<LinearResample> resampler_;
|
||||
|
||||
FeatureWindowFunction window_function_;
|
||||
|
||||
// features_ is the Mfcc or Plp or Fbank features that we have already computed.
|
||||
|
||||
RecyclingVector features_;
|
||||
|
||||
// True if the user has called "InputFinished()"
|
||||
bool input_finished_;
|
||||
|
||||
// The sampling frequency, extracted from the config. Should
|
||||
// be identical to the waveform supplied.
|
||||
BaseFloat sampling_frequency_;
|
||||
|
||||
// waveform_offset_ is the number of samples of waveform that we have
|
||||
// already discarded, i.e. that were prior to 'waveform_remainder_'.
|
||||
int64 waveform_offset_;
|
||||
|
||||
// waveform_remainder_ is a short piece of waveform that we may need to keep
|
||||
// after extracting all the whole frames we can (whatever length of feature
|
||||
// will be required for the next phase of computation).
|
||||
Vector<BaseFloat> waveform_remainder_;
|
||||
};
|
||||
|
||||
// Convenience names for the three supported online feature extractors.
using OnlineMfcc = OnlineGenericBaseFeature<MfccComputer>;
using OnlinePlp = OnlineGenericBaseFeature<PlpComputer>;
using OnlineFbank = OnlineGenericBaseFeature<FbankComputer>;
|
||||
|
||||
|
||||
/// This class takes a Matrix<BaseFloat> and wraps it as an
|
||||
/// OnlineFeatureInterface: this can be useful where some earlier stage of
|
||||
/// feature processing has been done offline but you want to use part of the
|
||||
/// online pipeline.
|
||||
class OnlineMatrixFeature: public OnlineFeatureInterface {
|
||||
public:
|
||||
/// Caution: this class maintains the const reference from the constructor, so
|
||||
/// don't let it go out of scope while this object exists.
|
||||
explicit OnlineMatrixFeature(const MatrixBase<BaseFloat> &mat): mat_(mat) { }
|
||||
|
||||
virtual int32 Dim() const { return mat_.NumCols(); }
|
||||
|
||||
virtual BaseFloat FrameShiftInSeconds() const {
|
||||
return 0.01f;
|
||||
}
|
||||
|
||||
virtual int32 NumFramesReady() const { return mat_.NumRows(); }
|
||||
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
|
||||
feat->CopyFromVec(mat_.Row(frame));
|
||||
}
|
||||
|
||||
virtual bool IsLastFrame(int32 frame) const {
|
||||
return (frame + 1 == mat_.NumRows());
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
const MatrixBase<BaseFloat> &mat_;
|
||||
};
|
||||
|
||||
|
||||
// Note the similarity with SlidingWindowCmnOptions, but there
|
||||
// are also differences. One which doesn't appear in the config
|
||||
// itself, because it's a difference between the setups, is that
|
||||
// in OnlineCmn, we carry over data from the previous utterance,
|
||||
// or, if no previous utterance is available, from global stats,
|
||||
// or, if previous utterances are available but the total amount
|
||||
// of data is less than prev_frames, we pad with up to "global_frames"
|
||||
// frames from the global stats.
|
||||
struct OnlineCmvnOptions {
|
||||
int32 cmn_window;
|
||||
int32 speaker_frames; // must be <= cmn_window
|
||||
int32 global_frames; // must be <= speaker_frames.
|
||||
bool normalize_mean; // Must be true if normalize_variance==true.
|
||||
bool normalize_variance;
|
||||
|
||||
int32 modulus; // not configurable from command line, relates to how the
|
||||
// class computes the cmvn internally. smaller->more
|
||||
// time-efficient but less memory-efficient. Must be >= 1.
|
||||
int32 ring_buffer_size; // not configurable from command line; size of ring
|
||||
// buffer used for caching CMVN stats. Must be >=
|
||||
// modulus.
|
||||
std::string skip_dims; // Colon-separated list of dimensions to skip normalization
|
||||
// of, e.g. 13:14:15.
|
||||
|
||||
OnlineCmvnOptions():
|
||||
cmn_window(600),
|
||||
speaker_frames(600),
|
||||
global_frames(200),
|
||||
normalize_mean(true),
|
||||
normalize_variance(false),
|
||||
modulus(20),
|
||||
ring_buffer_size(20),
|
||||
skip_dims("") { }
|
||||
|
||||
void Check() const {
|
||||
KALDI_ASSERT(speaker_frames <= cmn_window && global_frames <= speaker_frames
|
||||
&& modulus > 0);
|
||||
}
|
||||
|
||||
void Register(ParseOptions *po) {
|
||||
po->Register("cmn-window", &cmn_window, "Number of frames of sliding "
|
||||
"context for cepstral mean normalization.");
|
||||
po->Register("global-frames", &global_frames, "Number of frames of "
|
||||
"global-average cepstral mean normalization stats to use for "
|
||||
"first utterance of a speaker");
|
||||
po->Register("speaker-frames", &speaker_frames, "Number of frames of "
|
||||
"previous utterance(s) from this speaker to use in cepstral "
|
||||
"mean normalization");
|
||||
// we name the config string "norm-vars" for compatibility with
|
||||
// ../featbin/apply-cmvn.cc
|
||||
po->Register("norm-vars", &normalize_variance, "If true, do "
|
||||
"cepstral variance normalization in addition to cepstral mean "
|
||||
"normalization ");
|
||||
po->Register("norm-means", &normalize_mean, "If true, do mean normalization "
|
||||
"(note: you cannot normalize the variance but not the mean)");
|
||||
po->Register("skip-dims", &skip_dims, "Dimensions to skip normalization of "
|
||||
"(colon-separated list of integers)");}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/** Struct OnlineCmvnState stores the state of CMVN adaptation between
|
||||
utterances (but not the state of the computation within an utterance). It
|
||||
stores the global CMVN stats and the stats of the current speaker (if we
|
||||
have seen previous utterances for this speaker), and possibly will have a
|
||||
member "frozen_state": if the user has called the function Freeze() of class
|
||||
OnlineCmvn, to fix the CMVN so we can estimate fMLLR on top of the fixed
|
||||
value of cmvn. If nonempty, "frozen_state" will reflect how we were
|
||||
normalizing the mean and (if applicable) variance at the time when that
|
||||
function was called.
|
||||
*/
|
||||
struct OnlineCmvnState {
|
||||
// The following is the total CMVN stats for this speaker (up till now), in
|
||||
// the same format.
|
||||
Matrix<double> speaker_cmvn_stats;
|
||||
|
||||
// The following is the global CMVN stats, in the usual
|
||||
// format, of dimension 2 x (dim+1), as [ sum-stats count
|
||||
// sum-squared-stats 0 ]
|
||||
Matrix<double> global_cmvn_stats;
|
||||
|
||||
// If nonempty, contains CMVN stats representing the "frozen" state
|
||||
// of CMVN that reflects how we were normalizing the data when the
|
||||
// user called the Freeze() function in class OnlineCmvn.
|
||||
Matrix<double> frozen_state;
|
||||
|
||||
OnlineCmvnState() { }
|
||||
|
||||
explicit OnlineCmvnState(const Matrix<double> &global_stats):
|
||||
global_cmvn_stats(global_stats) { }
|
||||
|
||||
// Copy constructor
|
||||
OnlineCmvnState(const OnlineCmvnState &other);
|
||||
|
||||
void Write(std::ostream &os, bool binary) const;
|
||||
void Read(std::istream &is, bool binary);
|
||||
|
||||
// Use the default assignment operator.
|
||||
};
|
||||
|
||||
/**
|
||||
This class does an online version of the cepstral mean and [optionally]
|
||||
variance, but note that this is not equivalent to the offline version. This
|
||||
is necessarily so, as the offline computation involves looking into the
|
||||
future. If you plan to use features normalized with this type of CMVN then
|
||||
you need to train in a `matched' way, i.e. with the same type of features.
|
||||
We normally only do so in the "online" GMM-based decoding, e.g. in
|
||||
online2bin/online2-wav-gmm-latgen-faster.cc; see also the script
|
||||
steps/online/prepare_online_decoding.sh and steps/online/decode.sh.
|
||||
|
||||
In the steady state (in the middle of a long utterance), this class
|
||||
accumulates CMVN statistics from the previous "cmn_window" frames (default 600
|
||||
frames, or 6 seconds), and uses these to normalize the mean and possibly
|
||||
variance of the current frame.
|
||||
|
||||
The config variables "speaker_frames" and "global_frames" relate to what
|
||||
happens at the beginning of the utterance when we have seen fewer than
|
||||
"cmn_window" frames of context, and so might not have very good stats to
|
||||
normalize with. Basically, we first augment any existing stats with up
|
||||
to "speaker_frames" frames of stats from previous utterances of the current
|
||||
speaker, and if this doesn't take us up to the required "cmn_window" frame
|
||||
count, we further augment with up to "global_frames" frames of global
|
||||
stats. The global stats are CMVN stats accumulated from training or testing
|
||||
data, that give us a reasonable source of mean and variance for "typical"
|
||||
data.
|
||||
*/
|
||||
class OnlineCmvn: public OnlineFeatureInterface {
|
||||
public:
|
||||
|
||||
//
|
||||
// First, functions that are present in the interface:
|
||||
//
|
||||
virtual int32 Dim() const { return src_->Dim(); }
|
||||
|
||||
virtual bool IsLastFrame(int32 frame) const {
|
||||
return src_->IsLastFrame(frame);
|
||||
}
|
||||
virtual BaseFloat FrameShiftInSeconds() const {
|
||||
return src_->FrameShiftInSeconds();
|
||||
}
|
||||
|
||||
// The online cmvn does not introduce any additional latency.
|
||||
virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
|
||||
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
|
||||
|
||||
//
|
||||
// Next, functions that are not in the interface.
|
||||
//
|
||||
|
||||
/// Initializer that sets the cmvn state. If you don't have previous
|
||||
/// utterances from the same speaker you are supposed to initialize the CMVN
|
||||
/// state from some global CMVN stats, which you can get from summing all cmvn
|
||||
/// stats you have in your training data using "sum-matrix". This just gives
|
||||
/// it a reasonable starting point at the start of the file.
|
||||
/// If you do have previous utterances from the same speaker or at least a
|
||||
/// similar environment, you are supposed to initialize it by calling GetState
|
||||
/// from the previous utterance
|
||||
OnlineCmvn(const OnlineCmvnOptions &opts,
|
||||
const OnlineCmvnState &cmvn_state,
|
||||
OnlineFeatureInterface *src);
|
||||
|
||||
/// Initializer that does not set the cmvn state:
|
||||
/// after calling this, you should call SetState().
|
||||
OnlineCmvn(const OnlineCmvnOptions &opts,
|
||||
OnlineFeatureInterface *src);
|
||||
|
||||
// Outputs any state information from this utterance to "cmvn_state".
|
||||
// The value of "cmvn_state" before the call does not matter: the output
|
||||
// depends on the value of OnlineCmvnState the class was initialized
|
||||
// with, the input feature values up to cur_frame, and the effects
|
||||
// of the user possibly having called Freeze().
|
||||
// If cur_frame is -1, it will just output the unmodified original
|
||||
// state that was supplied to this object.
|
||||
void GetState(int32 cur_frame,
|
||||
OnlineCmvnState *cmvn_state);
|
||||
|
||||
// This function can be used to modify the state of the CMVN computation
|
||||
// from outside, but must only be called before you have processed any data
|
||||
// (otherwise it will crash). This "state" is really just the information
|
||||
// that is propagated between utterances, not the state of the computation
|
||||
// inside an utterance.
|
||||
void SetState(const OnlineCmvnState &cmvn_state);
|
||||
|
||||
// From this point it will freeze the CMN to what it would have been if
|
||||
// measured at frame "cur_frame", and it will stop it from changing
|
||||
// further. This also applies retroactively for this utterance, so if you
|
||||
// call GetFrame() on previous frames, it will use the CMVN stats
|
||||
// from cur_frame; and it applies in the future too if you then
|
||||
// call OutputState() and use this state to initialize the next
|
||||
// utterance's CMVN object.
|
||||
void Freeze(int32 cur_frame);
|
||||
|
||||
virtual ~OnlineCmvn();
|
||||
private:
|
||||
|
||||
/// Smooth the CMVN stats "stats" (which are stored in the normal format as a
|
||||
/// 2 x (dim+1) matrix), by possibly adding some stats from "global_stats"
|
||||
/// and/or "speaker_stats", controlled by the config. The best way to
|
||||
/// understand the smoothing rule we use is just to look at the code.
|
||||
static void SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
|
||||
const MatrixBase<double> &global_stats,
|
||||
const OnlineCmvnOptions &opts,
|
||||
MatrixBase<double> *stats);
|
||||
|
||||
/// Get the most recent cached frame of CMVN stats. [If no frames
|
||||
/// were cached, sets up empty stats for frame zero and returns that].
|
||||
void GetMostRecentCachedFrame(int32 frame,
|
||||
int32 *cached_frame,
|
||||
MatrixBase<double> *stats);
|
||||
|
||||
/// Cache this frame of stats.
|
||||
void CacheFrame(int32 frame, const MatrixBase<double> &stats);
|
||||
|
||||
/// Initialize ring buffer for caching stats.
|
||||
inline void InitRingBufferIfNeeded();
|
||||
|
||||
/// Computes the raw CMVN stats for this frame, making use of (and updating if
|
||||
/// necessary) the cached statistics in raw_stats_. This means the (x,
|
||||
/// x^2, count) stats for the last up to opts_.cmn_window frames.
|
||||
void ComputeStatsForFrame(int32 frame,
|
||||
MatrixBase<double> *stats);
|
||||
|
||||
|
||||
OnlineCmvnOptions opts_;
|
||||
std::vector<int32> skip_dims_; // Skip CMVN for these dimensions. Derived from opts_.
|
||||
OnlineCmvnState orig_state_; // reflects the state before we saw this
|
||||
// utterance.
|
||||
Matrix<double> frozen_state_; // If the user called Freeze(), this variable
|
||||
// will reflect the CMVN state that we froze
|
||||
// at.
|
||||
|
||||
// The variable below reflects the raw (count, x, x^2) statistics of the
|
||||
// input, computed every opts_.modulus frames. raw_stats_[n / opts_.modulus]
|
||||
// contains the (count, x, x^2) statistics for the frames from
|
||||
// std::max(0, n - opts_.cmn_window) through n.
|
||||
std::vector<Matrix<double>*> cached_stats_modulo_;
|
||||
// the variable below is a ring-buffer of cached stats. the int32 is the
|
||||
// frame index.
|
||||
std::vector<std::pair<int32, Matrix<double> > > cached_stats_ring_;
|
||||
|
||||
// Some temporary variables used inside functions of this class, which
|
||||
// put here to avoid reallocation.
|
||||
Matrix<double> temp_stats_;
|
||||
Vector<BaseFloat> temp_feats_;
|
||||
Vector<double> temp_feats_dbl_;
|
||||
|
||||
OnlineFeatureInterface *src_; // Not owned here
|
||||
};
|
||||
|
||||
|
||||
struct OnlineSpliceOptions {
|
||||
int32 left_context;
|
||||
int32 right_context;
|
||||
OnlineSpliceOptions(): left_context(4), right_context(4) { }
|
||||
void Register(ParseOptions *po) {
|
||||
po->Register("left-context", &left_context, "Left-context for frame "
|
||||
"splicing prior to LDA");
|
||||
po->Register("right-context", &right_context, "Right-context for frame "
|
||||
"splicing prior to LDA");
|
||||
}
|
||||
};
|
||||
|
||||
class OnlineSpliceFrames: public OnlineFeatureInterface {
|
||||
public:
|
||||
//
|
||||
// First, functions that are present in the interface:
|
||||
//
|
||||
virtual int32 Dim() const {
|
||||
return src_->Dim() * (1 + left_context_ + right_context_);
|
||||
}
|
||||
|
||||
virtual bool IsLastFrame(int32 frame) const {
|
||||
return src_->IsLastFrame(frame);
|
||||
}
|
||||
virtual BaseFloat FrameShiftInSeconds() const {
|
||||
return src_->FrameShiftInSeconds();
|
||||
}
|
||||
|
||||
virtual int32 NumFramesReady() const;
|
||||
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
|
||||
|
||||
//
|
||||
// Next, functions that are not in the interface.
|
||||
//
|
||||
OnlineSpliceFrames(const OnlineSpliceOptions &opts,
|
||||
OnlineFeatureInterface *src):
|
||||
left_context_(opts.left_context), right_context_(opts.right_context),
|
||||
src_(src) { }
|
||||
|
||||
private:
|
||||
int32 left_context_;
|
||||
int32 right_context_;
|
||||
OnlineFeatureInterface *src_; // Not owned here
|
||||
};
|
||||
|
||||
/// This online-feature class implements any affine or linear transform.
|
||||
class OnlineTransform: public OnlineFeatureInterface {
|
||||
public:
|
||||
//
|
||||
// First, functions that are present in the interface:
|
||||
//
|
||||
virtual int32 Dim() const { return offset_.Dim(); }
|
||||
|
||||
virtual bool IsLastFrame(int32 frame) const {
|
||||
return src_->IsLastFrame(frame);
|
||||
}
|
||||
virtual BaseFloat FrameShiftInSeconds() const {
|
||||
return src_->FrameShiftInSeconds();
|
||||
}
|
||||
|
||||
virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
|
||||
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
|
||||
|
||||
virtual void GetFrames(const std::vector<int32> &frames,
|
||||
MatrixBase<BaseFloat> *feats);
|
||||
|
||||
//
|
||||
// Next, functions that are not in the interface.
|
||||
//
|
||||
|
||||
/// The transform can be a linear transform, or an affine transform
|
||||
/// where the last column is the offset.
|
||||
OnlineTransform(const MatrixBase<BaseFloat> &transform,
|
||||
OnlineFeatureInterface *src);
|
||||
|
||||
|
||||
private:
|
||||
OnlineFeatureInterface *src_; // Not owned here
|
||||
Matrix<BaseFloat> linear_term_;
|
||||
Vector<BaseFloat> offset_;
|
||||
};
|
||||
|
||||
class OnlineDeltaFeature: public OnlineFeatureInterface {
|
||||
public:
|
||||
//
|
||||
// First, functions that are present in the interface:
|
||||
//
|
||||
virtual int32 Dim() const;
|
||||
|
||||
virtual bool IsLastFrame(int32 frame) const {
|
||||
return src_->IsLastFrame(frame);
|
||||
}
|
||||
virtual BaseFloat FrameShiftInSeconds() const {
|
||||
return src_->FrameShiftInSeconds();
|
||||
}
|
||||
|
||||
virtual int32 NumFramesReady() const;
|
||||
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
|
||||
|
||||
//
|
||||
// Next, functions that are not in the interface.
|
||||
//
|
||||
OnlineDeltaFeature(const DeltaFeaturesOptions &opts,
|
||||
OnlineFeatureInterface *src);
|
||||
|
||||
private:
|
||||
OnlineFeatureInterface *src_; // Not owned here
|
||||
DeltaFeaturesOptions opts_;
|
||||
DeltaFeatures delta_features_; // This class contains just a few
|
||||
// coefficients.
|
||||
};
|
||||
|
||||
|
||||
/// This feature type can be used to cache its input, to avoid
|
||||
/// repetition of computation in a multi-pass decoding context.
|
||||
class OnlineCacheFeature: public OnlineFeatureInterface {
|
||||
public:
|
||||
virtual int32 Dim() const { return src_->Dim(); }
|
||||
|
||||
virtual bool IsLastFrame(int32 frame) const {
|
||||
return src_->IsLastFrame(frame);
|
||||
}
|
||||
virtual BaseFloat FrameShiftInSeconds() const {
|
||||
return src_->FrameShiftInSeconds();
|
||||
}
|
||||
|
||||
virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
|
||||
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
|
||||
|
||||
virtual void GetFrames(const std::vector<int32> &frames,
|
||||
MatrixBase<BaseFloat> *feats);
|
||||
|
||||
virtual ~OnlineCacheFeature() { ClearCache(); }
|
||||
|
||||
// Things that are not in the shared interface:
|
||||
|
||||
void ClearCache(); // this should be called if you change the underlying
|
||||
// features in some way.
|
||||
|
||||
explicit OnlineCacheFeature(OnlineFeatureInterface *src): src_(src) { }
|
||||
private:
|
||||
|
||||
OnlineFeatureInterface *src_; // Not owned here
|
||||
std::vector<Vector<BaseFloat>* > cache_;
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/// This online-feature class implements combination of two feature
|
||||
/// streams (such as pitch, plp) into one stream.
|
||||
class OnlineAppendFeature: public OnlineFeatureInterface {
|
||||
public:
|
||||
virtual int32 Dim() const { return src1_->Dim() + src2_->Dim(); }
|
||||
|
||||
virtual bool IsLastFrame(int32 frame) const {
|
||||
return (src1_->IsLastFrame(frame) || src2_->IsLastFrame(frame));
|
||||
}
|
||||
// Hopefully sources have the same rate
|
||||
virtual BaseFloat FrameShiftInSeconds() const {
|
||||
return src1_->FrameShiftInSeconds();
|
||||
}
|
||||
|
||||
virtual int32 NumFramesReady() const {
|
||||
return std::min(src1_->NumFramesReady(), src2_->NumFramesReady());
|
||||
}
|
||||
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
|
||||
|
||||
virtual ~OnlineAppendFeature() { }
|
||||
|
||||
OnlineAppendFeature(OnlineFeatureInterface *src1,
|
||||
OnlineFeatureInterface *src2): src1_(src1), src2_(src2) { }
|
||||
private:
|
||||
|
||||
OnlineFeatureInterface *src1_;
|
||||
OnlineFeatureInterface *src2_;
|
||||
};
|
||||
|
||||
/// @} End of "addtogroup onlinefeat"
|
||||
} // namespace kaldi
|
||||
|
||||
#endif // KALDI_FEAT_ONLINE_FEATURE_H_
|
File diff suppressed because it is too large
Load Diff
@ -1,450 +0,0 @@
|
||||
// feat/pitch-functions.h
|
||||
|
||||
// Copyright 2013 Pegah Ghahremani
|
||||
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
|
||||
// 2014 Yanqing Sun, Junjie Wang,
|
||||
// Daniel Povey, Korbinian Riedhammer
|
||||
// Xin Lei
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_PITCH_FUNCTIONS_H_
|
||||
#define KALDI_FEAT_PITCH_FUNCTIONS_H_
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "base/kaldi-error.h"
|
||||
#include "feat/mel-computations.h"
|
||||
#include "feat/online-feature-itf.h"
|
||||
#include "matrix/matrix-lib.h"
|
||||
#include "util/common-utils.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
struct PitchExtractionOptions {
|
||||
// FrameExtractionOptions frame_opts;
|
||||
BaseFloat samp_freq; // sample frequency in hertz
|
||||
BaseFloat frame_shift_ms; // in milliseconds.
|
||||
BaseFloat frame_length_ms; // in milliseconds.
|
||||
BaseFloat preemph_coeff; // Preemphasis coefficient. [use is deprecated.]
|
||||
BaseFloat min_f0; // min f0 to search (Hz)
|
||||
BaseFloat max_f0; // max f0 to search (Hz)
|
||||
BaseFloat soft_min_f0; // Minimum f0, applied in soft way, must not
|
||||
// exceed min-f0
|
||||
BaseFloat penalty_factor; // cost factor for FO change
|
||||
BaseFloat lowpass_cutoff; // cutoff frequency for Low pass filter
|
||||
BaseFloat resample_freq; // Integer that determines filter width when
|
||||
// upsampling NCCF
|
||||
BaseFloat delta_pitch; // the pitch tolerance in pruning lags
|
||||
BaseFloat nccf_ballast; // Increasing this factor reduces NCCF for
|
||||
// quiet frames, helping ensure pitch
|
||||
// continuity in unvoiced region
|
||||
int32 lowpass_filter_width; // Integer that determines filter width of
|
||||
// lowpass filter
|
||||
int32 upsample_filter_width; // Integer that determines filter width when
|
||||
// upsampling NCCF
|
||||
|
||||
// Below are newer config variables, not present in the original paper,
|
||||
// that relate to the online pitch extraction algorithm.
|
||||
|
||||
// The maximum number of frames of latency that we allow the pitch-processing
|
||||
// to introduce, for online operation. If you set this to a large value,
|
||||
// there would be no inaccuracy from the Viterbi traceback (but it might make
|
||||
// you wait to see the pitch). This is not very relevant for the online
|
||||
// operation: normalization-right-context is more relevant, you
|
||||
// can just leave this value at zero.
|
||||
int32 max_frames_latency;
|
||||
|
||||
// Only relevant for the function ComputeKaldiPitch which is called by
|
||||
// compute-kaldi-pitch-feats. If nonzero, we provide the input as chunks of
|
||||
// this size. This affects the energy normalization which has a small effect
|
||||
// on the resulting features, especially at the beginning of a file. For best
|
||||
// compatibility with online operation (e.g. if you plan to train models for
|
||||
// the online-deocding setup), you might want to set this to a small value,
|
||||
// like one frame.
|
||||
int32 frames_per_chunk;
|
||||
|
||||
// Only relevant for the function ComputeKaldiPitch which is called by
|
||||
// compute-kaldi-pitch-feats, and only relevant if frames_per_chunk is
|
||||
// nonzero. If true, it will query the features as soon as they are
|
||||
// available, which simulates the first-pass features you would get in online
|
||||
// decoding. If false, the features you will get will be the same as those
|
||||
// available at the end of the utterance, after InputFinished() has been
|
||||
// called: e.g. during lattice rescoring.
|
||||
bool simulate_first_pass_online;
|
||||
|
||||
// Only relevant for online operation or when emulating online operation
|
||||
// (e.g. when setting frames_per_chunk). This is the frame-index on which we
|
||||
// recompute the NCCF (e.g. frame-index 500 = after 5 seconds); if the
|
||||
// segment ends before this we do it when the segment ends. We do this by
|
||||
// re-computing the signal average energy, which affects the NCCF via the
|
||||
// "ballast term", scaling the resampled NCCF by a factor derived from the
|
||||
// average change in the "ballast term", and re-doing the backtrace
|
||||
// computation. Making this infinity would be the most exact, but would
|
||||
// introduce unwanted latency at the end of long utterances, for little
|
||||
// benefit.
|
||||
int32 recompute_frame;
|
||||
|
||||
// This is a "hidden config" used only for testing the online pitch
|
||||
// extraction. If true, we compute the signal root-mean-squared for the
|
||||
// ballast term, only up to the current frame, rather than the end of the
|
||||
// current chunk of signal. This makes the output insensitive to the
|
||||
// chunking, which is useful for testing purposes.
|
||||
bool nccf_ballast_online;
|
||||
bool snip_edges;
|
||||
PitchExtractionOptions():
|
||||
samp_freq(16000),
|
||||
frame_shift_ms(10.0),
|
||||
frame_length_ms(25.0),
|
||||
preemph_coeff(0.0),
|
||||
min_f0(50),
|
||||
max_f0(400),
|
||||
soft_min_f0(10.0),
|
||||
penalty_factor(0.1),
|
||||
lowpass_cutoff(1000),
|
||||
resample_freq(4000),
|
||||
delta_pitch(0.005),
|
||||
nccf_ballast(7000),
|
||||
lowpass_filter_width(1),
|
||||
upsample_filter_width(5),
|
||||
max_frames_latency(0),
|
||||
frames_per_chunk(0),
|
||||
simulate_first_pass_online(false),
|
||||
recompute_frame(500),
|
||||
nccf_ballast_online(false),
|
||||
snip_edges(true) { }
|
||||
|
||||
void Register(OptionsItf *opts) {
|
||||
opts->Register("sample-frequency", &samp_freq,
|
||||
"Waveform data sample frequency (must match the waveform "
|
||||
"file, if specified there)");
|
||||
opts->Register("frame-length", &frame_length_ms, "Frame length in "
|
||||
"milliseconds");
|
||||
opts->Register("frame-shift", &frame_shift_ms, "Frame shift in "
|
||||
"milliseconds");
|
||||
opts->Register("preemphasis-coefficient", &preemph_coeff,
|
||||
"Coefficient for use in signal preemphasis (deprecated)");
|
||||
opts->Register("min-f0", &min_f0,
|
||||
"min. F0 to search for (Hz)");
|
||||
opts->Register("max-f0", &max_f0,
|
||||
"max. F0 to search for (Hz)");
|
||||
opts->Register("soft-min-f0", &soft_min_f0,
|
||||
"Minimum f0, applied in soft way, must not exceed min-f0");
|
||||
opts->Register("penalty-factor", &penalty_factor,
|
||||
"cost factor for FO change.");
|
||||
opts->Register("lowpass-cutoff", &lowpass_cutoff,
|
||||
"cutoff frequency for LowPass filter (Hz) ");
|
||||
opts->Register("resample-frequency", &resample_freq,
|
||||
"Frequency that we down-sample the signal to. Must be "
|
||||
"more than twice lowpass-cutoff");
|
||||
opts->Register("delta-pitch", &delta_pitch,
|
||||
"Smallest relative change in pitch that our algorithm "
|
||||
"measures");
|
||||
opts->Register("nccf-ballast", &nccf_ballast,
|
||||
"Increasing this factor reduces NCCF for quiet frames");
|
||||
opts->Register("nccf-ballast-online", &nccf_ballast_online,
|
||||
"This is useful mainly for debug; it affects how the NCCF "
|
||||
"ballast is computed.");
|
||||
opts->Register("lowpass-filter-width", &lowpass_filter_width,
|
||||
"Integer that determines filter width of "
|
||||
"lowpass filter, more gives sharper filter");
|
||||
opts->Register("upsample-filter-width", &upsample_filter_width,
|
||||
"Integer that determines filter width when upsampling NCCF");
|
||||
opts->Register("frames-per-chunk", &frames_per_chunk, "Only relevant for "
|
||||
"offline pitch extraction (e.g. compute-kaldi-pitch-feats), "
|
||||
"you can set it to a small nonzero value, such as 10, for "
|
||||
"better feature compatibility with online decoding (affects "
|
||||
"energy normalization in the algorithm)");
|
||||
opts->Register("simulate-first-pass-online", &simulate_first_pass_online,
|
||||
"If true, compute-kaldi-pitch-feats will output features "
|
||||
"that correspond to what an online decoder would see in the "
|
||||
"first pass of decoding-- not the final version of the "
|
||||
"features, which is the default. Relevant if "
|
||||
"--frames-per-chunk > 0");
|
||||
opts->Register("recompute-frame", &recompute_frame, "Only relevant for "
|
||||
"online pitch extraction, or for compatibility with online "
|
||||
"pitch extraction. A non-critical parameter; the frame at "
|
||||
"which we recompute some of the forward pointers, after "
|
||||
"revising our estimate of the signal energy. Relevant if"
|
||||
"--frames-per-chunk > 0");
|
||||
opts->Register("max-frames-latency", &max_frames_latency, "Maximum number "
|
||||
"of frames of latency that we allow pitch tracking to "
|
||||
"introduce into the feature processing (affects output only "
|
||||
"if --frames-per-chunk > 0 and "
|
||||
"--simulate-first-pass-online=true");
|
||||
opts->Register("snip-edges", &snip_edges, "If this is set to false, the "
|
||||
"incomplete frames near the ending edge won't be snipped, "
|
||||
"so that the number of frames is the file size divided by "
|
||||
"the frame-shift. This makes different types of features "
|
||||
"give the same number of frames.");
|
||||
}
|
||||
/// Returns the window-size in samples, after resampling. This is the
|
||||
/// "basic window size", not the full window size after extending by max-lag.
|
||||
// Because of floating point representation, it is more reliable to divide
|
||||
// by 1000 instead of multiplying by 0.001, but it is a bit slower.
|
||||
int32 NccfWindowSize() const {
|
||||
return static_cast<int32>(resample_freq * frame_length_ms / 1000.0);
|
||||
}
|
||||
/// Returns the window-shift in samples, after resampling.
|
||||
int32 NccfWindowShift() const {
|
||||
return static_cast<int32>(resample_freq * frame_shift_ms / 1000.0);
|
||||
}
|
||||
};
|
||||
|
||||
struct ProcessPitchOptions {
|
||||
BaseFloat pitch_scale; // the final normalized-log-pitch feature is scaled
|
||||
// with this value
|
||||
BaseFloat pov_scale; // the final POV feature is scaled with this value
|
||||
BaseFloat pov_offset; // An offset that can be added to the final POV
|
||||
// feature (useful for online-decoding, where we don't
|
||||
// do CMN to the pitch-derived features.
|
||||
|
||||
BaseFloat delta_pitch_scale;
|
||||
BaseFloat delta_pitch_noise_stddev; // stddev of noise we add to delta-pitch
|
||||
int32 normalization_left_context; // left-context used for sliding-window
|
||||
// normalization
|
||||
int32 normalization_right_context; // this should be reduced in online
|
||||
// decoding to reduce latency
|
||||
|
||||
int32 delta_window;
|
||||
int32 delay;
|
||||
|
||||
bool add_pov_feature;
|
||||
bool add_normalized_log_pitch;
|
||||
bool add_delta_pitch;
|
||||
bool add_raw_log_pitch;
|
||||
|
||||
ProcessPitchOptions() :
|
||||
pitch_scale(2.0),
|
||||
pov_scale(2.0),
|
||||
pov_offset(0.0),
|
||||
delta_pitch_scale(10.0),
|
||||
delta_pitch_noise_stddev(0.005),
|
||||
normalization_left_context(75),
|
||||
normalization_right_context(75),
|
||||
delta_window(2),
|
||||
delay(0),
|
||||
add_pov_feature(true),
|
||||
add_normalized_log_pitch(true),
|
||||
add_delta_pitch(true),
|
||||
add_raw_log_pitch(false) { }
|
||||
|
||||
|
||||
void Register(ParseOptions *opts) {
|
||||
opts->Register("pitch-scale", &pitch_scale,
|
||||
"Scaling factor for the final normalized log-pitch value");
|
||||
opts->Register("pov-scale", &pov_scale,
|
||||
"Scaling factor for final POV (probability of voicing) "
|
||||
"feature");
|
||||
opts->Register("pov-offset", &pov_offset,
|
||||
"This can be used to add an offset to the POV feature. "
|
||||
"Intended for use in online decoding as a substitute for "
|
||||
" CMN.");
|
||||
opts->Register("delta-pitch-scale", &delta_pitch_scale,
|
||||
"Term to scale the final delta log-pitch feature");
|
||||
opts->Register("delta-pitch-noise-stddev", &delta_pitch_noise_stddev,
|
||||
"Standard deviation for noise we add to the delta log-pitch "
|
||||
"(before scaling); should be about the same as delta-pitch "
|
||||
"option to pitch creation. The purpose is to get rid of "
|
||||
"peaks in the delta-pitch caused by discretization of pitch "
|
||||
"values.");
|
||||
opts->Register("normalization-left-context", &normalization_left_context,
|
||||
"Left-context (in frames) for moving window normalization");
|
||||
opts->Register("normalization-right-context", &normalization_right_context,
|
||||
"Right-context (in frames) for moving window normalization");
|
||||
opts->Register("delta-window", &delta_window,
|
||||
"Number of frames on each side of central frame, to use for "
|
||||
"delta window.");
|
||||
opts->Register("delay", &delay,
|
||||
"Number of frames by which the pitch information is "
|
||||
"delayed.");
|
||||
opts->Register("add-pov-feature", &add_pov_feature,
|
||||
"If true, the warped NCCF is added to output features");
|
||||
opts->Register("add-normalized-log-pitch", &add_normalized_log_pitch,
|
||||
"If true, the log-pitch with POV-weighted mean subtraction "
|
||||
"over 1.5 second window is added to output features");
|
||||
opts->Register("add-delta-pitch", &add_delta_pitch,
|
||||
"If true, time derivative of log-pitch is added to output "
|
||||
"features");
|
||||
opts->Register("add-raw-log-pitch", &add_raw_log_pitch,
|
||||
"If true, log(pitch) is added to output features");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// We don't want to expose the pitch-extraction internals here as it's
|
||||
// quite complex, so we use a private implementation.
|
||||
class OnlinePitchFeatureImpl;
|
||||
|
||||
|
||||
// Note: to start on a new waveform, just construct a new version
|
||||
// of this object.
|
||||
/// Online pitch extractor. Produces a raw two-dimensional feature per frame,
/// ordered (NCCF, pitch) as noted on Dim(). All member functions other than
/// Dim() are only declared here; their definitions live elsewhere.
class OnlinePitchFeature: public OnlineBaseFeature {
|
||||
public:
|
||||
/// Constructs the extractor from the given pitch-extraction options.
explicit OnlinePitchFeature(const PitchExtractionOptions &opts);
|
||||
|
||||
/// Output dimension is fixed at 2.
virtual int32 Dim() const { return 2; /* (NCCF, pitch) */ }
|
||||
|
||||
virtual int32 NumFramesReady() const;
|
||||
|
||||
virtual BaseFloat FrameShiftInSeconds() const;
|
||||
|
||||
virtual bool IsLastFrame(int32 frame) const;
|
||||
|
||||
/// Outputs the two-dimensional feature consisting of (NCCF, pitch). You
|
||||
/// should probably post-process this using class OnlineProcessPitch.
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
|
||||
|
||||
/// Feeds waveform samples to the extractor.
/// NOTE(review): presumably may be called repeatedly with successive chunks
/// of the same signal -- confirm against the implementation.
virtual void AcceptWaveform(BaseFloat sampling_rate,
|
||||
const VectorBase<BaseFloat> &waveform);
|
||||
|
||||
virtual void InputFinished();
|
||||
|
||||
virtual ~OnlinePitchFeature();
|
||||
|
||||
private:
|
||||
// Pointer to the private implementation class (forward-declared above).
// NOTE(review): ownership/lifetime is not visible in this header -- confirm
// in the .cc before copying objects of this class.
OnlinePitchFeatureImpl *impl_;
|
||||
};
|
||||
|
||||
|
||||
/// This online-feature class implements post processing of pitch features.
|
||||
/// Inputs are original 2 dims (nccf, pitch). It can produce various
|
||||
/// kinds of outputs, using the default options it will be (pov-feature,
|
||||
/// normalized-log-pitch, delta-log-pitch).
|
||||
class OnlineProcessPitch: public OnlineFeatureInterface {
|
||||
public:
|
||||
virtual int32 Dim() const { return dim_; }
|
||||
|
||||
virtual bool IsLastFrame(int32 frame) const {
|
||||
if (frame <= -1)
|
||||
return src_->IsLastFrame(-1);
|
||||
else if (frame < opts_.delay)
|
||||
return src_->IsLastFrame(-1) == true ? false : src_->IsLastFrame(0);
|
||||
else
|
||||
return src_->IsLastFrame(frame - opts_.delay);
|
||||
}
|
||||
virtual BaseFloat FrameShiftInSeconds() const {
|
||||
return src_->FrameShiftInSeconds();
|
||||
}
|
||||
|
||||
virtual int32 NumFramesReady() const;
|
||||
|
||||
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
|
||||
|
||||
virtual ~OnlineProcessPitch() { }
|
||||
|
||||
// Does not take ownership of "src".
|
||||
OnlineProcessPitch(const ProcessPitchOptions &opts,
|
||||
OnlineFeatureInterface *src);
|
||||
|
||||
private:
|
||||
enum { kRawFeatureDim = 2}; // anonymous enum to define a constant.
|
||||
// kRawFeatureDim defines the dimension
|
||||
// of the input: (nccf, pitch)
|
||||
|
||||
ProcessPitchOptions opts_;
|
||||
OnlineFeatureInterface *src_;
|
||||
int32 dim_; // Output feature dimension, set in initializer.
|
||||
|
||||
struct NormalizationStats {
|
||||
int32 cur_num_frames; // value of src_->NumFramesReady() when
|
||||
// "mean_pitch" was set.
|
||||
bool input_finished; // true if input data was finished when
|
||||
// "mean_pitch" was computed.
|
||||
double sum_pov; // sum of pov over relevant range
|
||||
double sum_log_pitch_pov; // sum of log(pitch) * pov over relevant range
|
||||
|
||||
NormalizationStats(): cur_num_frames(-1), input_finished(false),
|
||||
sum_pov(0.0), sum_log_pitch_pov(0.0) { }
|
||||
};
|
||||
|
||||
std::vector<BaseFloat> delta_feature_noise_;
|
||||
|
||||
std::vector<NormalizationStats> normalization_stats_;
|
||||
|
||||
/// Computes and returns the POV feature for this frame.
|
||||
/// Called from GetFrame().
|
||||
inline BaseFloat GetPovFeature(int32 frame) const;
|
||||
|
||||
/// Computes and returns the delta-log-pitch feature for this frame.
|
||||
/// Called from GetFrame().
|
||||
inline BaseFloat GetDeltaPitchFeature(int32 frame);
|
||||
|
||||
/// Computes and returns the raw log-pitch feature for this frame.
|
||||
/// Called from GetFrame().
|
||||
inline BaseFloat GetRawLogPitchFeature(int32 frame) const;
|
||||
|
||||
/// Computes and returns the mean-subtracted log-pitch feature for this frame.
|
||||
/// Called from GetFrame().
|
||||
inline BaseFloat GetNormalizedLogPitchFeature(int32 frame);
|
||||
|
||||
/// Computes the normalization window sizes.
|
||||
inline void GetNormalizationWindow(int32 frame,
|
||||
int32 src_frames_ready,
|
||||
int32 *window_begin,
|
||||
int32 *window_end) const;
|
||||
|
||||
/// Makes sure the entry in normalization_stats_ for this frame is up to date;
|
||||
/// called from GetNormalizedLogPitchFeature.
|
||||
inline void UpdateNormalizationStats(int32 frame);
|
||||
};
|
||||
|
||||
|
||||
/// This function extracts (NCCF, pitch) per frame, using the pitch extraction
|
||||
/// method described in "A Pitch Extraction Algorithm Tuned for Automatic Speech
|
||||
/// Recognition", Pegah Ghahremani, Bagher BabaAli, Daniel Povey, Korbinian
|
||||
/// Riedhammer, Jan Trmal and Sanjeev Khudanpur, ICASSP 2014. The output will
|
||||
/// have as many rows as there are frames, and two columns corresponding to
|
||||
/// (NCCF, pitch)
|
||||
// Parameters:
//   opts   - pitch-extraction options.
//   wave   - presumably the input waveform samples (inferred from the name;
//            confirm against the definition).
//   output - receives the result: one row per frame, two columns.
void ComputeKaldiPitch(const PitchExtractionOptions &opts,
|
||||
const VectorBase<BaseFloat> &wave,
|
||||
Matrix<BaseFloat> *output);
|
||||
|
||||
/// This function processes the raw (NCCF, pitch) quantities computed by
|
||||
/// ComputeKaldiPitch, and processes them into features. By default it will
|
||||
/// output three-dimensional features, (POV-feature, mean-subtracted-log-pitch,
|
||||
/// delta-of-raw-pitch), but this is configurable in the options. The number of
|
||||
/// rows of "output" will be the number of frames (rows) in "input", and the
|
||||
/// number of columns will be the number of different types of features
|
||||
/// requested (by default, 3; 4 is the max). The four config variables
|
||||
/// --add-pov-feature, --add-normalized-log-pitch, --add-delta-pitch,
|
||||
/// --add-raw-log-pitch determine which features we create; by default we create
|
||||
/// the first three.
|
||||
// Parameters:
//   opts   - post-processing options selecting which features are produced.
//   input  - raw per-frame quantities, one row per frame.
//   output - receives the processed features; same number of rows as "input".
void ProcessPitch(const ProcessPitchOptions &opts,
|
||||
const MatrixBase<BaseFloat> &input,
|
||||
Matrix<BaseFloat> *output);
|
||||
|
||||
/// This function combines ComputeKaldiPitch and ProcessPitch. The reason
|
||||
/// why we need a separate function to do this is in order to be able to
|
||||
/// accurately simulate the online pitch-processing, for testing and for
|
||||
/// training models matched to the "first-pass" features. It is sensitive to
|
||||
/// the variables in pitch_opts that relate to online processing,
|
||||
/// i.e. max_frames_latency, frames_per_chunk, simulate_first_pass_online,
|
||||
/// recompute_frame.
|
||||
// Parameters:
//   pitch_opts   - pitch-extraction options (including the online-related
//                  settings this function is sensitive to).
//   process_opts - pitch post-processing options.
//   wave         - presumably the input waveform samples (inferred from the
//                  name; confirm against the definition).
//   output       - receives the processed features.
void ComputeAndProcessKaldiPitch(const PitchExtractionOptions &pitch_opts,
|
||||
const ProcessPitchOptions &process_opts,
|
||||
const VectorBase<BaseFloat> &wave,
|
||||
Matrix<BaseFloat> *output);
|
||||
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
} // namespace kaldi
|
||||
#endif // KALDI_FEAT_PITCH_FUNCTIONS_H_
|
@ -1,377 +0,0 @@
|
||||
// feat/resample.cc
|
||||
|
||||
// Copyright 2013 Pegah Ghahremani
|
||||
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
|
||||
// 2014 Yanqing Sun, Junjie Wang
|
||||
// 2014 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include "feat/feature-functions.h"
|
||||
#include "matrix/matrix-functions.h"
|
||||
#include "feat/resample.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
/// Stores the user-supplied rates and filter settings, validates them,
/// derives the size of the smallest repeating unit of input/output samples,
/// precomputes the filter weights and clears the streaming state.
LinearResample::LinearResample(int32 samp_rate_in_hz,
                               int32 samp_rate_out_hz,
                               BaseFloat filter_cutoff_hz,
                               int32 num_zeros):
    samp_rate_in_(samp_rate_in_hz),
    samp_rate_out_(samp_rate_out_hz),
    filter_cutoff_(filter_cutoff_hz),
    num_zeros_(num_zeros) {
  KALDI_ASSERT(samp_rate_in_hz > 0.0 &&
               samp_rate_out_hz > 0.0 &&
               filter_cutoff_hz > 0.0 &&
               filter_cutoff_hz*2 <= samp_rate_in_hz &&
               filter_cutoff_hz*2 <= samp_rate_out_hz &&
               num_zeros > 0);

  // The repeating unit runs at the greatest common divisor of the two
  // rates; dividing each rate by it gives the samples per unit.
  const int32 unit_freq = Gcd(samp_rate_in_, samp_rate_out_);
  output_samples_in_unit_ = samp_rate_out_ / unit_freq;
  input_samples_in_unit_ = samp_rate_in_ / unit_freq;

  SetIndexesAndWeights();
  Reset();
}
|
||||
|
||||
/// Returns how many output samples can be produced for a signal of
/// "input_num_samp" input samples. Time is counted exactly, in integer
/// "ticks" of 1 / Lcm(samp_rate_in_, samp_rate_out_) seconds. When not
/// flushing, the usable interval is shortened by the filter's half-width.
int64 LinearResample::GetNumOutputSamples(int64 input_num_samp,
                                          bool flush) const {
  const int32 ticks_per_second = Lcm(samp_rate_in_, samp_rate_out_);
  const int32 ticks_per_input_samp = ticks_per_second / samp_rate_in_;
  const int32 ticks_per_output_samp = ticks_per_second / samp_rate_out_;

  // Length, in ticks, of the half-open interval
  // [ 0, input_num_samp / samp_rate_in_ ).
  int64 span_ticks = input_num_samp * ticks_per_input_samp;

  if (!flush) {
    // Distance from the centre to the edge of the windowing function.
    BaseFloat window_width = num_zeros_ / (2.0 * filter_cutoff_);
    // Taking the floor is enough: the interval is open on the right, so
    // shortening it by a fraction of a tick can never change which whole
    // output samples fit inside it.
    int32 window_ticks = floor(window_width * ticks_per_second);
    span_ticks -= window_ticks;
  }

  if (span_ticks <= 0)
    return 0;

  // Output sample k sits at tick k * ticks_per_output_samp and must lie
  // strictly before span_ticks. The largest such k is
  // (span_ticks - 1) / ticks_per_output_samp (integer division rounds down
  // for these positive values); indices start at zero, so the count is that
  // index plus one.
  const int64 last_output_samp = (span_ticks - 1) / ticks_per_output_samp;
  return last_output_samp + 1;
}
|
||||
|
||||
void LinearResample::SetIndexesAndWeights() {
|
||||
first_index_.resize(output_samples_in_unit_);
|
||||
weights_.resize(output_samples_in_unit_);
|
||||
|
||||
double window_width = num_zeros_ / (2.0 * filter_cutoff_);
|
||||
|
||||
for (int32 i = 0; i < output_samples_in_unit_; i++) {
|
||||
double output_t = i / static_cast<double>(samp_rate_out_);
|
||||
double min_t = output_t - window_width, max_t = output_t + window_width;
|
||||
// we do ceil on the min and floor on the max, because if we did it
|
||||
// the other way around we would unnecessarily include indexes just
|
||||
// outside the window, with zero coefficients. It's possible
|
||||
// if the arguments to the ceil and floor expressions are integers
|
||||
// (e.g. if filter_cutoff_ has an exact ratio with the sample rates),
|
||||
// that we unnecessarily include something with a zero coefficient,
|
||||
// but this is only a slight efficiency issue.
|
||||
int32 min_input_index = ceil(min_t * samp_rate_in_),
|
||||
max_input_index = floor(max_t * samp_rate_in_),
|
||||
num_indices = max_input_index - min_input_index + 1;
|
||||
first_index_[i] = min_input_index;
|
||||
weights_[i].Resize(num_indices);
|
||||
for (int32 j = 0; j < num_indices; j++) {
|
||||
int32 input_index = min_input_index + j;
|
||||
double input_t = input_index / static_cast<double>(samp_rate_in_),
|
||||
delta_t = input_t - output_t;
|
||||
// sign of delta_t doesn't matter.
|
||||
weights_[i](j) = FilterFunc(delta_t) / samp_rate_in_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// inline
|
||||
/// Splits an absolute output-sample index into (which repeating unit,
/// position inside that unit), then reports the position inside the unit
/// via *samp_out_wrapped and the first contributing absolute input-sample
/// index via *first_samp_in.
void LinearResample::GetIndexes(int64 samp_out,
                                int64 *first_samp_in,
                                int32 *samp_out_wrapped) const {
  // A unit is the smallest nonzero time span that is a whole number of both
  // input and output sample periods.
  const int64 unit = samp_out / output_samples_in_unit_;
  // Equivalent to samp_out % output_samples_in_unit_.
  const int32 pos_in_unit =
      static_cast<int32>(samp_out - unit * output_samples_in_unit_);

  *samp_out_wrapped = pos_in_unit;
  *first_samp_in = first_index_[pos_in_unit] + unit * input_samples_in_unit_;
}
|
||||
|
||||
|
||||
void LinearResample::Resample(const VectorBase<BaseFloat> &input,
|
||||
bool flush,
|
||||
Vector<BaseFloat> *output) {
|
||||
int32 input_dim = input.Dim();
|
||||
int64 tot_input_samp = input_sample_offset_ + input_dim,
|
||||
tot_output_samp = GetNumOutputSamples(tot_input_samp, flush);
|
||||
|
||||
KALDI_ASSERT(tot_output_samp >= output_sample_offset_);
|
||||
|
||||
output->Resize(tot_output_samp - output_sample_offset_);
|
||||
|
||||
// samp_out is the index into the total output signal, not just the part
|
||||
// of it we are producing here.
|
||||
for (int64 samp_out = output_sample_offset_;
|
||||
samp_out < tot_output_samp;
|
||||
samp_out++) {
|
||||
int64 first_samp_in;
|
||||
int32 samp_out_wrapped;
|
||||
GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped);
|
||||
const Vector<BaseFloat> &weights = weights_[samp_out_wrapped];
|
||||
// first_input_index is the first index into "input" that we have a weight
|
||||
// for.
|
||||
int32 first_input_index = static_cast<int32>(first_samp_in -
|
||||
input_sample_offset_);
|
||||
BaseFloat this_output;
|
||||
if (first_input_index >= 0 &&
|
||||
first_input_index + weights.Dim() <= input_dim) {
|
||||
SubVector<BaseFloat> input_part(input, first_input_index, weights.Dim());
|
||||
this_output = VecVec(input_part, weights);
|
||||
} else { // Handle edge cases.
|
||||
this_output = 0.0;
|
||||
for (int32 i = 0; i < weights.Dim(); i++) {
|
||||
BaseFloat weight = weights(i);
|
||||
int32 input_index = first_input_index + i;
|
||||
if (input_index < 0 && input_remainder_.Dim() + input_index >= 0) {
|
||||
this_output += weight *
|
||||
input_remainder_(input_remainder_.Dim() + input_index);
|
||||
} else if (input_index >= 0 && input_index < input_dim) {
|
||||
this_output += weight * input(input_index);
|
||||
} else if (input_index >= input_dim) {
|
||||
// We're past the end of the input and are adding zero; should only
|
||||
// happen if the user specified flush == true, or else we would not
|
||||
// be trying to output this sample.
|
||||
KALDI_ASSERT(flush);
|
||||
}
|
||||
}
|
||||
}
|
||||
int32 output_index = static_cast<int32>(samp_out - output_sample_offset_);
|
||||
(*output)(output_index) = this_output;
|
||||
}
|
||||
|
||||
if (flush) {
|
||||
Reset(); // Reset the internal state.
|
||||
} else {
|
||||
SetRemainder(input);
|
||||
input_sample_offset_ = tot_input_samp;
|
||||
output_sample_offset_ = tot_output_samp;
|
||||
}
|
||||
}
|
||||
|
||||
void LinearResample::SetRemainder(const VectorBase<BaseFloat> &input) {
|
||||
Vector<BaseFloat> old_remainder(input_remainder_);
|
||||
// max_remainder_needed is the width of the filter from side to side,
|
||||
// measured in input samples. you might think it should be half that,
|
||||
// but you have to consider that you might be wanting to output samples
|
||||
// that are "in the past" relative to the beginning of the latest
|
||||
// input... anyway, storing more remainder than needed is not harmful.
|
||||
int32 max_remainder_needed = ceil(samp_rate_in_ * num_zeros_ /
|
||||
filter_cutoff_);
|
||||
input_remainder_.Resize(max_remainder_needed);
|
||||
for (int32 index = - input_remainder_.Dim(); index < 0; index++) {
|
||||
// we interpret "index" as an offset from the end of "input" and
|
||||
// from the end of input_remainder_.
|
||||
int32 input_index = index + input.Dim();
|
||||
if (input_index >= 0)
|
||||
input_remainder_(index + input_remainder_.Dim()) = input(input_index);
|
||||
else if (input_index + old_remainder.Dim() >= 0)
|
||||
input_remainder_(index + input_remainder_.Dim()) =
|
||||
old_remainder(input_index + old_remainder.Dim());
|
||||
// else leave it at zero.
|
||||
}
|
||||
}
|
||||
|
||||
/// Discards all streaming state: empties the saved input tail and sets both
/// running sample offsets back to zero.
void LinearResample::Reset() {
  input_remainder_.Resize(0);
  output_sample_offset_ = 0;
  input_sample_offset_ = 0;
}
|
||||
|
||||
/** Here, t is a time in seconds representing an offset from
|
||||
the center of the windowed filter function, and FilterFunc(t)
|
||||
returns the windowed filter function, described
|
||||
in the header as h(t) = f(t)g(t), evaluated at t.
|
||||
*/
|
||||
/// Evaluates the windowed low-pass filter at time offset t (seconds): a sinc
/// term multiplied by a raised-cosine window that is nonzero only for
/// |t| < num_zeros_ / (2 * filter_cutoff_).
BaseFloat LinearResample::FilterFunc(BaseFloat t) const {
  // Sinc filter term; at t == 0 use the limit value to avoid dividing by
  // zero.
  const BaseFloat filter = (t != 0)
      ? sin(M_2PI * filter_cutoff_ * t) / (M_PI * t)
      : 2 * filter_cutoff_;

  // Raised-cosine (Hanning) window; zero outside its support.
  const BaseFloat window = (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_))
      ? 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t))
      : 0.0;

  return filter * window;
}
|
||||
|
||||
|
||||
/// Stores the configuration, validates it, and precomputes for every
/// requested sample point the range of input indexes it depends on and the
/// corresponding filter weights.
ArbitraryResample::ArbitraryResample(
    int32 num_samples_in, BaseFloat samp_rate_in,
    BaseFloat filter_cutoff, const Vector<BaseFloat> &sample_points,
    int32 num_zeros):
    num_samples_in_(num_samples_in),
    samp_rate_in_(samp_rate_in),
    filter_cutoff_(filter_cutoff),
    num_zeros_(num_zeros) {
  KALDI_ASSERT(num_samples_in > 0 && samp_rate_in > 0.0 &&
               filter_cutoff > 0.0 &&
               filter_cutoff * 2.0 <= samp_rate_in
               && num_zeros > 0);
  // Set up first_index_ and weights_. The work is split into two helpers:
  // SetIndexes() sizes everything, SetWeights() fills in the coefficients,
  // so the order of these two calls matters.
  SetIndexes(sample_points);
  SetWeights(sample_points);
}
|
||||
|
||||
|
||||
void ArbitraryResample::Resample(const MatrixBase<BaseFloat> &input,
|
||||
MatrixBase<BaseFloat> *output) const {
|
||||
// each row of "input" corresponds to the data to resample;
|
||||
// the corresponding row of "output" is the resampled data.
|
||||
|
||||
KALDI_ASSERT(input.NumRows() == output->NumRows() &&
|
||||
input.NumCols() == num_samples_in_ &&
|
||||
output->NumCols() == weights_.size());
|
||||
|
||||
Vector<BaseFloat> output_col(output->NumRows());
|
||||
for (int32 i = 0; i < NumSamplesOut(); i++) {
|
||||
SubMatrix<BaseFloat> input_part(input, 0, input.NumRows(),
|
||||
first_index_[i],
|
||||
weights_[i].Dim());
|
||||
const Vector<BaseFloat> &weight_vec(weights_[i]);
|
||||
output_col.AddMatVec(1.0, input_part,
|
||||
kNoTrans, weight_vec, 0.0);
|
||||
output->CopyColFromVec(output_col, i);
|
||||
}
|
||||
}
|
||||
|
||||
void ArbitraryResample::Resample(const VectorBase<BaseFloat> &input,
|
||||
VectorBase<BaseFloat> *output) const {
|
||||
KALDI_ASSERT(input.Dim() == num_samples_in_ &&
|
||||
output->Dim() == weights_.size());
|
||||
|
||||
int32 output_dim = output->Dim();
|
||||
for (int32 i = 0; i < output_dim; i++) {
|
||||
SubVector<BaseFloat> input_part(input, first_index_[i], weights_[i].Dim());
|
||||
(*output)(i) = VecVec(input_part, weights_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ArbitraryResample::SetIndexes(const Vector<BaseFloat> &sample_points) {
|
||||
int32 num_samples = sample_points.Dim();
|
||||
first_index_.resize(num_samples);
|
||||
weights_.resize(num_samples);
|
||||
BaseFloat filter_width = num_zeros_ / (2.0 * filter_cutoff_);
|
||||
for (int32 i = 0; i < num_samples; i++) {
|
||||
// the t values are in seconds.
|
||||
BaseFloat t = sample_points(i),
|
||||
t_min = t - filter_width, t_max = t + filter_width;
|
||||
int32 index_min = ceil(samp_rate_in_ * t_min),
|
||||
index_max = floor(samp_rate_in_ * t_max);
|
||||
// the ceil on index min and the floor on index_max are because there
|
||||
// is no point using indices just outside the window (coeffs would be zero).
|
||||
if (index_min < 0)
|
||||
index_min = 0;
|
||||
if (index_max >= num_samples_in_)
|
||||
index_max = num_samples_in_ - 1;
|
||||
first_index_[i] = index_min;
|
||||
weights_[i].Resize(index_max - index_min + 1);
|
||||
}
|
||||
}
|
||||
|
||||
void ArbitraryResample::SetWeights(const Vector<BaseFloat> &sample_points) {
|
||||
int32 num_samples_out = NumSamplesOut();
|
||||
for (int32 i = 0; i < num_samples_out; i++) {
|
||||
for (int32 j = 0 ; j < weights_[i].Dim(); j++) {
|
||||
BaseFloat delta_t = sample_points(i) -
|
||||
(first_index_[i] + j) / samp_rate_in_;
|
||||
// Include at this point the factor of 1.0 / samp_rate_in_ which
|
||||
// appears in the math.
|
||||
weights_[i](j) = FilterFunc(delta_t) / samp_rate_in_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Here, t is a time in seconds representing an offset from
|
||||
the center of the windowed filter function, and FilterFunc(t)
|
||||
returns the windowed filter function, described
|
||||
in the header as h(t) = f(t)g(t), evaluated at t.
|
||||
*/
|
||||
/// Evaluates the windowed low-pass filter at time offset t (seconds): a sinc
/// term multiplied by a raised-cosine window that is nonzero only for
/// |t| < num_zeros_ / (2 * filter_cutoff_).
BaseFloat ArbitraryResample::FilterFunc(BaseFloat t) const {
  // Sinc filter term; at t == 0 use the limit value to avoid dividing by
  // zero.
  const BaseFloat filter = (t != 0.0)
      ? sin(M_2PI * filter_cutoff_ * t) / (M_PI * t)
      : 2.0 * filter_cutoff_;

  // Raised-cosine (Hanning) window; zero outside its support.
  const BaseFloat window = (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_))
      ? 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t))
      : 0.0;

  return filter * window;
}
|
||||
|
||||
/// Convenience wrapper: resamples a complete waveform from orig_freq to
/// new_freq in one shot. The low-pass cutoff is set to 99% of half the lower
/// of the two rates, with a filter width of 6 zeros.
void ResampleWaveform(BaseFloat orig_freq, const VectorBase<BaseFloat> &wave,
                      BaseFloat new_freq, Vector<BaseFloat> *new_wave) {
  const BaseFloat lower_freq = std::min(orig_freq, new_freq);
  const BaseFloat cutoff = 0.99 * 0.5 * lower_freq;
  const int32 num_zeros = 6;
  // NOTE(review): LinearResample takes integer rates, so the BaseFloat
  // frequencies are converted implicitly here -- confirm callers only pass
  // whole-number rates.
  LinearResample resampler(orig_freq, new_freq, cutoff, num_zeros);
  // flush == true: the whole signal is processed in this single call.
  resampler.Resample(wave, true, new_wave);
}
|
||||
} // namespace kaldi
|
@ -1,287 +0,0 @@
|
||||
// feat/resample.h
|
||||
|
||||
// Copyright 2013 Pegah Ghahremani
|
||||
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
|
||||
// 2014 Yanqing Sun, Junjie Wang
|
||||
// 2014 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#ifndef KALDI_FEAT_RESAMPLE_H_
|
||||
#define KALDI_FEAT_RESAMPLE_H_
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
||||
#include "matrix/matrix-lib.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "base/kaldi-error.h"
|
||||
|
||||
namespace kaldi {
|
||||
/// @addtogroup feat FeatureExtraction
|
||||
/// @{
|
||||
|
||||
/**
|
||||
\file[resample.h]
|
||||
|
||||
This header contains declarations of classes for resampling signals. The
|
||||
normal cases of resampling a signal are upsampling and downsampling
|
||||
(increasing and decreasing the sample rate of a signal, respectively),
|
||||
although the ArbitraryResample class allows a more generic case where
|
||||
we want to get samples of a signal at uneven intervals (for instance,
|
||||
log-spaced).
|
||||
|
||||
The input signal is always evenly spaced, say sampled with frequency S, and
|
||||
we assume the original signal was band-limited to S/2 or lower. The n'th
|
||||
input sample x_n (with n = 0, 1, ...) is interpreted as the original
|
||||
signal's value at time n/S.
|
||||
|
||||
For resampling, it is convenient to view the input signal as a
|
||||
continuous function x(t) of t, where each sample x_n becomes a delta function
|
||||
with magnitude x_n/S, at time n/S. If we band limit this to the Nyquist
|
||||
frequency S/2, we can show that this is the same as the original signal
|
||||
that was sampled. [assuming the original signal was periodic and band
|
||||
limited.] In general we want to bandlimit to lower than S/2, because
|
||||
we don't have a perfect filter and also because if we want to resample
|
||||
at a lower frequency than S, we need to bandlimit to below half of that.
|
||||
Anyway, suppose we want to bandlimit to C, with 0 < C < S/2. The perfect
|
||||
rectangular filter with cutoff C is the sinc function,
|
||||
\f[ f(t) = 2C sinc(2Ct), \f]
|
||||
where sinc is the normalized sinc function \f$ sinc(t) = sin(pi t) / (pi t) \f$, with
|
||||
\f$ sinc(0) = 1 \f$. This is not a practical filter, though, because it has
|
||||
infinite support. At the cost of less-than-perfect rolloff, we can choose
|
||||
a suitable windowing function g(t), and use f(t) g(t) as the filter. For
|
||||
a windowing function we choose raised-cosine (Hanning) window with support
|
||||
on [-w/2C, w/2C], where w >= 1 is an integer chosen by the user. w = 1
|
||||
means we window the sinc function out to its first zero on the left and right,
|
||||
w = 2 means the second zero, and so on; we normally choose w to be at least two.
|
||||
We call this num_zeros, not w, in the code.
|
||||
|
||||
Convolving the signal x(t) with this windowed filter h(t) = f(t)g(t) and evaluating the resulting
|
||||
signal s(t) at an arbitrary time t is easy: we have
|
||||
\f[ s(t) = 1/S \sum_n x_n h(t - n/S) \f].
|
||||
(note: the sign of t - n/S might be wrong, but it doesn't matter as the filter
|
||||
and window are symmetric).
|
||||
This is true for arbitrary values of t. What the class ArbitraryResample does
|
||||
is to allow you to evaluate the signal for specified values of t.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
Class ArbitraryResample allows you to resample a signal (assumed zero outside
|
||||
the sample region, not periodic) at arbitrary specified time values, which
|
||||
don't have to be linearly spaced. The low-pass filter cutoff
|
||||
"filter_cutoff_hz" should be less than half the sample rate;
|
||||
"num_zeros" should probably be at least two, preferably more; higher numbers give
|
||||
sharper filters but will be less efficient.
|
||||
*/
|
||||
class ArbitraryResample {
|
||||
public:
|
||||
ArbitraryResample(int32 num_samples_in,
|
||||
BaseFloat samp_rate_hz,
|
||||
BaseFloat filter_cutoff_hz,
|
||||
const Vector<BaseFloat> &sample_points_secs,
|
||||
int32 num_zeros);
|
||||
|
||||
int32 NumSamplesIn() const { return num_samples_in_; }
|
||||
|
||||
int32 NumSamplesOut() const { return weights_.size(); }
|
||||
|
||||
/// This function does the resampling.
|
||||
/// input.NumRows() and output.NumRows() should be equal
|
||||
/// and nonzero.
|
||||
/// input.NumCols() should equal NumSamplesIn()
|
||||
/// and output.NumCols() should equal NumSamplesOut().
|
||||
void Resample(const MatrixBase<BaseFloat> &input,
|
||||
MatrixBase<BaseFloat> *output) const;
|
||||
|
||||
/// This version of the Resample function processes just
|
||||
/// one vector.
|
||||
void Resample(const VectorBase<BaseFloat> &input,
|
||||
VectorBase<BaseFloat> *output) const;
|
||||
private:
|
||||
void SetIndexes(const Vector<BaseFloat> &sample_points);
|
||||
|
||||
void SetWeights(const Vector<BaseFloat> &sample_points);
|
||||
|
||||
BaseFloat FilterFunc(BaseFloat t) const;
|
||||
|
||||
int32 num_samples_in_;
|
||||
BaseFloat samp_rate_in_;
|
||||
BaseFloat filter_cutoff_;
|
||||
int32 num_zeros_;
|
||||
|
||||
std::vector<int32> first_index_; // The first input-sample index that we sum
|
||||
// over, for this output-sample index.
|
||||
std::vector<Vector<BaseFloat> > weights_;
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
LinearResample is a special case of ArbitraryResample, where we want to
|
||||
resample a signal at linearly spaced intervals (this means we want to
|
||||
upsample or downsample the signal). It is more efficient than
|
||||
ArbitraryResample because we can construct it just once.
|
||||
|
||||
We require that the input and output sampling rate be specified as
|
||||
integers, as this is an easy way to specify that their ratio be rational.
|
||||
*/
|
||||
|
||||
class LinearResample {
|
||||
public:
|
||||
/// Constructor. We make the input and output sample rates integers, because
|
||||
/// we are going to need to find a common divisor. This should just remind
|
||||
/// you that they need to be integers. The filter cutoff needs to be less
|
||||
/// than samp_rate_in_hz/2 and less than samp_rate_out_hz/2. num_zeros
|
||||
/// controls the sharpness of the filter, more == sharper but less efficient.
|
||||
/// We suggest around 4 to 10 for normal use.
|
||||
LinearResample(int32 samp_rate_in_hz,
|
||||
int32 samp_rate_out_hz,
|
||||
BaseFloat filter_cutoff_hz,
|
||||
int32 num_zeros);
|
||||
|
||||
/// This function does the resampling. If you call it with flush == true and
|
||||
/// you have never called it with flush == false, it just resamples the input
|
||||
/// signal (it resizes the output to a suitable number of samples).
|
||||
///
|
||||
/// You can also use this function to process a signal a piece at a time.
|
||||
/// suppose you break it into piece1, piece2, ... pieceN. You can call
|
||||
/// \code{.cc}
|
||||
/// Resample(piece1, false, &output1);
|
||||
/// Resample(piece2, false, &output2);
|
||||
/// Resample(piece3, true, &output3);
|
||||
/// \endcode
|
||||
/// If you call it with flush == false, it won't output the last few samples
|
||||
/// but will remember them, so that if you later give it a second piece of
|
||||
/// the input signal it can process it correctly.
|
||||
/// If your most recent call to the object was with flush == false, it will
|
||||
/// have internal state; you can remove this by calling Reset().
|
||||
/// Empty input is acceptable.
|
||||
void Resample(const VectorBase<BaseFloat> &input,
|
||||
bool flush,
|
||||
Vector<BaseFloat> *output);
|
||||
|
||||
/// Calling the function Reset() resets the state of the object prior to
|
||||
/// processing a new signal; it is only necessary if you have called
|
||||
/// Resample(x, false, y) for some signal, leading to a remainder of the
|
||||
/// signal being stored, but then abandon processing the signal before calling
|
||||
/// Resample(x, true, y) for the last piece. Calling it unnecessarily between
|
||||
/// signals will not do any harm.
|
||||
void Reset();
|
||||
|
||||
//// Return the input and output sampling rates (for checks, for example)
|
||||
inline int32 GetInputSamplingRate() { return samp_rate_in_; }
|
||||
inline int32 GetOutputSamplingRate() { return samp_rate_out_; }
|
||||
private:
|
||||
/// This function outputs the number of output samples we will output
|
||||
/// for a signal with "input_num_samp" input samples. If flush == true,
|
||||
/// we return the largest n such that
|
||||
/// (n/samp_rate_out_) is in the interval [ 0, input_num_samp/samp_rate_in_ ),
|
||||
/// and note that the interval is half-open. If flush == false,
|
||||
/// define window_width as num_zeros / (2.0 * filter_cutoff_);
|
||||
/// we return the largest n such that (n/samp_rate_out_) is in the interval
|
||||
/// [ 0, input_num_samp/samp_rate_in_ - window_width ).
|
||||
int64 GetNumOutputSamples(int64 input_num_samp, bool flush) const;
|
||||
|
||||
|
||||
/// Given an output-sample index, this function outputs to *first_samp_in the
|
||||
/// first input-sample index that we have a weight on (may be negative),
|
||||
/// and to *samp_out_wrapped the index into weights_ where we can get the
|
||||
/// corresponding weights on the input.
|
||||
inline void GetIndexes(int64 samp_out,
|
||||
int64 *first_samp_in,
|
||||
int32 *samp_out_wrapped) const;
|
||||
|
||||
void SetRemainder(const VectorBase<BaseFloat> &input);
|
||||
|
||||
void SetIndexesAndWeights();
|
||||
|
||||
BaseFloat FilterFunc(BaseFloat) const;
|
||||
|
||||
// The following variables are provided by the user.
|
||||
int32 samp_rate_in_;
|
||||
int32 samp_rate_out_;
|
||||
BaseFloat filter_cutoff_;
|
||||
int32 num_zeros_;
|
||||
|
||||
int32 input_samples_in_unit_; ///< The number of input samples in the
|
||||
///< smallest repeating unit: input_samples_in_unit_ =
|
||||
///< samp_rate_in_hz / Gcd(samp_rate_in_hz,
|
||||
///< samp_rate_out_hz)
|
||||
int32 output_samples_in_unit_; ///< The number of output samples in the
|
||||
///< smallest repeating unit: output_samples_in_unit_ =
|
||||
///< samp_rate_out_hz / Gcd(samp_rate_in_hz,
|
||||
///< samp_rate_out_hz)
|
||||
|
||||
|
||||
/// The first input-sample index that we sum over, for this output-sample
|
||||
/// index. May be negative; any truncation at the beginning is handled
|
||||
/// separately. This is just for the first few output samples, but we can
|
||||
/// extrapolate the correct input-sample index for arbitrary output samples.
|
||||
std::vector<int32> first_index_;
|
||||
|
||||
/// Weights on the input samples, for this output-sample index.
|
||||
std::vector<Vector<BaseFloat> > weights_;
|
||||
|
||||
// the following variables keep track of where we are in a particular signal,
|
||||
// if it is being provided over multiple calls to Resample().
|
||||
|
||||
int64 input_sample_offset_; ///< The number of input samples we have
|
||||
///< already received for this signal
|
||||
///< (including anything in remainder_)
|
||||
int64 output_sample_offset_; ///< The number of samples we have already
|
||||
///< output for this signal.
|
||||
Vector<BaseFloat> input_remainder_; ///< A small trailing part of the
|
||||
///< previously seen input signal.
|
||||
};
|
||||
|
||||
/**
|
||||
Downsample or upsample a waveform. This is a convenience wrapper for the
|
||||
class 'LinearResample'.
|
||||
The low-pass filter cutoff used in 'LinearResample' is 0.99 of the Nyquist,
|
||||
where the Nyquist is half of the minimum of (orig_freq, new_freq). The
|
||||
resampling is done with a symmetric FIR filter with N_z (number of zeros)
|
||||
as 6.
|
||||
|
||||
We compared the downsampling results with those from the sox resampling
|
||||
toolkit.
|
||||
Sox's design is inspired by Laurent De Soras' paper,
|
||||
https://ccrma.stanford.edu/~jos/resample/Implementation.html
|
||||
|
||||
Note: we expect that while orig_freq and new_freq are of type BaseFloat, they
|
||||
are actually required to have exact integer values (like 16000 or 8000) with
|
||||
a ratio between them that can be expressed as a rational number with
|
||||
reasonably small integer factors.
|
||||
*/
|
||||
void ResampleWaveform(BaseFloat orig_freq, const VectorBase<BaseFloat> &wave,
|
||||
BaseFloat new_freq, Vector<BaseFloat> *new_wave);
|
||||
|
||||
|
||||
/// This function is deprecated. It is provided for backward compatibility, to avoid
|
||||
/// breaking older code.
|
||||
inline void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase<BaseFloat> &wave,
|
||||
BaseFloat new_freq, Vector<BaseFloat> *new_wave) {
|
||||
ResampleWaveform(orig_freq, wave, new_freq, new_wave);
|
||||
}
|
||||
|
||||
|
||||
/// @} End of "addtogroup feat"
|
||||
} // namespace kaldi
|
||||
#endif // KALDI_FEAT_RESAMPLE_H_
|
@ -1,129 +0,0 @@
|
||||
// feat/signal.cc
|
||||
|
||||
// Copyright 2015 Tom Ko
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "feat/signal.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
void ElementwiseProductOfFft(const Vector<BaseFloat> &a, Vector<BaseFloat> *b) {
|
||||
int32 num_fft_bins = a.Dim() / 2;
|
||||
for (int32 i = 0; i < num_fft_bins; i++) {
|
||||
// do complex multiplication
|
||||
ComplexMul(a(2*i), a(2*i + 1), &((*b)(2*i)), &((*b)(2*i + 1)));
|
||||
}
|
||||
}
|
||||
|
||||
void ConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
|
||||
int32 signal_length = signal->Dim();
|
||||
int32 filter_length = filter.Dim();
|
||||
int32 output_length = signal_length + filter_length - 1;
|
||||
Vector<BaseFloat> signal_padded(output_length);
|
||||
signal_padded.SetZero();
|
||||
for (int32 i = 0; i < signal_length; i++) {
|
||||
for (int32 j = 0; j < filter_length; j++) {
|
||||
signal_padded(i + j) += (*signal)(i) * filter(j);
|
||||
}
|
||||
}
|
||||
signal->Resize(output_length);
|
||||
signal->CopyFromVec(signal_padded);
|
||||
}
|
||||
|
||||
|
||||
void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
|
||||
int32 signal_length = signal->Dim();
|
||||
int32 filter_length = filter.Dim();
|
||||
int32 output_length = signal_length + filter_length - 1;
|
||||
|
||||
int32 fft_length = RoundUpToNearestPowerOfTwo(output_length);
|
||||
KALDI_VLOG(1) << "fft_length for full signal convolution is " << fft_length;
|
||||
|
||||
SplitRadixRealFft<BaseFloat> srfft(fft_length);
|
||||
|
||||
Vector<BaseFloat> filter_padded(fft_length);
|
||||
filter_padded.Range(0, filter_length).CopyFromVec(filter);
|
||||
srfft.Compute(filter_padded.Data(), true);
|
||||
|
||||
Vector<BaseFloat> signal_padded(fft_length);
|
||||
signal_padded.Range(0, signal_length).CopyFromVec(*signal);
|
||||
srfft.Compute(signal_padded.Data(), true);
|
||||
|
||||
ElementwiseProductOfFft(filter_padded, &signal_padded);
|
||||
|
||||
srfft.Compute(signal_padded.Data(), false);
|
||||
signal_padded.Scale(1.0 / fft_length);
|
||||
|
||||
signal->Resize(output_length);
|
||||
signal->CopyFromVec(signal_padded.Range(0, output_length));
|
||||
}
|
||||
|
||||
void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
|
||||
int32 signal_length = signal->Dim();
|
||||
int32 filter_length = filter.Dim();
|
||||
int32 output_length = signal_length + filter_length - 1;
|
||||
signal->Resize(output_length, kCopyData);
|
||||
|
||||
KALDI_VLOG(1) << "Length of the filter is " << filter_length;
|
||||
|
||||
int32 fft_length = RoundUpToNearestPowerOfTwo(4 * filter_length);
|
||||
KALDI_VLOG(1) << "Best FFT length is " << fft_length;
|
||||
|
||||
int32 block_length = fft_length - filter_length + 1;
|
||||
KALDI_VLOG(1) << "Block size is " << block_length;
|
||||
SplitRadixRealFft<BaseFloat> srfft(fft_length);
|
||||
|
||||
Vector<BaseFloat> filter_padded(fft_length);
|
||||
filter_padded.Range(0, filter_length).CopyFromVec(filter);
|
||||
srfft.Compute(filter_padded.Data(), true);
|
||||
|
||||
Vector<BaseFloat> temp_pad(filter_length - 1);
|
||||
temp_pad.SetZero();
|
||||
Vector<BaseFloat> signal_block_padded(fft_length);
|
||||
|
||||
for (int32 po = 0; po < output_length; po += block_length) {
|
||||
// get a block of the signal
|
||||
int32 process_length = std::min(block_length, output_length - po);
|
||||
signal_block_padded.SetZero();
|
||||
signal_block_padded.Range(0, process_length).CopyFromVec(signal->Range(po, process_length));
|
||||
|
||||
srfft.Compute(signal_block_padded.Data(), true);
|
||||
|
||||
ElementwiseProductOfFft(filter_padded, &signal_block_padded);
|
||||
|
||||
srfft.Compute(signal_block_padded.Data(), false);
|
||||
signal_block_padded.Scale(1.0 / fft_length);
|
||||
|
||||
// combine the block
|
||||
if (po + block_length < output_length) { // current block is not the last block
|
||||
signal->Range(po, block_length).CopyFromVec(signal_block_padded.Range(0, block_length));
|
||||
signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad);
|
||||
temp_pad.CopyFromVec(signal_block_padded.Range(block_length, filter_length - 1));
|
||||
} else {
|
||||
signal->Range(po, output_length - po).CopyFromVec(
|
||||
signal_block_padded.Range(0, output_length - po));
|
||||
if (filter_length - 1 < output_length - po)
|
||||
signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad);
|
||||
else
|
||||
signal->Range(po, output_length - po).AddVec(1.0, temp_pad.Range(0, output_length - po));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,58 +0,0 @@
|
||||
// feat/signal.h
|
||||
|
||||
// Copyright 2015 Tom Ko
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_FEAT_SIGNAL_H_
|
||||
#define KALDI_FEAT_SIGNAL_H_
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*
|
||||
The following three functions have the same functionality but
|
||||
different implementations, and hence different efficiency. After the convolution,
|
||||
the length of the signal will be extended to (original signal length +
|
||||
filter length - 1).
|
||||
*/
|
||||
|
||||
/*
|
||||
This function implements a simple non-FFT-based convolution of two signals.
|
||||
It is suggested to use the FFT-based convolution function which is more
|
||||
efficient.
|
||||
*/
|
||||
void ConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
|
||||
|
||||
/*
|
||||
This function implements FFT-based convolution of two signals.
|
||||
However, this is expected to be less efficient than FFTbasedBlockConvolveSignals()
|
||||
as it processes the entire signal with a single FFT.
|
||||
*/
|
||||
void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
|
||||
|
||||
/*
|
||||
This function implements FFT-based block convolution of two signals using
|
||||
overlap-add method. This is an efficient way to evaluate the discrete
|
||||
convolution of a long signal with a finite impulse response filter.
|
||||
*/
|
||||
void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
#endif // KALDI_FEAT_SIGNAL_H_
|
@ -1,16 +0,0 @@
|
||||
|
||||
# Sources that make up the kaldi-matrix library.
add_library(kaldi-matrix
    compressed-matrix.cc
    kaldi-matrix.cc
    kaldi-vector.cc
    matrix-functions.cc
    optimization.cc
    packed-matrix.cc
    qr.cc
    sparse-matrix.cc
    sp-matrix.cc
    srfft.cc
    tp-matrix.cc
)

# Libraries kaldi-matrix is linked against.
target_link_libraries(kaldi-matrix gfortran kaldi-base libopenblas.a)
|
@ -1,491 +0,0 @@
|
||||
// matrix/cblas-wrappers.h
|
||||
|
||||
// Copyright 2012 Johns Hopkins University (author: Daniel Povey);
|
||||
// Haihua Xu; Wei Shi
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#ifndef KALDI_MATRIX_CBLAS_WRAPPERS_H_
|
||||
#define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1
|
||||
|
||||
|
||||
#include <limits>
|
||||
#include "matrix/sp-matrix.h"
|
||||
#include "matrix/kaldi-vector.h"
|
||||
#include "matrix/kaldi-matrix.h"
|
||||
#include "matrix/matrix-functions.h"
|
||||
#include "matrix/kaldi-blas.h"
|
||||
|
||||
// Do not include this file directly. It is to be included
|
||||
// by .cc files in this directory.
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
inline void cblas_Xcopy(const int N, const float *X, const int incX, float *Y,
|
||||
const int incY) {
|
||||
cblas_scopy(N, X, incX, Y, incY);
|
||||
}
|
||||
|
||||
inline void cblas_Xcopy(const int N, const double *X, const int incX, double *Y,
|
||||
const int incY) {
|
||||
cblas_dcopy(N, X, incX, Y, incY);
|
||||
}
|
||||
|
||||
|
||||
inline float cblas_Xasum(const int N, const float *X, const int incX) {
|
||||
return cblas_sasum(N, X, incX);
|
||||
}
|
||||
|
||||
inline double cblas_Xasum(const int N, const double *X, const int incX) {
|
||||
return cblas_dasum(N, X, incX);
|
||||
}
|
||||
|
||||
inline void cblas_Xrot(const int N, float *X, const int incX, float *Y,
|
||||
const int incY, const float c, const float s) {
|
||||
cblas_srot(N, X, incX, Y, incY, c, s);
|
||||
}
|
||||
inline void cblas_Xrot(const int N, double *X, const int incX, double *Y,
|
||||
const int incY, const double c, const double s) {
|
||||
cblas_drot(N, X, incX, Y, incY, c, s);
|
||||
}
|
||||
inline float cblas_Xdot(const int N, const float *const X,
|
||||
const int incX, const float *const Y,
|
||||
const int incY) {
|
||||
return cblas_sdot(N, X, incX, Y, incY);
|
||||
}
|
||||
inline double cblas_Xdot(const int N, const double *const X,
|
||||
const int incX, const double *const Y,
|
||||
const int incY) {
|
||||
return cblas_ddot(N, X, incX, Y, incY);
|
||||
}
|
||||
inline void cblas_Xaxpy(const int N, const float alpha, const float *X,
|
||||
const int incX, float *Y, const int incY) {
|
||||
cblas_saxpy(N, alpha, X, incX, Y, incY);
|
||||
}
|
||||
inline void cblas_Xaxpy(const int N, const double alpha, const double *X,
|
||||
const int incX, double *Y, const int incY) {
|
||||
cblas_daxpy(N, alpha, X, incX, Y, incY);
|
||||
}
|
||||
inline void cblas_Xscal(const int N, const float alpha, float *data,
|
||||
const int inc) {
|
||||
cblas_sscal(N, alpha, data, inc);
|
||||
}
|
||||
inline void cblas_Xscal(const int N, const double alpha, double *data,
|
||||
const int inc) {
|
||||
cblas_dscal(N, alpha, data, inc);
|
||||
}
|
||||
inline void cblas_Xspmv(const float alpha, const int num_rows, const float *Mdata,
|
||||
const float *v, const int v_inc,
|
||||
const float beta, float *y, const int y_inc) {
|
||||
cblas_sspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc);
|
||||
}
|
||||
inline void cblas_Xspmv(const double alpha, const int num_rows, const double *Mdata,
|
||||
const double *v, const int v_inc,
|
||||
const double beta, double *y, const int y_inc) {
|
||||
cblas_dspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc);
|
||||
}
|
||||
inline void cblas_Xtpmv(MatrixTransposeType trans, const float *Mdata,
|
||||
const int num_rows, float *y, const int y_inc) {
|
||||
cblas_stpmv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
|
||||
CblasNonUnit, num_rows, Mdata, y, y_inc);
|
||||
}
|
||||
inline void cblas_Xtpmv(MatrixTransposeType trans, const double *Mdata,
|
||||
const int num_rows, double *y, const int y_inc) {
|
||||
cblas_dtpmv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
|
||||
CblasNonUnit, num_rows, Mdata, y, y_inc);
|
||||
}
|
||||
|
||||
|
||||
inline void cblas_Xtpsv(MatrixTransposeType trans, const float *Mdata,
|
||||
const int num_rows, float *y, const int y_inc) {
|
||||
cblas_stpsv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
|
||||
CblasNonUnit, num_rows, Mdata, y, y_inc);
|
||||
}
|
||||
inline void cblas_Xtpsv(MatrixTransposeType trans, const double *Mdata,
|
||||
const int num_rows, double *y, const int y_inc) {
|
||||
cblas_dtpsv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
|
||||
CblasNonUnit, num_rows, Mdata, y, y_inc);
|
||||
}
|
||||
|
||||
// x = alpha * M * y + beta * x
|
||||
inline void cblas_Xspmv(MatrixIndexT dim, float alpha, const float *Mdata,
|
||||
const float *ydata, MatrixIndexT ystride,
|
||||
float beta, float *xdata, MatrixIndexT xstride) {
|
||||
cblas_sspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
|
||||
ydata, ystride, beta, xdata, xstride);
|
||||
}
|
||||
inline void cblas_Xspmv(MatrixIndexT dim, double alpha, const double *Mdata,
|
||||
const double *ydata, MatrixIndexT ystride,
|
||||
double beta, double *xdata, MatrixIndexT xstride) {
|
||||
cblas_dspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
|
||||
ydata, ystride, beta, xdata, xstride);
|
||||
}
|
||||
|
||||
// Implements A += alpha * (x y' + y x'); A is symmetric matrix.
|
||||
inline void cblas_Xspr2(MatrixIndexT dim, float alpha, const float *Xdata,
|
||||
MatrixIndexT incX, const float *Ydata, MatrixIndexT incY,
|
||||
float *Adata) {
|
||||
cblas_sspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata,
|
||||
incX, Ydata, incY, Adata);
|
||||
}
|
||||
inline void cblas_Xspr2(MatrixIndexT dim, double alpha, const double *Xdata,
|
||||
MatrixIndexT incX, const double *Ydata, MatrixIndexT incY,
|
||||
double *Adata) {
|
||||
cblas_dspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata,
|
||||
incX, Ydata, incY, Adata);
|
||||
}
|
||||
|
||||
// Implements A += alpha * (x x'); A is symmetric matrix.
|
||||
inline void cblas_Xspr(MatrixIndexT dim, float alpha, const float *Xdata,
|
||||
MatrixIndexT incX, float *Adata) {
|
||||
cblas_sspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
|
||||
}
|
||||
inline void cblas_Xspr(MatrixIndexT dim, double alpha, const double *Xdata,
|
||||
MatrixIndexT incX, double *Adata) {
|
||||
cblas_dspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
|
||||
}
|
||||
|
||||
// sgemv,dgemv: y = alpha M x + beta y.
|
||||
inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
|
||||
MatrixIndexT num_cols, float alpha, const float *Mdata,
|
||||
MatrixIndexT stride, const float *xdata,
|
||||
MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) {
|
||||
cblas_sgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
|
||||
num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
|
||||
}
|
||||
inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
|
||||
MatrixIndexT num_cols, double alpha, const double *Mdata,
|
||||
MatrixIndexT stride, const double *xdata,
|
||||
MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) {
|
||||
cblas_dgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
|
||||
num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
|
||||
}
|
||||
|
||||
// sgbmv, dgmmv: y = alpha M x + + beta * y.
|
||||
inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
|
||||
MatrixIndexT num_cols, MatrixIndexT num_below,
|
||||
MatrixIndexT num_above, float alpha, const float *Mdata,
|
||||
MatrixIndexT stride, const float *xdata,
|
||||
MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) {
|
||||
cblas_sgbmv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
|
||||
num_cols, num_below, num_above, alpha, Mdata, stride, xdata,
|
||||
incX, beta, ydata, incY);
|
||||
}
|
||||
inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
|
||||
MatrixIndexT num_cols, MatrixIndexT num_below,
|
||||
MatrixIndexT num_above, double alpha, const double *Mdata,
|
||||
MatrixIndexT stride, const double *xdata,
|
||||
MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) {
|
||||
cblas_dgbmv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
|
||||
num_cols, num_below, num_above, alpha, Mdata, stride, xdata,
|
||||
incX, beta, ydata, incY);
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
inline void Xgemv_sparsevec(MatrixTransposeType trans, MatrixIndexT num_rows,
|
||||
MatrixIndexT num_cols, Real alpha, const Real *Mdata,
|
||||
MatrixIndexT stride, const Real *xdata,
|
||||
MatrixIndexT incX, Real beta, Real *ydata,
|
||||
MatrixIndexT incY) {
|
||||
if (trans == kNoTrans) {
|
||||
if (beta != 1.0) cblas_Xscal(num_rows, beta, ydata, incY);
|
||||
for (MatrixIndexT i = 0; i < num_cols; i++) {
|
||||
Real x_i = xdata[i * incX];
|
||||
if (x_i == 0.0) continue;
|
||||
// Add to ydata, the i'th column of M, times alpha * x_i
|
||||
cblas_Xaxpy(num_rows, x_i * alpha, Mdata + i, stride, ydata, incY);
|
||||
}
|
||||
} else {
|
||||
if (beta != 1.0) cblas_Xscal(num_cols, beta, ydata, incY);
|
||||
for (MatrixIndexT i = 0; i < num_rows; i++) {
|
||||
Real x_i = xdata[i * incX];
|
||||
if (x_i == 0.0) continue;
|
||||
// Add to ydata, the i'th row of M, times alpha * x_i
|
||||
cblas_Xaxpy(num_cols, x_i * alpha,
|
||||
Mdata + (i * stride), 1, ydata, incY);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void cblas_Xgemm(const float alpha,
|
||||
MatrixTransposeType transA,
|
||||
const float *Adata,
|
||||
MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
|
||||
MatrixTransposeType transB,
|
||||
const float *Bdata, MatrixIndexT b_stride,
|
||||
const float beta,
|
||||
float *Mdata,
|
||||
MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
|
||||
cblas_sgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
|
||||
static_cast<CBLAS_TRANSPOSE>(transB),
|
||||
num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
|
||||
alpha, Adata, a_stride, Bdata, b_stride,
|
||||
beta, Mdata, stride);
|
||||
}
|
||||
inline void cblas_Xgemm(const double alpha,
|
||||
MatrixTransposeType transA,
|
||||
const double *Adata,
|
||||
MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
|
||||
MatrixTransposeType transB,
|
||||
const double *Bdata, MatrixIndexT b_stride,
|
||||
const double beta,
|
||||
double *Mdata,
|
||||
MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
|
||||
cblas_dgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
|
||||
static_cast<CBLAS_TRANSPOSE>(transB),
|
||||
num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
|
||||
alpha, Adata, a_stride, Bdata, b_stride,
|
||||
beta, Mdata, stride);
|
||||
}
|
||||
|
||||
|
||||
inline void cblas_Xsymm(const float alpha,
|
||||
MatrixIndexT sz,
|
||||
const float *Adata,MatrixIndexT a_stride,
|
||||
const float *Bdata,MatrixIndexT b_stride,
|
||||
const float beta,
|
||||
float *Mdata, MatrixIndexT stride) {
|
||||
cblas_ssymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata,
|
||||
a_stride, Bdata, b_stride, beta, Mdata, stride);
|
||||
}
|
||||
inline void cblas_Xsymm(const double alpha,
|
||||
MatrixIndexT sz,
|
||||
const double *Adata,MatrixIndexT a_stride,
|
||||
const double *Bdata,MatrixIndexT b_stride,
|
||||
const double beta,
|
||||
double *Mdata, MatrixIndexT stride) {
|
||||
cblas_dsymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata,
|
||||
a_stride, Bdata, b_stride, beta, Mdata, stride);
|
||||
}
|
||||
// ger: M += alpha x y^T.
|
||||
inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, float alpha,
|
||||
const float *xdata, MatrixIndexT incX, const float *ydata,
|
||||
MatrixIndexT incY, float *Mdata, MatrixIndexT stride) {
|
||||
cblas_sger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1,
|
||||
Mdata, stride);
|
||||
}
|
||||
inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, double alpha,
|
||||
const double *xdata, MatrixIndexT incX, const double *ydata,
|
||||
MatrixIndexT incY, double *Mdata, MatrixIndexT stride) {
|
||||
cblas_dger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1,
|
||||
Mdata, stride);
|
||||
}
|
||||
|
||||
// syrk: symmetric rank-k update.
|
||||
// if trans==kNoTrans, then C = alpha A A^T + beta C
|
||||
// else C = alpha A^T A + beta C.
|
||||
// note: dim_c is dim(C), other_dim_a is the "other" dimension of A, i.e.
|
||||
// num-cols(A) if kNoTrans, or num-rows(A) if kTrans.
|
||||
// We only need the row-major and lower-triangular option of this, and this
|
||||
// is hard-coded.
|
||||
inline void cblas_Xsyrk (
|
||||
const MatrixTransposeType trans, const MatrixIndexT dim_c,
|
||||
const MatrixIndexT other_dim_a, const float alpha, const float *A,
|
||||
const MatrixIndexT a_stride, const float beta, float *C,
|
||||
const MatrixIndexT c_stride) {
|
||||
cblas_ssyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
|
||||
dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
|
||||
}
|
||||
|
||||
inline void cblas_Xsyrk(
|
||||
const MatrixTransposeType trans, const MatrixIndexT dim_c,
|
||||
const MatrixIndexT other_dim_a, const double alpha, const double *A,
|
||||
const MatrixIndexT a_stride, const double beta, double *C,
|
||||
const MatrixIndexT c_stride) {
|
||||
cblas_dsyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
|
||||
dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
|
||||
}
|
||||
|
||||
/// matrix-vector multiply using a banded matrix; we always call this
|
||||
/// with b = 1 meaning we're multiplying by a diagonal matrix. This is used for
|
||||
/// elementwise multiplication. We miss some of the arguments out of this
|
||||
/// wrapper.
|
||||
inline void cblas_Xsbmv1(
|
||||
const MatrixIndexT dim,
|
||||
const double *A,
|
||||
const double alpha,
|
||||
const double *x,
|
||||
const double beta,
|
||||
double *y) {
|
||||
cblas_dsbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
|
||||
1, x, 1, beta, y, 1);
|
||||
}
|
||||
|
||||
inline void cblas_Xsbmv1(
|
||||
const MatrixIndexT dim,
|
||||
const float *A,
|
||||
const float alpha,
|
||||
const float *x,
|
||||
const float beta,
|
||||
float *y) {
|
||||
cblas_ssbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
|
||||
1, x, 1, beta, y, 1);
|
||||
}
|
||||
|
||||
/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could
|
||||
/// extend this somehow.
|
||||
inline void mul_elements(
|
||||
const MatrixIndexT dim,
|
||||
const double *a,
|
||||
double *b) { // does b *= a, elementwise.
|
||||
double c1, c2, c3, c4;
|
||||
MatrixIndexT i;
|
||||
for (i = 0; i + 4 <= dim; i += 4) {
|
||||
c1 = a[i] * b[i];
|
||||
c2 = a[i+1] * b[i+1];
|
||||
c3 = a[i+2] * b[i+2];
|
||||
c4 = a[i+3] * b[i+3];
|
||||
b[i] = c1;
|
||||
b[i+1] = c2;
|
||||
b[i+2] = c3;
|
||||
b[i+3] = c4;
|
||||
}
|
||||
for (; i < dim; i++)
|
||||
b[i] *= a[i];
|
||||
}
|
||||
|
||||
inline void mul_elements(
|
||||
const MatrixIndexT dim,
|
||||
const float *a,
|
||||
float *b) { // does b *= a, elementwise.
|
||||
float c1, c2, c3, c4;
|
||||
MatrixIndexT i;
|
||||
for (i = 0; i + 4 <= dim; i += 4) {
|
||||
c1 = a[i] * b[i];
|
||||
c2 = a[i+1] * b[i+1];
|
||||
c3 = a[i+2] * b[i+2];
|
||||
c4 = a[i+3] * b[i+3];
|
||||
b[i] = c1;
|
||||
b[i+1] = c2;
|
||||
b[i+2] = c3;
|
||||
b[i+3] = c4;
|
||||
}
|
||||
for (; i < dim; i++)
|
||||
b[i] *= a[i];
|
||||
}
|
||||
|
||||
|
||||
|
||||
// add clapack here
|
||||
#if !defined(HAVE_ATLAS)
|
||||
inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *result) {
|
||||
stptri_(const_cast<char *>("U"), const_cast<char *>("N"), num_rows, Mdata, result);
|
||||
}
|
||||
inline void clapack_Xtptri(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *result) {
|
||||
dtptri_(const_cast<char *>("U"), const_cast<char *>("N"), num_rows, Mdata, result);
|
||||
}
|
||||
//
|
||||
inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols,
|
||||
float *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot,
|
||||
KaldiBlasInt *result) {
|
||||
sgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
|
||||
}
|
||||
inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols,
|
||||
double *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot,
|
||||
KaldiBlasInt *result) {
|
||||
dgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
|
||||
}
|
||||
|
||||
//
|
||||
inline void clapack_Xgetri2(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride,
|
||||
KaldiBlasInt *pivot, float *p_work,
|
||||
KaldiBlasInt *l_work, KaldiBlasInt *result) {
|
||||
sgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
|
||||
}
|
||||
inline void clapack_Xgetri2(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride,
|
||||
KaldiBlasInt *pivot, double *p_work,
|
||||
KaldiBlasInt *l_work, KaldiBlasInt *result) {
|
||||
dgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
|
||||
}
|
||||
//
|
||||
inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols,
|
||||
KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride,
|
||||
float *sv, float *Vdata, KaldiBlasInt *vstride,
|
||||
float *Udata, KaldiBlasInt *ustride, float *p_work,
|
||||
KaldiBlasInt *l_work, KaldiBlasInt *result) {
|
||||
sgesvd_(v, u,
|
||||
num_cols, num_rows, Mdata, stride,
|
||||
sv, Vdata, vstride, Udata, ustride,
|
||||
p_work, l_work, result);
|
||||
}
|
||||
inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols,
|
||||
KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride,
|
||||
double *sv, double *Vdata, KaldiBlasInt *vstride,
|
||||
double *Udata, KaldiBlasInt *ustride, double *p_work,
|
||||
KaldiBlasInt *l_work, KaldiBlasInt *result) {
|
||||
dgesvd_(v, u,
|
||||
num_cols, num_rows, Mdata, stride,
|
||||
sv, Vdata, vstride, Udata, ustride,
|
||||
p_work, l_work, result);
|
||||
}
|
||||
//
|
||||
void inline clapack_Xsptri(KaldiBlasInt *num_rows, float *Mdata,
|
||||
KaldiBlasInt *ipiv, float *work, KaldiBlasInt *result) {
|
||||
ssptri_(const_cast<char *>("U"), num_rows, Mdata, ipiv, work, result);
|
||||
}
|
||||
void inline clapack_Xsptri(KaldiBlasInt *num_rows, double *Mdata,
|
||||
KaldiBlasInt *ipiv, double *work, KaldiBlasInt *result) {
|
||||
dsptri_(const_cast<char *>("U"), num_rows, Mdata, ipiv, work, result);
|
||||
}
|
||||
//
|
||||
void inline clapack_Xsptrf(KaldiBlasInt *num_rows, float *Mdata,
|
||||
KaldiBlasInt *ipiv, KaldiBlasInt *result) {
|
||||
ssptrf_(const_cast<char *>("U"), num_rows, Mdata, ipiv, result);
|
||||
}
|
||||
// Double-precision overload: calls the LAPACK routine dsptrf_ with the "U"
// flag as first argument and the remaining arguments passed through.
inline void clapack_Xsptrf(KaldiBlasInt *num_rows,
                           double *Mdata,
                           KaldiBlasInt *ipiv,
                           KaldiBlasInt *result) {
  // The string literal is const; the cast matches what the call expects.
  char *uplo = const_cast<char *>("U");
  dsptrf_(uplo, num_rows, Mdata, ipiv, result);
}
|
||||
#else
|
||||
// Single-precision overload: calls clapack_sgetrf in column-major mode and
// stores its return value in '*result'.
inline void clapack_Xgetrf(MatrixIndexT num_rows,
                           MatrixIndexT num_cols,
                           float *Mdata,
                           MatrixIndexT stride,
                           int *pivot,
                           int *result) {
  const int status = clapack_sgetrf(CblasColMajor, num_rows, num_cols,
                                    Mdata, stride, pivot);
  *result = status;
}
|
||||
|
||||
// Double-precision overload: calls clapack_dgetrf in column-major mode and
// stores its return value in '*result'.
inline void clapack_Xgetrf(MatrixIndexT num_rows,
                           MatrixIndexT num_cols,
                           double *Mdata,
                           MatrixIndexT stride,
                           int *pivot,
                           int *result) {
  const int status = clapack_dgetrf(CblasColMajor, num_rows, num_cols,
                                    Mdata, stride, pivot);
  *result = status;
}
|
||||
//
|
||||
inline int clapack_Xtrtri(int num_rows, float *Mdata, MatrixIndexT stride) {
|
||||
return clapack_strtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows,
|
||||
Mdata, stride);
|
||||
}
|
||||
|
||||
inline int clapack_Xtrtri(int num_rows, double *Mdata, MatrixIndexT stride) {
|
||||
return clapack_dtrtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows,
|
||||
Mdata, stride);
|
||||
}
|
||||
//
|
||||
// Single-precision overload: calls clapack_sgetri in column-major mode and
// stores its return value in '*result'.
inline void clapack_Xgetri(MatrixIndexT num_rows,
                           float *Mdata,
                           MatrixIndexT stride,
                           int *pivot,
                           int *result) {
  const int status =
      clapack_sgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
  *result = status;
}
|
||||
// Double-precision overload: calls clapack_dgetri in column-major mode and
// stores its return value in '*result'.
inline void clapack_Xgetri(MatrixIndexT num_rows,
                           double *Mdata,
                           MatrixIndexT stride,
                           int *pivot,
                           int *result) {
  const int status =
      clapack_dgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
  *result = status;
}
|
||||
#endif
|
||||
|
||||
}
|
||||
// namespace kaldi
|
||||
|
||||
#endif
|
@ -1,876 +0,0 @@
|
||||
// matrix/compressed-matrix.cc
|
||||
|
||||
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
|
||||
// Frantisek Skala, Wei Shi
|
||||
// 2015 Tom Ko
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "matrix/compressed-matrix.h"
|
||||
#include <algorithm>
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
//static
|
||||
// Returns the size in bytes (global header included) of the data for a
// compressed matrix with the format and dimensions recorded in 'header'.
MatrixIndexT CompressedMatrix::DataSize(const GlobalHeader &header) {
  DataFormat format = static_cast<DataFormat>(header.format);
  switch (format) {
    case kOneByteWithColHeaders:
      // Per column: one PerColHeader plus one byte for each row.
      return sizeof(GlobalHeader) +
          header.num_cols * (sizeof(PerColHeader) + header.num_rows);
    case kTwoByte:
      // Two bytes per element.
      return sizeof(GlobalHeader) +
          2 * header.num_rows * header.num_cols;
    default:
      // The only remaining valid format stores one byte per element.
      KALDI_ASSERT(format == kOneByte);
      return sizeof(GlobalHeader) +
          header.num_rows * header.num_cols;
  }
}
|
||||
|
||||
// scale all element of matrix by scaling floats
|
||||
// in GlobalHeader with alpha.
|
||||
void CompressedMatrix::Scale(float alpha) {
|
||||
if (data_ != NULL) {
|
||||
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
|
||||
// scale the floating point values in each PerColHolder
|
||||
// and leave all integers the same.
|
||||
h->min_value *= alpha;
|
||||
h->range *= alpha;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> // static inline
|
||||
void CompressedMatrix::ComputeGlobalHeader(
|
||||
const MatrixBase<Real> &mat, CompressionMethod method,
|
||||
GlobalHeader *header) {
|
||||
if (method == kAutomaticMethod) {
|
||||
if (mat.NumRows() > 8) method = kSpeechFeature;
|
||||
else method = kTwoByteAuto;
|
||||
}
|
||||
|
||||
switch (method) {
|
||||
case kSpeechFeature:
|
||||
header->format = static_cast<int32>(kOneByteWithColHeaders); // 1.
|
||||
break;
|
||||
case kTwoByteAuto: case kTwoByteSignedInteger:
|
||||
header->format = static_cast<int32>(kTwoByte); // 2.
|
||||
break;
|
||||
case kOneByteAuto: case kOneByteUnsignedInteger: case kOneByteZeroOne:
|
||||
header->format = static_cast<int32>(kOneByte); // 3.
|
||||
break;
|
||||
default:
|
||||
KALDI_ERR << "Invalid compression type: "
|
||||
<< static_cast<int32>(method);
|
||||
}
|
||||
|
||||
header->num_rows = mat.NumRows();
|
||||
header->num_cols = mat.NumCols();
|
||||
|
||||
// Now compute 'min_value' and 'range'.
|
||||
switch (method) {
|
||||
case kSpeechFeature: case kTwoByteAuto: case kOneByteAuto: {
|
||||
float min_value = mat.Min(), max_value = mat.Max();
|
||||
// ensure that max_value is strictly greater than min_value, even if matrix is
|
||||
// constant; this avoids crashes in ComputeColHeader when compressing speech
|
||||
// featupres.
|
||||
if (max_value == min_value)
|
||||
max_value = min_value + (1.0 + fabs(min_value));
|
||||
KALDI_ASSERT(min_value - min_value == 0 &&
|
||||
max_value - max_value == 0 &&
|
||||
"Cannot compress a matrix with Nan's or Inf's");
|
||||
|
||||
header->min_value = min_value;
|
||||
header->range = max_value - min_value;
|
||||
|
||||
// we previously checked that max_value != min_value, so their
|
||||
// difference should be nonzero.
|
||||
KALDI_ASSERT(header->range > 0.0);
|
||||
break;
|
||||
}
|
||||
case kTwoByteSignedInteger: {
|
||||
header->min_value = -32768.0;
|
||||
header->range = 65535.0;
|
||||
break;
|
||||
}
|
||||
case kOneByteUnsignedInteger: {
|
||||
header->min_value = 0.0;
|
||||
header->range = 255.0;
|
||||
break;
|
||||
}
|
||||
case kOneByteZeroOne: {
|
||||
header->min_value = 0.0;
|
||||
header->range = 1.0;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
KALDI_ERR << "Unknown compression method = "
|
||||
<< static_cast<int32>(method);
|
||||
}
|
||||
KALDI_COMPILE_TIME_ASSERT(sizeof(*header) == 20); // otherwise
|
||||
// something weird is happening and our code probably won't work or
|
||||
// won't be robust across platforms.
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CompressedMatrix::CopyFromMat(
|
||||
const MatrixBase<Real> &mat, CompressionMethod method) {
|
||||
if (data_ != NULL) {
|
||||
delete [] static_cast<float*>(data_); // call delete [] because was allocated with new float[]
|
||||
data_ = NULL;
|
||||
}
|
||||
if (mat.NumRows() == 0) { return; } // Zero-size matrix stored as zero pointer.
|
||||
|
||||
|
||||
GlobalHeader global_header;
|
||||
ComputeGlobalHeader(mat, method, &global_header);
|
||||
|
||||
int32 data_size = DataSize(global_header);
|
||||
|
||||
data_ = AllocateData(data_size);
|
||||
|
||||
*(reinterpret_cast<GlobalHeader*>(data_)) = global_header;
|
||||
|
||||
DataFormat format = static_cast<DataFormat>(global_header.format);
|
||||
if (format == kOneByteWithColHeaders) {
|
||||
PerColHeader *header_data =
|
||||
reinterpret_cast<PerColHeader*>(static_cast<char*>(data_) +
|
||||
sizeof(GlobalHeader));
|
||||
uint8 *byte_data =
|
||||
reinterpret_cast<uint8*>(header_data + global_header.num_cols);
|
||||
|
||||
const Real *matrix_data = mat.Data();
|
||||
|
||||
for (int32 col = 0; col < global_header.num_cols; col++) {
|
||||
CompressColumn(global_header,
|
||||
matrix_data + col, mat.Stride(),
|
||||
global_header.num_rows,
|
||||
header_data, byte_data);
|
||||
header_data++;
|
||||
byte_data += global_header.num_rows;
|
||||
}
|
||||
} else if (format == kTwoByte) {
|
||||
uint16 *data = reinterpret_cast<uint16*>(static_cast<char*>(data_) +
|
||||
sizeof(GlobalHeader));
|
||||
int32 num_rows = mat.NumRows(), num_cols = mat.NumCols();
|
||||
for (int32 r = 0; r < num_rows; r++) {
|
||||
const Real *row_data = mat.RowData(r);
|
||||
for (int32 c = 0; c < num_cols; c++)
|
||||
data[c] = FloatToUint16(global_header, row_data[c]);
|
||||
data += num_cols;
|
||||
}
|
||||
} else {
|
||||
KALDI_ASSERT(format == kOneByte);
|
||||
uint8 *data = reinterpret_cast<uint8*>(static_cast<char*>(data_) +
|
||||
sizeof(GlobalHeader));
|
||||
int32 num_rows = mat.NumRows(), num_cols = mat.NumCols();
|
||||
for (int32 r = 0; r < num_rows; r++) {
|
||||
const Real *row_data = mat.RowData(r);
|
||||
for (int32 c = 0; c < num_cols; c++)
|
||||
data[c] = FloatToUint8(global_header, row_data[c]);
|
||||
data += num_cols;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Instantiate the template for float and double.
|
||||
template
|
||||
void CompressedMatrix::CopyFromMat(const MatrixBase<float> &mat,
|
||||
CompressionMethod method);
|
||||
|
||||
template
|
||||
void CompressedMatrix::CopyFromMat(const MatrixBase<double> &mat,
|
||||
CompressionMethod method);
|
||||
|
||||
|
||||
// Sub-matrix constructor: copies the num_rows x num_cols block of 'cmat'
// starting at (row_offset, col_offset).  With allow_padding the row range may
// reach outside 'cmat'; such rows are filled from the first or last row.
// Columns must always lie inside 'cmat'.
CompressedMatrix::CompressedMatrix(
    const CompressedMatrix &cmat,
    const MatrixIndexT row_offset,
    const MatrixIndexT num_rows,
    const MatrixIndexT col_offset,
    const MatrixIndexT num_cols,
    bool allow_padding): data_(NULL) {
  int32 old_num_rows = cmat.NumRows(), old_num_cols = cmat.NumCols();

  if (old_num_rows == 0) {
    // The empty matrix is stored as a zero pointer.
    KALDI_ASSERT(num_rows == 0 && num_cols == 0);
    return;
  }

  KALDI_ASSERT(row_offset < old_num_rows);
  KALDI_ASSERT(col_offset < old_num_cols);
  KALDI_ASSERT(row_offset >= 0 || allow_padding);
  KALDI_ASSERT(col_offset >= 0);
  KALDI_ASSERT(row_offset + num_rows <= old_num_rows || allow_padding);
  KALDI_ASSERT(col_offset + num_cols <= old_num_cols);

  if (num_rows == 0 || num_cols == 0)
    return;

  const bool padding_is_used =
      (row_offset < 0 || row_offset + num_rows > old_num_rows);

  GlobalHeader *old_global_header =
      reinterpret_cast<GlobalHeader*>(cmat.Data());

  // The new header is the old one with the dimensions replaced.  The format
  // is not switched from 1 -> 2 here (in case of size reduction); if that is
  // needed it is done at the end via a temporary Matrix.
  GlobalHeader new_global_header = *old_global_header;
  KALDI_COMPILE_TIME_ASSERT(sizeof(new_global_header) == 20);
  new_global_header.num_cols = num_cols;
  new_global_header.num_rows = num_rows;
  new_global_header.format = old_global_header->format;

  data_ = AllocateData(DataSize(new_global_header));  // allocate memory
  GlobalHeader *new_header = reinterpret_cast<GlobalHeader*>(data_);
  *new_header = new_global_header;

  DataFormat format = static_cast<DataFormat>(old_global_header->format);
  if (format == kOneByteWithColHeaders) {
    PerColHeader *old_per_col_header =
        reinterpret_cast<PerColHeader*>(old_global_header + 1);
    uint8 *old_byte_data =
        reinterpret_cast<uint8*>(old_per_col_header +
                                 old_global_header->num_cols);
    PerColHeader *new_per_col_header =
        reinterpret_cast<PerColHeader*>(new_header + 1);

    // The selected columns keep their per-column headers.
    memcpy(new_per_col_header, old_per_col_header + col_offset,
           sizeof(PerColHeader) * num_cols);

    uint8 *dst_col = reinterpret_cast<uint8*>(new_per_col_header + num_cols);
    uint8 *src_col = old_byte_data + (col_offset * old_num_rows);
    for (int32 i = 0; i < num_cols; i++) {
      if (!padding_is_used) {
        // All requested rows exist: copy the column slice in one go.
        memcpy(dst_col, src_col + row_offset, num_rows);
      } else {
        // Clamp each requested row index into the valid range.
        for (int32 j = 0; j < num_rows; j++) {
          int32 old_j = j + row_offset;
          if (old_j < 0) old_j = 0;
          else if (old_j >= old_num_rows) old_j = old_num_rows - 1;
          dst_col[j] = src_col[old_j];
        }
      }
      dst_col += num_rows;
      src_col += old_num_rows;
    }
  } else if (format == kTwoByte) {
    const uint16 *old_data =
        reinterpret_cast<const uint16*>(old_global_header + 1);
    uint16 *new_row_data = reinterpret_cast<uint16*>(new_header + 1);

    for (int32 row = 0; row < num_rows; row++, new_row_data += num_cols) {
      // The clamping only has an effect if padding_is_used.
      int32 old_row = std::max<int32>(
          0, std::min<int32>(row + row_offset, old_num_rows - 1));
      const uint16 *old_row_data =
          old_data + col_offset + (old_num_cols * old_row);
      memcpy(new_row_data, old_row_data, sizeof(uint16) * num_cols);
    }
  } else {
    KALDI_ASSERT(format == kOneByte);
    const uint8 *old_data =
        reinterpret_cast<const uint8*>(old_global_header + 1);
    uint8 *new_row_data = reinterpret_cast<uint8*>(new_header + 1);

    for (int32 row = 0; row < num_rows; row++, new_row_data += num_cols) {
      // The clamping only has an effect if padding_is_used.
      int32 old_row = std::max<int32>(
          0, std::min<int32>(row + row_offset, old_num_rows - 1));
      const uint8 *old_row_data =
          old_data + col_offset + (old_num_cols * old_row);
      memcpy(new_row_data, old_row_data, sizeof(uint8) * num_cols);
    }
  }

  if (num_rows < 8 && format == kOneByteWithColHeaders) {
    // Format was 1 but we want it to be 2: uncompress into a temporary
    // Matrix, re-compress, and swap.  This gives almost exact reconstruction
    // while saving memory (elements take more space but there are no
    // per-column headers).
    Matrix<float> temp(this->NumRows(), this->NumCols(),
                       kUndefined);
    this->CopyToMat(&temp);
    CompressedMatrix temp_cmat(temp, kTwoByteAuto);
    this->Swap(&temp_cmat);
  }
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
CompressedMatrix &CompressedMatrix::operator =(const MatrixBase<Real> &mat) {
|
||||
this->CopyFromMat(mat);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Instantiate the template for float and double.
|
||||
template
|
||||
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<float> &mat);
|
||||
|
||||
template
|
||||
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<double> &mat);
|
||||
|
||||
// Maps 'value' to a 16-bit code: its position inside
// [min_value, min_value + range] is clamped to [0, 1] and scaled to 0..65535.
inline uint16 CompressedMatrix::FloatToUint16(
    const GlobalHeader &global_header,
    float value) {
  float f = (value - global_header.min_value) / global_header.range;
  // Neither clamp should trigger for data inside the header's range.
  if (f > 1.0) {
    f = 1.0;
  } else if (f < 0.0) {
    f = 0.0;
  }
  // Adding 0.499 before truncation rounds to the closest int; avoids bias.
  return static_cast<int>(f * 65535 + 0.499);
}
|
||||
|
||||
|
||||
// Maps 'value' to an 8-bit code: its position inside
// [min_value, min_value + range] is clamped to [0, 1] and scaled to 0..255.
inline uint8 CompressedMatrix::FloatToUint8(
    const GlobalHeader &global_header,
    float value) {
  float f = (value - global_header.min_value) / global_header.range;
  // Neither clamp should trigger for data inside the header's range.
  if (f > 1.0) {
    f = 1.0;
  } else if (f < 0.0) {
    f = 0.0;
  }
  // Adding 0.499 before truncation rounds to the closest int; avoids bias.
  return static_cast<int>(f * 255 + 0.499);
}
|
||||
|
||||
|
||||
inline float CompressedMatrix::Uint16ToFloat(
|
||||
const GlobalHeader &global_header,
|
||||
uint16 value) {
|
||||
// the constant 1.52590218966964e-05 is 1/65535.
|
||||
return global_header.min_value
|
||||
+ global_header.range * 1.52590218966964e-05F * value;
|
||||
}
|
||||
|
||||
template<typename Real> // static
|
||||
void CompressedMatrix::ComputeColHeader(
|
||||
const GlobalHeader &global_header,
|
||||
const Real *data, MatrixIndexT stride,
|
||||
int32 num_rows, CompressedMatrix::PerColHeader *header) {
|
||||
KALDI_ASSERT(num_rows > 0);
|
||||
std::vector<Real> sdata(num_rows); // the sorted data.
|
||||
for (size_t i = 0, size = sdata.size(); i < size; i++)
|
||||
sdata[i] = data[i*stride];
|
||||
|
||||
if (num_rows >= 5) {
|
||||
int quarter_nr = num_rows/4;
|
||||
// std::sort(sdata.begin(), sdata.end());
|
||||
// The elements at positions 0, quarter_nr,
|
||||
// 3*quarter_nr, and num_rows-1 need to be in sorted order.
|
||||
std::nth_element(sdata.begin(), sdata.begin() + quarter_nr, sdata.end());
|
||||
// Now, sdata.begin() + quarter_nr contains the element that would appear
|
||||
// in sorted order, in that position.
|
||||
std::nth_element(sdata.begin(), sdata.begin(), sdata.begin() + quarter_nr);
|
||||
// Now, sdata.begin() and sdata.begin() + quarter_nr contain the elements
|
||||
// that would appear at those positions in sorted order.
|
||||
std::nth_element(sdata.begin() + quarter_nr + 1,
|
||||
sdata.begin() + (3*quarter_nr), sdata.end());
|
||||
// Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() +
|
||||
// 3*quarter_nr, contain the elements that would appear at those positions
|
||||
// in sorted order.
|
||||
std::nth_element(sdata.begin() + (3*quarter_nr) + 1, sdata.end() - 1,
|
||||
sdata.end());
|
||||
// Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() +
|
||||
// 3*quarter_nr, and sdata.end() - 1, contain the elements that would appear
|
||||
// at those positions in sorted order.
|
||||
|
||||
header->percentile_0 =
|
||||
std::min<uint16>(FloatToUint16(global_header, sdata[0]), 65532);
|
||||
header->percentile_25 =
|
||||
std::min<uint16>(
|
||||
std::max<uint16>(
|
||||
FloatToUint16(global_header, sdata[quarter_nr]),
|
||||
header->percentile_0 + static_cast<uint16>(1)), 65533);
|
||||
header->percentile_75 =
|
||||
std::min<uint16>(
|
||||
std::max<uint16>(
|
||||
FloatToUint16(global_header, sdata[3*quarter_nr]),
|
||||
header->percentile_25 + static_cast<uint16>(1)), 65534);
|
||||
header->percentile_100 = std::max<uint16>(
|
||||
FloatToUint16(global_header, sdata[num_rows-1]),
|
||||
header->percentile_75 + static_cast<uint16>(1));
|
||||
|
||||
} else { // handle this pathological case.
|
||||
std::sort(sdata.begin(), sdata.end());
|
||||
// Note: we know num_rows is at least 1.
|
||||
header->percentile_0 =
|
||||
std::min<uint16>(FloatToUint16(global_header, sdata[0]),
|
||||
65532);
|
||||
if (num_rows > 1)
|
||||
header->percentile_25 =
|
||||
std::min<uint16>(
|
||||
std::max<uint16>(FloatToUint16(global_header, sdata[1]),
|
||||
header->percentile_0 + 1), 65533);
|
||||
else
|
||||
header->percentile_25 = header->percentile_0 + 1;
|
||||
if (num_rows > 2)
|
||||
header->percentile_75 =
|
||||
std::min<uint16>(
|
||||
std::max<uint16>(FloatToUint16(global_header, sdata[2]),
|
||||
header->percentile_25 + 1), 65534);
|
||||
else
|
||||
header->percentile_75 = header->percentile_25 + 1;
|
||||
if (num_rows > 3)
|
||||
header->percentile_100 =
|
||||
std::max<uint16>(FloatToUint16(global_header, sdata[3]),
|
||||
header->percentile_75 + 1);
|
||||
else
|
||||
header->percentile_100 = header->percentile_75 + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// static
|
||||
// Quantizes 'value' to one byte using a piecewise-linear map defined by the
// percentile values: [p0, p25) -> 0..64, [p25, p75) -> 64..192 and
// [p75, p100] -> 192..255.
inline uint8 CompressedMatrix::FloatToChar(
    float p0, float p25, float p75, float p100,
    float value) {
  int ans, lowest, highest;  // the code and the bounds of its segment.
  if (value < p25) {
    // Range [ p0, p25 ): round to the closest int.
    float f = (value - p0) / (p25 - p0);
    ans = static_cast<int>(f * 64 + 0.5);
    lowest = 0;
    highest = 64;
  } else if (value < p75) {
    // Range [ p25, p75 ): round to the closest int.
    float f = (value - p25) / (p75 - p25);
    ans = 64 + static_cast<int>(f * 128 + 0.5);
    lowest = 64;
    highest = 192;
  } else {
    // Range [ p75, p100 ]: this last segment has fewer codes than the first
    // because we go up to 255, not 256.
    float f = (value - p75) / (p100 - p75);
    ans = 192 + static_cast<int>(f * 63 + 0.5);
    lowest = 192;
    highest = 255;
  }
  // The bounds checks are necessary in pathological cases where all the
  // elements are the same and the percentile_* values are separated by one.
  if (ans < lowest) ans = lowest;
  if (ans > highest) ans = highest;
  return static_cast<uint8>(ans);
}
|
||||
|
||||
|
||||
// static
|
||||
inline float CompressedMatrix::CharToFloat(
|
||||
float p0, float p25, float p75, float p100,
|
||||
uint8 value) {
|
||||
if (value <= 64) {
|
||||
return p0 + (p25 - p0) * value * (1/64.0);
|
||||
} else if (value <= 192) {
|
||||
return p25 + (p75 - p25) * (value - 64) * (1/128.0);
|
||||
} else {
|
||||
return p75 + (p100 - p75) * (value - 192) * (1/63.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> // static
|
||||
void CompressedMatrix::CompressColumn(
|
||||
const GlobalHeader &global_header,
|
||||
const Real *data, MatrixIndexT stride,
|
||||
int32 num_rows, CompressedMatrix::PerColHeader *header,
|
||||
uint8 *byte_data) {
|
||||
ComputeColHeader(global_header, data, stride,
|
||||
num_rows, header);
|
||||
|
||||
float p0 = Uint16ToFloat(global_header, header->percentile_0),
|
||||
p25 = Uint16ToFloat(global_header, header->percentile_25),
|
||||
p75 = Uint16ToFloat(global_header, header->percentile_75),
|
||||
p100 = Uint16ToFloat(global_header, header->percentile_100);
|
||||
|
||||
for (int32 i = 0; i < num_rows; i++) {
|
||||
Real this_data = data[i * stride];
|
||||
byte_data[i] = FloatToChar(p0, p25, p75, p100, this_data);
|
||||
}
|
||||
}
|
||||
|
||||
// static
|
||||
// Allocates storage for 'num_bytes' bytes of compressed data as an array of
// floats (so it must be released with delete [] on a float*).
void* CompressedMatrix::AllocateData(int32 num_bytes) {
  KALDI_ASSERT(num_bytes > 0);
  KALDI_COMPILE_TIME_ASSERT(sizeof(float) == 4);
  // NOTE(review): (num_bytes/3) + 4 floats is more than rounding num_bytes up
  // to a whole number of 4-byte floats would need; kept as is.
  const int32 num_floats = (num_bytes/3) + 4;
  float *storage = new float[num_floats];
  return reinterpret_cast<void*>(storage);
}
|
||||
|
||||
void CompressedMatrix::Write(std::ostream &os, bool binary) const {
|
||||
if (binary) { // Binary-mode write:
|
||||
if (data_ != NULL) {
|
||||
GlobalHeader &h = *reinterpret_cast<GlobalHeader*>(data_);
|
||||
DataFormat format = static_cast<DataFormat>(h.format);
|
||||
if (format == kOneByteWithColHeaders) {
|
||||
WriteToken(os, binary, "CM");
|
||||
} else if (format == kTwoByte) {
|
||||
WriteToken(os, binary, "CM2");
|
||||
} else if (format == kOneByte) {
|
||||
WriteToken(os, binary, "CM3");
|
||||
}
|
||||
MatrixIndexT size = DataSize(h); // total size of data in data_
|
||||
// We don't write out the "int32 format", hence the + 4, - 4.
|
||||
os.write(reinterpret_cast<const char*>(data_) + 4, size - 4);
|
||||
} else { // special case: where data_ == NULL, we treat it as an empty
|
||||
// matrix.
|
||||
WriteToken(os, binary, "CM");
|
||||
GlobalHeader h;
|
||||
h.range = h.min_value = 0.0;
|
||||
h.num_rows = h.num_cols = 0;
|
||||
os.write(reinterpret_cast<const char*>(&h), sizeof(h));
|
||||
}
|
||||
} else {
|
||||
// In text mode, just use the same format as a regular matrix.
|
||||
// This is not compressed.
|
||||
Matrix<BaseFloat> temp_mat(this->NumRows(), this->NumCols(),
|
||||
kUndefined);
|
||||
this->CopyToMat(&temp_mat);
|
||||
temp_mat.Write(os, binary);
|
||||
}
|
||||
if (os.fail())
|
||||
KALDI_ERR << "Error writing compressed matrix to stream.";
|
||||
}
|
||||
|
||||
void CompressedMatrix::Read(std::istream &is, bool binary) {
|
||||
if (data_ != NULL) {
|
||||
delete [] (static_cast<float*>(data_));
|
||||
data_ = NULL;
|
||||
}
|
||||
if (binary) {
|
||||
int peekval = Peek(is, binary);
|
||||
if (peekval == 'C') {
|
||||
std::string tok; // Should be CM (format 1) or CM2 (format 2)
|
||||
ReadToken(is, binary, &tok);
|
||||
GlobalHeader h;
|
||||
if (tok == "CM") { h.format = 1; } // kOneByteWithColHeaders
|
||||
else if (tok == "CM2") { h.format = 2; } // kTwoByte
|
||||
else if (tok == "CM3") { h.format = 3; } // kOneByte
|
||||
else {
|
||||
KALDI_ERR << "Unexpected token " << tok << ", expecting CM, CM2 or CM3";
|
||||
}
|
||||
// don't read the "format" -> hence + 4, - 4.
|
||||
is.read(reinterpret_cast<char*>(&h) + 4, sizeof(h) - 4);
|
||||
if (is.fail())
|
||||
KALDI_ERR << "Failed to read header";
|
||||
if (h.num_cols == 0) // empty matrix.
|
||||
return;
|
||||
int32 size = DataSize(h), remaining_size = size - sizeof(GlobalHeader);
|
||||
data_ = AllocateData(size);
|
||||
*(reinterpret_cast<GlobalHeader*>(data_)) = h;
|
||||
is.read(reinterpret_cast<char*>(data_) + sizeof(GlobalHeader),
|
||||
remaining_size);
|
||||
} else {
|
||||
// Assume that what we're reading is a regular Matrix. This might be the
|
||||
// case if you changed your code, making a Matrix into a CompressedMatrix,
|
||||
// and you want back-compatibility for reading.
|
||||
Matrix<BaseFloat> M;
|
||||
M.Read(is, binary); // This will crash if it was not a Matrix.
|
||||
this->CopyFromMat(M);
|
||||
}
|
||||
} else { // Text-mode read. In this case you don't get to
|
||||
// choose the compression type. Anyway this branch would only
|
||||
// be taken when debugging.
|
||||
Matrix<BaseFloat> temp;
|
||||
temp.Read(is, binary);
|
||||
this->CopyFromMat(temp);
|
||||
}
|
||||
if (is.fail())
|
||||
KALDI_ERR << "Failed to read data.";
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CompressedMatrix::CopyToMat(MatrixBase<Real> *mat,
|
||||
MatrixTransposeType trans) const {
|
||||
if (trans == kTrans) {
|
||||
Matrix<Real> temp(this->NumCols(), this->NumRows());
|
||||
CopyToMat(&temp, kNoTrans);
|
||||
mat->CopyFromMat(temp, kTrans);
|
||||
return;
|
||||
}
|
||||
|
||||
if (data_ == NULL) {
|
||||
KALDI_ASSERT(mat->NumRows() == 0);
|
||||
KALDI_ASSERT(mat->NumCols() == 0);
|
||||
return;
|
||||
}
|
||||
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
|
||||
int32 num_cols = h->num_cols, num_rows = h->num_rows;
|
||||
KALDI_ASSERT(mat->NumRows() == num_rows);
|
||||
KALDI_ASSERT(mat->NumCols() == num_cols);
|
||||
|
||||
DataFormat format = static_cast<DataFormat>(h->format);
|
||||
if (format == kOneByteWithColHeaders) {
|
||||
PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
|
||||
uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
|
||||
h->num_cols);
|
||||
for (int32 i = 0; i < num_cols; i++, per_col_header++) {
|
||||
float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
|
||||
p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
|
||||
p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
|
||||
p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
|
||||
for (int32 j = 0; j < num_rows; j++, byte_data++) {
|
||||
float f = CharToFloat(p0, p25, p75, p100, *byte_data);
|
||||
(*mat)(j, i) = f;
|
||||
}
|
||||
}
|
||||
} else if (format == kTwoByte) {
|
||||
const uint16 *data = reinterpret_cast<const uint16*>(h + 1);
|
||||
float min_value = h->min_value,
|
||||
increment = h->range * (1.0 / 65535.0);
|
||||
for (int32 i = 0; i < num_rows; i++) {
|
||||
Real *row_data = mat->RowData(i);
|
||||
for (int32 j = 0; j < num_cols; j++)
|
||||
row_data[j] = min_value + data[j] * increment;
|
||||
data += num_cols;
|
||||
}
|
||||
} else {
|
||||
KALDI_ASSERT(format == kOneByte);
|
||||
float min_value = h->min_value, increment = h->range * (1.0 / 255.0);
|
||||
|
||||
const uint8 *data = reinterpret_cast<const uint8*>(h + 1);
|
||||
for (int32 i = 0; i < num_rows; i++) {
|
||||
Real *row_data = mat->RowData(i);
|
||||
for (int32 j = 0; j < num_cols; j++)
|
||||
row_data[j] = min_value + data[j] * increment;
|
||||
data += num_cols;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Instantiate the template for float and double.
|
||||
template
|
||||
void CompressedMatrix::CopyToMat(MatrixBase<float> *mat,
|
||||
MatrixTransposeType trans) const;
|
||||
template
|
||||
void CompressedMatrix::CopyToMat(MatrixBase<double> *mat,
|
||||
MatrixTransposeType trans) const;
|
||||
|
||||
template<typename Real>
|
||||
void CompressedMatrix::CopyRowToVec(MatrixIndexT row,
|
||||
VectorBase<Real> *v) const {
|
||||
KALDI_ASSERT(row < this->NumRows());
|
||||
KALDI_ASSERT(row >= 0);
|
||||
KALDI_ASSERT(v->Dim() == this->NumCols());
|
||||
|
||||
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
|
||||
DataFormat format = static_cast<DataFormat>(h->format);
|
||||
if (format == kOneByteWithColHeaders) {
|
||||
PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
|
||||
uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
|
||||
h->num_cols);
|
||||
byte_data += row; // point to first value we are interested in
|
||||
for (int32 i = 0; i < h->num_cols;
|
||||
i++, per_col_header++, byte_data += h->num_rows) {
|
||||
float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
|
||||
p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
|
||||
p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
|
||||
p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
|
||||
float f = CharToFloat(p0, p25, p75, p100, *byte_data);
|
||||
(*v)(i) = f;
|
||||
}
|
||||
} else if (format == kTwoByte) {
|
||||
int32 num_cols = h->num_cols;
|
||||
float min_value = h->min_value,
|
||||
increment = h->range * (1.0 / 65535.0);
|
||||
const uint16 *row_data = reinterpret_cast<uint16*>(h + 1) + (num_cols * row);
|
||||
Real *v_data = v->Data();
|
||||
for (int32 c = 0; c < num_cols; c++)
|
||||
v_data[c] = min_value + row_data[c] * increment;
|
||||
} else {
|
||||
KALDI_ASSERT(format == kOneByte);
|
||||
int32 num_cols = h->num_cols;
|
||||
float min_value = h->min_value,
|
||||
increment = h->range * (1.0 / 255.0);
|
||||
const uint8 *row_data = reinterpret_cast<uint8*>(h + 1) + (num_cols * row);
|
||||
Real *v_data = v->Data();
|
||||
for (int32 c = 0; c < num_cols; c++)
|
||||
v_data[c] = min_value + row_data[c] * increment;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CompressedMatrix::CopyColToVec(MatrixIndexT col,
|
||||
VectorBase<Real> *v) const {
|
||||
KALDI_ASSERT(col < this->NumCols());
|
||||
KALDI_ASSERT(col >= 0);
|
||||
KALDI_ASSERT(v->Dim() == this->NumRows());
|
||||
|
||||
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
|
||||
|
||||
DataFormat format = static_cast<DataFormat>(h->format);
|
||||
if (format == kOneByteWithColHeaders) {
|
||||
PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
|
||||
uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
|
||||
h->num_cols);
|
||||
byte_data += col*h->num_rows; // point to first value in the column we want
|
||||
per_col_header += col;
|
||||
float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
|
||||
p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
|
||||
p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
|
||||
p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
|
||||
for (int32 i = 0; i < h->num_rows; i++, byte_data++) {
|
||||
float f = CharToFloat(p0, p25, p75, p100, *byte_data);
|
||||
(*v)(i) = f;
|
||||
}
|
||||
} else if (format == kTwoByte) {
|
||||
int32 num_rows = h->num_rows, num_cols = h->num_cols;
|
||||
float min_value = h->min_value,
|
||||
increment = h->range * (1.0 / 65535.0);
|
||||
const uint16 *col_data = reinterpret_cast<uint16*>(h + 1) + col;
|
||||
Real *v_data = v->Data();
|
||||
for (int32 r = 0; r < num_rows; r++)
|
||||
v_data[r] = min_value + increment * col_data[r * num_cols];
|
||||
} else {
|
||||
KALDI_ASSERT(format == kOneByte);
|
||||
int32 num_rows = h->num_rows, num_cols = h->num_cols;
|
||||
float min_value = h->min_value,
|
||||
increment = h->range * (1.0 / 255.0);
|
||||
const uint8 *col_data = reinterpret_cast<uint8*>(h + 1) + col;
|
||||
Real *v_data = v->Data();
|
||||
for (int32 r = 0; r < num_rows; r++)
|
||||
v_data[r] = min_value + increment * col_data[r * num_cols];
|
||||
}
|
||||
}
|
||||
|
||||
// instantiate the templates.
|
||||
template void
|
||||
CompressedMatrix::CopyColToVec(MatrixIndexT, VectorBase<double> *) const;
|
||||
template void
|
||||
CompressedMatrix::CopyColToVec(MatrixIndexT, VectorBase<float> *) const;
|
||||
template void
|
||||
CompressedMatrix::CopyRowToVec(MatrixIndexT, VectorBase<double> *) const;
|
||||
template void
|
||||
CompressedMatrix::CopyRowToVec(MatrixIndexT, VectorBase<float> *) const;
|
||||
|
||||
template<typename Real>
|
||||
void CompressedMatrix::CopyToMat(int32 row_offset,
|
||||
int32 col_offset,
|
||||
MatrixBase<Real> *dest) const {
|
||||
KALDI_PARANOID_ASSERT(row_offset < this->NumRows());
|
||||
KALDI_PARANOID_ASSERT(col_offset < this->NumCols());
|
||||
KALDI_PARANOID_ASSERT(row_offset >= 0);
|
||||
KALDI_PARANOID_ASSERT(col_offset >= 0);
|
||||
KALDI_ASSERT(row_offset+dest->NumRows() <= this->NumRows());
|
||||
KALDI_ASSERT(col_offset+dest->NumCols() <= this->NumCols());
|
||||
// everything is OK
|
||||
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
|
||||
int32 num_rows = h->num_rows, num_cols = h->num_cols,
|
||||
tgt_cols = dest->NumCols(), tgt_rows = dest->NumRows();
|
||||
|
||||
DataFormat format = static_cast<DataFormat>(h->format);
|
||||
if (format == kOneByteWithColHeaders) {
|
||||
PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
|
||||
uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
|
||||
h->num_cols);
|
||||
|
||||
uint8 *start_of_subcol = byte_data+row_offset; // skip appropriate
|
||||
// number of columns
|
||||
start_of_subcol += col_offset*num_rows; // skip appropriate number of rows
|
||||
|
||||
per_col_header += col_offset; // skip the appropriate number of headers
|
||||
|
||||
for (int32 i = 0;
|
||||
i < tgt_cols;
|
||||
i++, per_col_header++, start_of_subcol+=num_rows) {
|
||||
byte_data = start_of_subcol;
|
||||
float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
|
||||
p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
|
||||
p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
|
||||
p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
|
||||
for (int32 j = 0; j < tgt_rows; j++, byte_data++) {
|
||||
float f = CharToFloat(p0, p25, p75, p100, *byte_data);
|
||||
(*dest)(j, i) = f;
|
||||
}
|
||||
}
|
||||
} else if (format == kTwoByte) {
|
||||
const uint16 *data = reinterpret_cast<const uint16*>(h+1) + col_offset +
|
||||
(num_cols * row_offset);
|
||||
float min_value = h->min_value,
|
||||
increment = h->range * (1.0 / 65535.0);
|
||||
|
||||
for (int32 row = 0; row < tgt_rows; row++) {
|
||||
Real *dest_row = dest->RowData(row);
|
||||
for (int32 col = 0; col < tgt_cols; col++)
|
||||
dest_row[col] = min_value + increment * data[col];
|
||||
data += num_cols;
|
||||
}
|
||||
} else {
|
||||
KALDI_ASSERT(format == kOneByte);
|
||||
const uint8 *data = reinterpret_cast<const uint8*>(h+1) + col_offset +
|
||||
(num_cols * row_offset);
|
||||
float min_value = h->min_value,
|
||||
increment = h->range * (1.0 / 255.0);
|
||||
for (int32 row = 0; row < tgt_rows; row++) {
|
||||
Real *dest_row = dest->RowData(row);
|
||||
for (int32 col = 0; col < tgt_cols; col++)
|
||||
dest_row[col] = min_value + increment * data[col];
|
||||
data += num_cols;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// instantiate the templates.
|
||||
template void CompressedMatrix::CopyToMat(int32,
|
||||
int32,
|
||||
MatrixBase<float> *dest) const;
|
||||
template void CompressedMatrix::CopyToMat(int32,
|
||||
int32,
|
||||
MatrixBase<double> *dest) const;
|
||||
|
||||
void CompressedMatrix::Clear() {
|
||||
if (data_ != NULL) {
|
||||
delete [] static_cast<float*>(data_);
|
||||
data_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/// Copy constructor: starts out empty, then delegates the actual copy to the
/// assignment operator.
CompressedMatrix::CompressedMatrix(const CompressedMatrix &mat): data_(NULL) {
  this->operator=(mat);
}
|
||||
|
||||
/// Copy assignment: replaces the contents of *this with a copy of the
/// compressed data held by 'mat'.
///
/// Self-assignment is a no-op.  Without the guard below, Clear() would free
/// the very buffer we are about to copy from and the matrix would silently
/// end up empty.
///
/// @param mat  Matrix to copy from (may be empty).
/// @return     Reference to *this.
CompressedMatrix &CompressedMatrix::operator = (const CompressedMatrix &mat) {
  if (this == &mat) return *this;  // nothing to do; keep our data intact.
  Clear();  // now this->data_ == NULL.
  if (mat.data_ != NULL) {
    // The header stored at the start of the buffer determines its total size.
    MatrixIndexT data_size = DataSize(*static_cast<GlobalHeader*>(mat.data_));
    data_ = AllocateData(data_size);
    memcpy(static_cast<void*>(data_),
           static_cast<void*>(mat.data_),
           data_size);
  }
  return *this;
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
@ -1,283 +0,0 @@
|
||||
// matrix/compressed-matrix.h
|
||||
|
||||
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
|
||||
// Frantisek Skala, Wei Shi
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_MATRIX_COMPRESSED_MATRIX_H_
|
||||
#define KALDI_MATRIX_COMPRESSED_MATRIX_H_ 1
|
||||
|
||||
#include "matrix/kaldi-matrix.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/// \addtogroup matrix_group
|
||||
/// @{
|
||||
|
||||
|
||||
|
||||
/*
|
||||
The enum CompressionMethod is used when creating a CompressedMatrix (a lossily
|
||||
compressed matrix) from a regular Matrix. It dictates how we choose the
|
||||
compressed format and how we choose the ranges of floats that are represented
|
||||
by particular integers.
|
||||
|
||||
kAutomaticMethod = 1 This is the default when you don't specify the
|
||||
compression method. It is a shorthand for using
|
||||
kSpeechFeature if the num-rows is more than 8, and
|
||||
kTwoByteAuto otherwise.
|
||||
kSpeechFeature = 2 This is the most complicated of the compression methods,
|
||||
and was designed for speech features which have a roughly
|
||||
Gaussian distribution with different ranges for each
|
||||
dimension. Each element is stored in one byte, but there
|
||||
is an 8-byte header per column; the spacing of the
|
||||
integer values is not uniform but is in 3 ranges.
|
||||
kTwoByteAuto = 3 Each element is stored in two bytes as a uint16, with
|
||||
the representable range of values chosen automatically
|
||||
with the minimum and maximum elements of the matrix as
|
||||
its edges.
|
||||
kTwoByteSignedInteger = 4
|
||||
Each element is stored in two bytes as a uint16, with
|
||||
the representable range of values chosen to coincide with
|
||||
what you'd get if you stored signed integers, i.e.
|
||||
[-32768.0, 32767.0]. Suitable for waveform data that
|
||||
was previously stored as 16-bit PCM.
|
||||
kOneByteAuto = 5 Each element is stored in one byte as a uint8, with the
|
||||
representable range of values chosen automatically with
|
||||
the minimum and maximum elements of the matrix as its
|
||||
edges.
|
||||
kOneByteUnsignedInteger = 6 Each element is stored in
|
||||
one byte as a uint8, with the representable range of
|
||||
values equal to [0.0, 255.0].
|
||||
kOneByteZeroOne = 7 Each element is stored in
|
||||
one byte as a uint8, with the representable range of
|
||||
values equal to [0.0, 1.0]. Suitable for image data
|
||||
that has previously been compressed as int8.
|
||||
|
||||
// We can add new methods here as needed: if they just imply different ways
|
||||
// of selecting the min_value and range, and a num-bytes = 1 or 2, they will
|
||||
// be trivial to implement.
|
||||
*/
|
||||
enum CompressionMethod {
  kAutomaticMethod = 1,         ///< Default; picks one of the methods below.
  kSpeechFeature = 2,           ///< One byte per element plus per-column header.
  kTwoByteAuto = 3,             ///< uint16, range taken from the matrix min/max.
  kTwoByteSignedInteger = 4,    ///< uint16, range [-32768.0, 32767.0].
  kOneByteAuto = 5,             ///< uint8, range taken from the matrix min/max.
  kOneByteUnsignedInteger = 6,  ///< uint8, range [0.0, 255.0].
  kOneByteZeroOne = 7           ///< uint8, range [0.0, 1.0].
};
|
||||
|
||||
|
||||
/*
|
||||
This class does lossy compression of a matrix. It supports various compression
|
||||
methods, see enum CompressionMethod.
|
||||
*/
|
||||
|
||||
class CompressedMatrix {
|
||||
public:
|
||||
CompressedMatrix(): data_(NULL) { }
|
||||
|
||||
~CompressedMatrix() { Clear(); }
|
||||
|
||||
template<typename Real>
|
||||
explicit CompressedMatrix(const MatrixBase<Real> &mat,
|
||||
CompressionMethod method = kAutomaticMethod):
|
||||
data_(NULL) { CopyFromMat(mat, method); }
|
||||
|
||||
/// Initializer that can be used to select part of an existing
|
||||
/// CompressedMatrix without un-compressing and re-compressing (note: unlike
|
||||
/// similar initializers for class Matrix, it doesn't point to the same memory
|
||||
/// location).
|
||||
///
|
||||
/// This creates a CompressedMatrix with the size (num_rows, num_cols)
|
||||
/// starting at (row_offset, col_offset).
|
||||
///
|
||||
/// If you specify allow_padding = true,
|
||||
/// it is permitted to have row_offset < 0 and
|
||||
/// row_offset + num_rows > mat.NumRows(), and the result will contain
|
||||
/// repeats of the first and last rows of 'mat' as necessary.
|
||||
CompressedMatrix(const CompressedMatrix &mat,
|
||||
const MatrixIndexT row_offset,
|
||||
const MatrixIndexT num_rows,
|
||||
const MatrixIndexT col_offset,
|
||||
const MatrixIndexT num_cols,
|
||||
bool allow_padding = false);
|
||||
|
||||
void *Data() const { return this->data_; }
|
||||
|
||||
/// This will resize *this and copy the contents of mat to *this.
|
||||
template<typename Real>
|
||||
void CopyFromMat(const MatrixBase<Real> &mat,
|
||||
CompressionMethod method = kAutomaticMethod);
|
||||
|
||||
CompressedMatrix(const CompressedMatrix &mat);
|
||||
|
||||
CompressedMatrix &operator = (const CompressedMatrix &mat); // assignment operator.
|
||||
|
||||
template<typename Real>
|
||||
CompressedMatrix &operator = (const MatrixBase<Real> &mat); // assignment operator.
|
||||
|
||||
/// Copies contents to matrix. Note: mat must have the correct size.
|
||||
/// The kTrans case uses a temporary.
|
||||
template<typename Real>
|
||||
void CopyToMat(MatrixBase<Real> *mat,
|
||||
MatrixTransposeType trans = kNoTrans) const;
|
||||
|
||||
void Write(std::ostream &os, bool binary) const;
|
||||
|
||||
void Read(std::istream &is, bool binary);
|
||||
|
||||
/// Returns number of rows (or zero for emtpy matrix).
|
||||
inline MatrixIndexT NumRows() const { return (data_ == NULL) ? 0 :
|
||||
(*reinterpret_cast<GlobalHeader*>(data_)).num_rows; }
|
||||
|
||||
/// Returns number of columns (or zero for emtpy matrix).
|
||||
inline MatrixIndexT NumCols() const { return (data_ == NULL) ? 0 :
|
||||
(*reinterpret_cast<GlobalHeader*>(data_)).num_cols; }
|
||||
|
||||
/// Copies row #row of the matrix into vector v.
|
||||
/// Note: v must have same size as #cols.
|
||||
template<typename Real>
|
||||
void CopyRowToVec(MatrixIndexT row, VectorBase<Real> *v) const;
|
||||
|
||||
/// Copies column #col of the matrix into vector v.
|
||||
/// Note: v must have same size as #rows.
|
||||
template<typename Real>
|
||||
void CopyColToVec(MatrixIndexT col, VectorBase<Real> *v) const;
|
||||
|
||||
/// Copies submatrix of compressed matrix into matrix dest.
|
||||
/// Submatrix starts at row row_offset and column column_offset and its size
|
||||
/// is defined by size of provided matrix dest
|
||||
template<typename Real>
|
||||
void CopyToMat(int32 row_offset,
|
||||
int32 column_offset,
|
||||
MatrixBase<Real> *dest) const;
|
||||
|
||||
void Swap(CompressedMatrix *other) { std::swap(data_, other->data_); }
|
||||
|
||||
void Clear();
|
||||
|
||||
/// scales all elements of matrix by alpha.
|
||||
/// It scales the floating point values in GlobalHeader by alpha.
|
||||
void Scale(float alpha);
|
||||
|
||||
friend class Matrix<float>;
|
||||
friend class Matrix<double>;
|
||||
private:
|
||||
|
||||
// This enum describes the different compressed-data formats: these are
|
||||
// distinct from the compression methods although all of the methods apart
|
||||
// from kAutomaticMethod dictate a particular compressed-data format.
|
||||
//
|
||||
// kOneByteWithColHeaders means there is a GlobalHeader and each
|
||||
// column has a PerColHeader; the actual data is stored in
|
||||
// one byte per element, in column-major order (the mapping
|
||||
// from integers to floats is a little complicated).
|
||||
// kTwoByte means there is a global header but no PerColHeader;
|
||||
// the actual data is stored in two bytes per element in
|
||||
// row-major order; it's decompressed as:
|
||||
// uint16 i; GlobalHeader g;
|
||||
// float f = g.min_value + i * (g.range / 65535.0)
|
||||
// kOneByte means there is a global header but not PerColHeader;
|
||||
// the data is stored in one byte per element in row-major
|
||||
// order and is decompressed as:
|
||||
// uint8 i; GlobalHeader g;
|
||||
// float f = g.min_value + i * (g.range / 255.0)
|
||||
enum DataFormat {
|
||||
kOneByteWithColHeaders = 1,
|
||||
kTwoByte = 2,
|
||||
kOneByte = 3
|
||||
};
|
||||
|
||||
|
||||
// allocates data using new [], ensures byte alignment
|
||||
// sufficient for float.
|
||||
static void *AllocateData(int32 num_bytes);
|
||||
|
||||
struct GlobalHeader {
|
||||
int32 format; // Represents the enum DataFormat.
|
||||
float min_value; // min_value and range represent the ranges of the integer
|
||||
// data in the kTwoByte and kOneByte formats, and the
|
||||
// range of the PerColHeader uint16's in the
|
||||
// kOneByteWithColheaders format.
|
||||
float range;
|
||||
int32 num_rows;
|
||||
int32 num_cols;
|
||||
};
|
||||
|
||||
// This function computes the global header for compressing this data.
|
||||
template<typename Real>
|
||||
static inline void ComputeGlobalHeader(const MatrixBase<Real> &mat,
|
||||
CompressionMethod method,
|
||||
GlobalHeader *header);
|
||||
|
||||
|
||||
// The number of bytes we need to request when allocating 'data_'.
|
||||
static MatrixIndexT DataSize(const GlobalHeader &header);
|
||||
|
||||
// This struct is only used in format kOneByteWithColHeaders.
|
||||
struct PerColHeader {
|
||||
uint16 percentile_0;
|
||||
uint16 percentile_25;
|
||||
uint16 percentile_75;
|
||||
uint16 percentile_100;
|
||||
};
|
||||
|
||||
template<typename Real>
|
||||
static void CompressColumn(const GlobalHeader &global_header,
|
||||
const Real *data, MatrixIndexT stride,
|
||||
int32 num_rows, PerColHeader *header,
|
||||
uint8 *byte_data);
|
||||
template<typename Real>
|
||||
static void ComputeColHeader(const GlobalHeader &global_header,
|
||||
const Real *data, MatrixIndexT stride,
|
||||
int32 num_rows, PerColHeader *header);
|
||||
|
||||
static inline uint16 FloatToUint16(const GlobalHeader &global_header,
|
||||
float value);
|
||||
|
||||
// this is used only in the kOneByte compression format.
|
||||
static inline uint8 FloatToUint8(const GlobalHeader &global_header,
|
||||
float value);
|
||||
|
||||
static inline float Uint16ToFloat(const GlobalHeader &global_header,
|
||||
uint16 value);
|
||||
|
||||
// this is used only in the kOneByteWithColHeaders compression format.
|
||||
static inline uint8 FloatToChar(float p0, float p25,
|
||||
float p75, float p100,
|
||||
float value);
|
||||
|
||||
// this is used only in the kOneByteWithColHeaders compression format.
|
||||
static inline float CharToFloat(float p0, float p25,
|
||||
float p75, float p100,
|
||||
uint8 value);
|
||||
|
||||
void *data_; // first GlobalHeader, then PerColHeader (repeated), then
|
||||
// the byte data for each column (repeated). Note: don't intersperse
|
||||
// the byte data with the PerColHeaders, because of alignment issues.
|
||||
|
||||
};
|
||||
|
||||
/// @} end of \addtogroup matrix_group
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
#endif // KALDI_MATRIX_COMPRESSED_MATRIX_H_
|
@ -1,924 +0,0 @@
|
||||
// matrix/jama-eig.h
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// This file consists of a port and modification of materials from
|
||||
// JAMA: A Java Matrix Package
|
||||
// under the following notice: This software is a cooperative product of
|
||||
// The MathWorks and the National Institute of Standards and Technology (NIST)
|
||||
// which has been released to the public. This notice and the original code are
|
||||
// available at http://math.nist.gov/javanumerics/jama/domain.notice
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_MATRIX_JAMA_EIG_H_
|
||||
#define KALDI_MATRIX_JAMA_EIG_H_ 1
|
||||
|
||||
#include "matrix/kaldi-matrix.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
// This class is not to be used externally. See the Eig function in the Matrix
|
||||
// class in kaldi-matrix.h. This is the external interface.
|
||||
|
||||
template<typename Real> class EigenvalueDecomposition {
|
||||
// This class is based on the EigenvalueDecomposition class from the JAMA
|
||||
// library (version 1.0.2).
|
||||
public:
|
||||
EigenvalueDecomposition(const MatrixBase<Real> &A);
|
||||
|
||||
~EigenvalueDecomposition(); // free memory.
|
||||
|
||||
void GetV(MatrixBase<Real> *V_out) { // V is what we call P externally; it's the matrix of
|
||||
// eigenvectors.
|
||||
KALDI_ASSERT(V_out->NumRows() == static_cast<MatrixIndexT>(n_)
|
||||
&& V_out->NumCols() == static_cast<MatrixIndexT>(n_));
|
||||
for (int i = 0; i < n_; i++)
|
||||
for (int j = 0; j < n_; j++)
|
||||
(*V_out)(i, j) = V(i, j); // V(i, j) is member function.
|
||||
}
|
||||
void GetRealEigenvalues(VectorBase<Real> *r_out) {
|
||||
// returns real part of eigenvalues.
|
||||
KALDI_ASSERT(r_out->Dim() == static_cast<MatrixIndexT>(n_));
|
||||
for (int i = 0; i < n_; i++)
|
||||
(*r_out)(i) = d_[i];
|
||||
}
|
||||
void GetImagEigenvalues(VectorBase<Real> *i_out) {
|
||||
// returns imaginary part of eigenvalues.
|
||||
KALDI_ASSERT(i_out->Dim() == static_cast<MatrixIndexT>(n_));
|
||||
for (int i = 0; i < n_; i++)
|
||||
(*i_out)(i) = e_[i];
|
||||
}
|
||||
private:
|
||||
|
||||
inline Real &H(int r, int c) { return H_[r*n_ + c]; }
|
||||
inline Real &V(int r, int c) { return V_[r*n_ + c]; }
|
||||
|
||||
// complex division
|
||||
inline static void cdiv(Real xr, Real xi, Real yr, Real yi, Real *cdivr, Real *cdivi) {
|
||||
Real r, d;
|
||||
if (std::abs(yr) > std::abs(yi)) {
|
||||
r = yi/yr;
|
||||
d = yr + r*yi;
|
||||
*cdivr = (xr + r*xi)/d;
|
||||
*cdivi = (xi - r*xr)/d;
|
||||
} else {
|
||||
r = yr/yi;
|
||||
d = yi + r*yr;
|
||||
*cdivr = (r*xr + xi)/d;
|
||||
*cdivi = (r*xi - xr)/d;
|
||||
}
|
||||
}
|
||||
|
||||
// Nonsymmetric reduction from Hessenberg to real Schur form.
|
||||
void Hqr2 ();
|
||||
|
||||
|
||||
int n_; // matrix dimension.
|
||||
|
||||
Real *d_, *e_; // real and imaginary parts of eigenvalues.
|
||||
Real *V_; // the eigenvectors (P in our external notation)
|
||||
Real *H_; // the nonsymmetric Hessenberg form.
|
||||
Real *ort_; // working storage for nonsymmetric algorithm.
|
||||
|
||||
// Symmetric Householder reduction to tridiagonal form.
|
||||
void Tred2 ();
|
||||
|
||||
// Symmetric tridiagonal QL algorithm.
|
||||
void Tql2 ();
|
||||
|
||||
// Nonsymmetric reduction to Hessenberg form.
|
||||
void Orthes ();
|
||||
|
||||
};
|
||||
|
||||
// Force instantiation of the class for both supported element types.
template class EigenvalueDecomposition<double>;
template class EigenvalueDecomposition<float>;
|
||||
|
||||
// Symmetric Householder reduction to tridiagonal form.
//
// Derived from the Algol procedure tred2 by Bowdler, Martin, Reinsch and
// Wilkinson, Handbook for Auto. Comp., Vol.ii-Linear Algebra, and the
// corresponding Fortran subroutine in EISPACK.
// Works in place on V_, d_ and e_.
template<typename Real> void EigenvalueDecomposition<Real>::Tred2() {
  const int last = n_ - 1;

  for (int col = 0; col < n_; ++col)
    d_[col] = V(last, col);

  // Householder reduction to tridiagonal form.
  for (int i = last; i > 0; --i) {
    // Scale to avoid under/overflow.
    Real scale = 0.0;
    Real h = 0.0;
    for (int k = 0; k < i; ++k)
      scale += std::abs(d_[k]);

    if (scale == 0.0) {
      e_[i] = d_[i - 1];
      for (int j = 0; j < i; ++j) {
        d_[j] = V(i - 1, j);
        V(i, j) = 0.0;
        V(j, i) = 0.0;
      }
    } else {
      // Generate Householder vector.
      for (int k = 0; k < i; ++k) {
        d_[k] /= scale;
        h += d_[k] * d_[k];
      }
      Real f = d_[i - 1];
      Real g = std::sqrt(h);
      if (f > 0)
        g = -g;
      e_[i] = scale * g;
      h -= f * g;
      d_[i - 1] = f - g;
      for (int j = 0; j < i; ++j)
        e_[j] = 0.0;

      // Apply similarity transformation to remaining columns.
      for (int j = 0; j < i; ++j) {
        f = d_[j];
        V(j, i) = f;
        g = e_[j] + V(j, j) * f;
        for (int k = j + 1; k < i; ++k) {
          g += V(k, j) * d_[k];
          e_[k] += V(k, j) * f;
        }
        e_[j] = g;
      }
      f = 0.0;
      for (int j = 0; j < i; ++j) {
        e_[j] /= h;
        f += e_[j] * d_[j];
      }
      Real hh = f / (h + h);
      for (int j = 0; j < i; ++j)
        e_[j] -= hh * d_[j];
      for (int j = 0; j < i; ++j) {
        f = d_[j];
        g = e_[j];
        for (int k = j; k < i; ++k)
          V(k, j) -= (f * e_[k] + g * d_[k]);
        d_[j] = V(i - 1, j);
        V(i, j) = 0.0;
      }
    }
    d_[i] = h;
  }

  // Accumulate transformations.
  for (int i = 0; i < last; ++i) {
    const int next = i + 1;
    V(last, i) = V(i, i);
    V(i, i) = 1.0;
    Real h = d_[next];
    if (h != 0.0) {
      for (int k = 0; k <= i; ++k)
        d_[k] = V(k, next) / h;
      for (int j = 0; j <= i; ++j) {
        Real g = 0.0;
        for (int k = 0; k <= i; ++k)
          g += V(k, next) * V(k, j);
        for (int k = 0; k <= i; ++k)
          V(k, j) -= g * d_[k];
      }
    }
    for (int k = 0; k <= i; ++k)
      V(k, next) = 0.0;
  }
  for (int col = 0; col < n_; ++col) {
    d_[col] = V(last, col);
    V(last, col) = 0.0;
  }
  V(last, last) = 1.0;
  e_[0] = 0.0;
}
|
||||
|
||||
// Symmetric tridiagonal QL algorithm.
//
// Derived from the Algol procedure tql2 by Bowdler, Martin, Reinsch and
// Wilkinson, Handbook for Auto. Comp., Vol.ii-Linear Algebra, and the
// corresponding Fortran subroutine in EISPACK.
// Works in place on d_, e_ and V_; at the end the entries of d_ are sorted
// into increasing order and the columns of V_ are permuted to match.
template<typename Real> void EigenvalueDecomposition<Real>::Tql2() {
  // Shift the sub-diagonal entries down by one position.
  for (int i = 1; i < n_; ++i)
    e_[i - 1] = e_[i];
  e_[n_ - 1] = 0.0;

  Real f = 0.0;
  Real tst1 = 0.0;
  Real eps = std::numeric_limits<Real>::epsilon();
  for (int l = 0; l < n_; ++l) {
    // Find small subdiagonal element.
    tst1 = std::max(tst1, std::abs(d_[l]) + std::abs(e_[l]));
    int m = l;
    for (; m < n_; ++m) {
      if (std::abs(e_[m]) <= eps * tst1)
        break;
    }

    // If m == l, d_[l] is an eigenvalue; otherwise, iterate.
    if (m > l) {
      int iter = 0;
      do {
        ++iter;  // (Could check iteration count here.)

        // Compute implicit shift.
        Real g = d_[l];
        Real p = (d_[l + 1] - g) / (2.0 * e_[l]);
        // Hypot is a Kaldi version of hypot that works with templates.
        Real r = Hypot(p, static_cast<Real>(1.0));
        if (p < 0)
          r = -r;
        d_[l] = e_[l] / (p + r);
        d_[l + 1] = e_[l] * (p + r);
        Real dl1 = d_[l + 1];
        Real h = g - d_[l];
        for (int i = l + 2; i < n_; ++i)
          d_[i] -= h;
        f += h;

        // Implicit QL transformation.
        p = d_[m];
        Real c = 1.0;
        Real c2 = c;
        Real c3 = c;
        Real el1 = e_[l + 1];
        Real s = 0.0;
        Real s2 = 0.0;
        for (int i = m - 1; i >= l; --i) {
          c3 = c2;
          c2 = c;
          s2 = s;
          g = c * e_[i];
          h = c * p;
          r = Hypot(p, e_[i]);
          e_[i + 1] = s * r;
          s = e_[i] / r;
          c = p / r;
          p = c * d_[i] - s * g;
          d_[i + 1] = h + s * (c * g + s * d_[i]);

          // Accumulate transformation.
          for (int k = 0; k < n_; ++k) {
            h = V(k, i + 1);
            V(k, i + 1) = s * V(k, i) + c * h;
            V(k, i) = c * V(k, i) - s * h;
          }
        }
        p = -s * s2 * c3 * el1 * e_[l] / dl1;
        e_[l] = s * p;
        d_[l] = c * p;

        // Check for convergence.
      } while (std::abs(e_[l]) > eps * tst1);
    }
    d_[l] += f;
    e_[l] = 0.0;
  }

  // Sort eigenvalues and corresponding vectors: for each position, find the
  // smallest remaining entry of d_ and exchange it (and its column of V_)
  // into place.
  for (int i = 0; i < n_ - 1; ++i) {
    int smallest = i;
    Real p = d_[i];
    for (int j = i + 1; j < n_; ++j) {
      if (d_[j] < p) {
        smallest = j;
        p = d_[j];
      }
    }
    if (smallest != i) {
      d_[smallest] = d_[i];
      d_[i] = p;
      for (int row = 0; row < n_; ++row) {
        p = V(row, i);
        V(row, i) = V(row, smallest);
        V(row, smallest) = p;
      }
    }
  }
}
|
||||
|
||||
template<typename Real>
|
||||
void EigenvalueDecomposition<Real>::Orthes() {
|
||||
|
||||
// This is derived from the Algol procedures orthes and ortran,
|
||||
// by Martin and Wilkinson, Handbook for Auto. Comp.,
|
||||
// Vol.ii-Linear Algebra, and the corresponding
|
||||
// Fortran subroutines in EISPACK.
|
||||
|
||||
int low = 0;
|
||||
int high = n_-1;
|
||||
|
||||
for (int m = low+1; m <= high-1; m++) {
|
||||
|
||||
// Scale column.
|
||||
|
||||
Real scale = 0.0;
|
||||
for (int i = m; i <= high; i++) {
|
||||
scale = scale + std::abs(H(i, m-1));
|
||||
}
|
||||
if (scale != 0.0) {
|
||||
|
||||
// Compute Householder transformation.
|
||||
|
||||
Real h = 0.0;
|
||||
for (int i = high; i >= m; i--) {
|
||||
ort_[i] = H(i, m-1)/scale;
|
||||
h += ort_[i] * ort_[i];
|
||||
}
|
||||
Real g = std::sqrt(h);
|
||||
if (ort_[m] > 0) {
|
||||
g = -g;
|
||||
}
|
||||
h = h - ort_[m] * g;
|
||||
ort_[m] = ort_[m] - g;
|
||||
|
||||
// Apply Householder similarity transformation
|
||||
// H = (I-u*u'/h)*H*(I-u*u')/h)
|
||||
|
||||
for (int j = m; j < n_; j++) {
|
||||
Real f = 0.0;
|
||||
for (int i = high; i >= m; i--) {
|
||||
f += ort_[i]*H(i, j);
|
||||
}
|
||||
f = f/h;
|
||||
for (int i = m; i <= high; i++) {
|
||||
H(i, j) -= f*ort_[i];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i <= high; i++) {
|
||||
Real f = 0.0;
|
||||
for (int j = high; j >= m; j--) {
|
||||
f += ort_[j]*H(i, j);
|
||||
}
|
||||
f = f/h;
|
||||
for (int j = m; j <= high; j++) {
|
||||
H(i, j) -= f*ort_[j];
|
||||
}
|
||||
}
|
||||
ort_[m] = scale*ort_[m];
|
||||
H(m, m-1) = scale*g;
|
||||
}
|
||||
}
|
||||
|
||||
// Accumulate transformations (Algol's ortran).
|
||||
|
||||
for (int i = 0; i < n_; i++) {
|
||||
for (int j = 0; j < n_; j++) {
|
||||
V(i, j) = (i == j ? 1.0 : 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
for (int m = high-1; m >= low+1; m--) {
|
||||
if (H(m, m-1) != 0.0) {
|
||||
for (int i = m+1; i <= high; i++) {
|
||||
ort_[i] = H(i, m-1);
|
||||
}
|
||||
for (int j = m; j <= high; j++) {
|
||||
Real g = 0.0;
|
||||
for (int i = m; i <= high; i++) {
|
||||
g += ort_[i] * V(i, j);
|
||||
}
|
||||
// Double division avoids possible underflow
|
||||
g = (g / ort_[m]) / H(m, m-1);
|
||||
for (int i = m; i <= high; i++) {
|
||||
V(i, j) += g * ort_[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void EigenvalueDecomposition<Real>::Hqr2() {
|
||||
// This is derived from the Algol procedure hqr2,
|
||||
// by Martin and Wilkinson, Handbook for Auto. Comp.,
|
||||
// Vol.ii-Linear Algebra, and the corresponding
|
||||
// Fortran subroutine in EISPACK.
|
||||
|
||||
int nn = n_;
|
||||
int n = nn-1;
|
||||
int low = 0;
|
||||
int high = nn-1;
|
||||
Real eps = std::numeric_limits<Real>::epsilon();
|
||||
Real exshift = 0.0;
|
||||
Real p = 0, q = 0, r = 0, s = 0, z=0, t, w, x, y;
|
||||
|
||||
// Store roots isolated by balanc and compute matrix norm
|
||||
|
||||
Real norm = 0.0;
|
||||
for (int i = 0; i < nn; i++) {
|
||||
if (i < low || i > high) {
|
||||
d_[i] = H(i, i);
|
||||
e_[i] = 0.0;
|
||||
}
|
||||
for (int j = std::max(i-1, 0); j < nn; j++) {
|
||||
norm = norm + std::abs(H(i, j));
|
||||
}
|
||||
}
|
||||
|
||||
// Outer loop over eigenvalue index
|
||||
|
||||
int iter = 0;
|
||||
while (n >= low) {
|
||||
|
||||
// Look for single small sub-diagonal element
|
||||
|
||||
int l = n;
|
||||
while (l > low) {
|
||||
s = std::abs(H(l-1, l-1)) + std::abs(H(l, l));
|
||||
if (s == 0.0) {
|
||||
s = norm;
|
||||
}
|
||||
if (std::abs(H(l, l-1)) < eps * s) {
|
||||
break;
|
||||
}
|
||||
l--;
|
||||
}
|
||||
|
||||
// Check for convergence
|
||||
// One root found
|
||||
|
||||
if (l == n) {
|
||||
H(n, n) = H(n, n) + exshift;
|
||||
d_[n] = H(n, n);
|
||||
e_[n] = 0.0;
|
||||
n--;
|
||||
iter = 0;
|
||||
|
||||
// Two roots found
|
||||
|
||||
} else if (l == n-1) {
|
||||
w = H(n, n-1) * H(n-1, n);
|
||||
p = (H(n-1, n-1) - H(n, n)) / 2.0;
|
||||
q = p * p + w;
|
||||
z = std::sqrt(std::abs(q));
|
||||
H(n, n) = H(n, n) + exshift;
|
||||
H(n-1, n-1) = H(n-1, n-1) + exshift;
|
||||
x = H(n, n);
|
||||
|
||||
// Real pair
|
||||
|
||||
if (q >= 0) {
|
||||
if (p >= 0) {
|
||||
z = p + z;
|
||||
} else {
|
||||
z = p - z;
|
||||
}
|
||||
d_[n-1] = x + z;
|
||||
d_[n] = d_[n-1];
|
||||
if (z != 0.0) {
|
||||
d_[n] = x - w / z;
|
||||
}
|
||||
e_[n-1] = 0.0;
|
||||
e_[n] = 0.0;
|
||||
x = H(n, n-1);
|
||||
s = std::abs(x) + std::abs(z);
|
||||
p = x / s;
|
||||
q = z / s;
|
||||
r = std::sqrt(p * p+q * q);
|
||||
p = p / r;
|
||||
q = q / r;
|
||||
|
||||
// Row modification
|
||||
|
||||
for (int j = n-1; j < nn; j++) {
|
||||
z = H(n-1, j);
|
||||
H(n-1, j) = q * z + p * H(n, j);
|
||||
H(n, j) = q * H(n, j) - p * z;
|
||||
}
|
||||
|
||||
// Column modification
|
||||
|
||||
for (int i = 0; i <= n; i++) {
|
||||
z = H(i, n-1);
|
||||
H(i, n-1) = q * z + p * H(i, n);
|
||||
H(i, n) = q * H(i, n) - p * z;
|
||||
}
|
||||
|
||||
// Accumulate transformations
|
||||
|
||||
for (int i = low; i <= high; i++) {
|
||||
z = V(i, n-1);
|
||||
V(i, n-1) = q * z + p * V(i, n);
|
||||
V(i, n) = q * V(i, n) - p * z;
|
||||
}
|
||||
|
||||
// Complex pair
|
||||
|
||||
} else {
|
||||
d_[n-1] = x + p;
|
||||
d_[n] = x + p;
|
||||
e_[n-1] = z;
|
||||
e_[n] = -z;
|
||||
}
|
||||
n = n - 2;
|
||||
iter = 0;
|
||||
|
||||
// No convergence yet
|
||||
|
||||
} else {
|
||||
|
||||
// Form shift
|
||||
|
||||
x = H(n, n);
|
||||
y = 0.0;
|
||||
w = 0.0;
|
||||
if (l < n) {
|
||||
y = H(n-1, n-1);
|
||||
w = H(n, n-1) * H(n-1, n);
|
||||
}
|
||||
|
||||
// Wilkinson's original ad hoc shift
|
||||
|
||||
if (iter == 10) {
|
||||
exshift += x;
|
||||
for (int i = low; i <= n; i++) {
|
||||
H(i, i) -= x;
|
||||
}
|
||||
s = std::abs(H(n, n-1)) + std::abs(H(n-1, n-2));
|
||||
x = y = 0.75 * s;
|
||||
w = -0.4375 * s * s;
|
||||
}
|
||||
|
||||
// MATLAB's new ad hoc shift
|
||||
|
||||
if (iter == 30) {
|
||||
s = (y - x) / 2.0;
|
||||
s = s * s + w;
|
||||
if (s > 0) {
|
||||
s = std::sqrt(s);
|
||||
if (y < x) {
|
||||
s = -s;
|
||||
}
|
||||
s = x - w / ((y - x) / 2.0 + s);
|
||||
for (int i = low; i <= n; i++) {
|
||||
H(i, i) -= s;
|
||||
}
|
||||
exshift += s;
|
||||
x = y = w = 0.964;
|
||||
}
|
||||
}
|
||||
|
||||
iter = iter + 1; // (Could check iteration count here.)
|
||||
|
||||
// Look for two consecutive small sub-diagonal elements
|
||||
|
||||
int m = n-2;
|
||||
while (m >= l) {
|
||||
z = H(m, m);
|
||||
r = x - z;
|
||||
s = y - z;
|
||||
p = (r * s - w) / H(m+1, m) + H(m, m+1);
|
||||
q = H(m+1, m+1) - z - r - s;
|
||||
r = H(m+2, m+1);
|
||||
s = std::abs(p) + std::abs(q) + std::abs(r);
|
||||
p = p / s;
|
||||
q = q / s;
|
||||
r = r / s;
|
||||
if (m == l) {
|
||||
break;
|
||||
}
|
||||
if (std::abs(H(m, m-1)) * (std::abs(q) + std::abs(r)) <
|
||||
eps * (std::abs(p) * (std::abs(H(m-1, m-1)) + std::abs(z) +
|
||||
std::abs(H(m+1, m+1))))) {
|
||||
break;
|
||||
}
|
||||
m--;
|
||||
}
|
||||
|
||||
for (int i = m+2; i <= n; i++) {
|
||||
H(i, i-2) = 0.0;
|
||||
if (i > m+2) {
|
||||
H(i, i-3) = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
// Double QR step involving rows l:n and columns m:n
|
||||
|
||||
for (int k = m; k <= n-1; k++) {
|
||||
bool notlast = (k != n-1);
|
||||
if (k != m) {
|
||||
p = H(k, k-1);
|
||||
q = H(k+1, k-1);
|
||||
r = (notlast ? H(k+2, k-1) : 0.0);
|
||||
x = std::abs(p) + std::abs(q) + std::abs(r);
|
||||
if (x != 0.0) {
|
||||
p = p / x;
|
||||
q = q / x;
|
||||
r = r / x;
|
||||
}
|
||||
}
|
||||
if (x == 0.0) {
|
||||
break;
|
||||
}
|
||||
s = std::sqrt(p * p + q * q + r * r);
|
||||
if (p < 0) {
|
||||
s = -s;
|
||||
}
|
||||
if (s != 0) {
|
||||
if (k != m) {
|
||||
H(k, k-1) = -s * x;
|
||||
} else if (l != m) {
|
||||
H(k, k-1) = -H(k, k-1);
|
||||
}
|
||||
p = p + s;
|
||||
x = p / s;
|
||||
y = q / s;
|
||||
z = r / s;
|
||||
q = q / p;
|
||||
r = r / p;
|
||||
|
||||
// Row modification
|
||||
|
||||
for (int j = k; j < nn; j++) {
|
||||
p = H(k, j) + q * H(k+1, j);
|
||||
if (notlast) {
|
||||
p = p + r * H(k+2, j);
|
||||
H(k+2, j) = H(k+2, j) - p * z;
|
||||
}
|
||||
H(k, j) = H(k, j) - p * x;
|
||||
H(k+1, j) = H(k+1, j) - p * y;
|
||||
}
|
||||
|
||||
// Column modification
|
||||
|
||||
for (int i = 0; i <= std::min(n, k+3); i++) {
|
||||
p = x * H(i, k) + y * H(i, k+1);
|
||||
if (notlast) {
|
||||
p = p + z * H(i, k+2);
|
||||
H(i, k+2) = H(i, k+2) - p * r;
|
||||
}
|
||||
H(i, k) = H(i, k) - p;
|
||||
H(i, k+1) = H(i, k+1) - p * q;
|
||||
}
|
||||
|
||||
// Accumulate transformations
|
||||
|
||||
for (int i = low; i <= high; i++) {
|
||||
p = x * V(i, k) + y * V(i, k+1);
|
||||
if (notlast) {
|
||||
p = p + z * V(i, k+2);
|
||||
V(i, k+2) = V(i, k+2) - p * r;
|
||||
}
|
||||
V(i, k) = V(i, k) - p;
|
||||
V(i, k+1) = V(i, k+1) - p * q;
|
||||
}
|
||||
} // (s != 0)
|
||||
} // k loop
|
||||
} // check convergence
|
||||
} // while (n >= low)
|
||||
|
||||
// Backsubstitute to find vectors of upper triangular form
|
||||
|
||||
if (norm == 0.0) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (n = nn-1; n >= 0; n--) {
|
||||
p = d_[n];
|
||||
q = e_[n];
|
||||
|
||||
// Real vector
|
||||
|
||||
if (q == 0) {
|
||||
int l = n;
|
||||
H(n, n) = 1.0;
|
||||
for (int i = n-1; i >= 0; i--) {
|
||||
w = H(i, i) - p;
|
||||
r = 0.0;
|
||||
for (int j = l; j <= n; j++) {
|
||||
r = r + H(i, j) * H(j, n);
|
||||
}
|
||||
if (e_[i] < 0.0) {
|
||||
z = w;
|
||||
s = r;
|
||||
} else {
|
||||
l = i;
|
||||
if (e_[i] == 0.0) {
|
||||
if (w != 0.0) {
|
||||
H(i, n) = -r / w;
|
||||
} else {
|
||||
H(i, n) = -r / (eps * norm);
|
||||
}
|
||||
|
||||
// Solve real equations
|
||||
|
||||
} else {
|
||||
x = H(i, i+1);
|
||||
y = H(i+1, i);
|
||||
q = (d_[i] - p) * (d_[i] - p) +e_[i] *e_[i];
|
||||
t = (x * s - z * r) / q;
|
||||
H(i, n) = t;
|
||||
if (std::abs(x) > std::abs(z)) {
|
||||
H(i+1, n) = (-r - w * t) / x;
|
||||
} else {
|
||||
H(i+1, n) = (-s - y * t) / z;
|
||||
}
|
||||
}
|
||||
|
||||
// Overflow control
|
||||
|
||||
t = std::abs(H(i, n));
|
||||
if ((eps * t) * t > 1) {
|
||||
for (int j = i; j <= n; j++) {
|
||||
H(j, n) = H(j, n) / t;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Complex vector
|
||||
|
||||
} else if (q < 0) {
|
||||
int l = n-1;
|
||||
|
||||
// Last vector component imaginary so matrix is triangular
|
||||
|
||||
if (std::abs(H(n, n-1)) > std::abs(H(n-1, n))) {
|
||||
H(n-1, n-1) = q / H(n, n-1);
|
||||
H(n-1, n) = -(H(n, n) - p) / H(n, n-1);
|
||||
} else {
|
||||
Real cdivr, cdivi;
|
||||
cdiv(0.0, -H(n-1, n), H(n-1, n-1)-p, q, &cdivr, &cdivi);
|
||||
H(n-1, n-1) = cdivr;
|
||||
H(n-1, n) = cdivi;
|
||||
}
|
||||
H(n, n-1) = 0.0;
|
||||
H(n, n) = 1.0;
|
||||
for (int i = n-2; i >= 0; i--) {
|
||||
Real ra, sa, vr, vi;
|
||||
ra = 0.0;
|
||||
sa = 0.0;
|
||||
for (int j = l; j <= n; j++) {
|
||||
ra = ra + H(i, j) * H(j, n-1);
|
||||
sa = sa + H(i, j) * H(j, n);
|
||||
}
|
||||
w = H(i, i) - p;
|
||||
|
||||
if (e_[i] < 0.0) {
|
||||
z = w;
|
||||
r = ra;
|
||||
s = sa;
|
||||
} else {
|
||||
l = i;
|
||||
if (e_[i] == 0) {
|
||||
Real cdivr, cdivi;
|
||||
cdiv(-ra, -sa, w, q, &cdivr, &cdivi);
|
||||
H(i, n-1) = cdivr;
|
||||
H(i, n) = cdivi;
|
||||
} else {
|
||||
Real cdivr, cdivi;
|
||||
// Solve complex equations
|
||||
|
||||
x = H(i, i+1);
|
||||
y = H(i+1, i);
|
||||
vr = (d_[i] - p) * (d_[i] - p) +e_[i] *e_[i] - q * q;
|
||||
vi = (d_[i] - p) * 2.0 * q;
|
||||
if (vr == 0.0 && vi == 0.0) {
|
||||
vr = eps * norm * (std::abs(w) + std::abs(q) +
|
||||
std::abs(x) + std::abs(y) + std::abs(z));
|
||||
}
|
||||
cdiv(x*r-z*ra+q*sa, x*s-z*sa-q*ra, vr, vi, &cdivr, &cdivi);
|
||||
H(i, n-1) = cdivr;
|
||||
H(i, n) = cdivi;
|
||||
if (std::abs(x) > (std::abs(z) + std::abs(q))) {
|
||||
H(i+1, n-1) = (-ra - w * H(i, n-1) + q * H(i, n)) / x;
|
||||
H(i+1, n) = (-sa - w * H(i, n) - q * H(i, n-1)) / x;
|
||||
} else {
|
||||
cdiv(-r-y*H(i, n-1), -s-y*H(i, n), z, q, &cdivr, &cdivi);
|
||||
H(i+1, n-1) = cdivr;
|
||||
H(i+1, n) = cdivi;
|
||||
}
|
||||
}
|
||||
|
||||
// Overflow control
|
||||
|
||||
t = std::max(std::abs(H(i, n-1)), std::abs(H(i, n)));
|
||||
if ((eps * t) * t > 1) {
|
||||
for (int j = i; j <= n; j++) {
|
||||
H(j, n-1) = H(j, n-1) / t;
|
||||
H(j, n) = H(j, n) / t;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Vectors of isolated roots
|
||||
|
||||
for (int i = 0; i < nn; i++) {
|
||||
if (i < low || i > high) {
|
||||
for (int j = i; j < nn; j++) {
|
||||
V(i, j) = H(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Back transformation to get eigenvectors of original matrix
|
||||
|
||||
for (int j = nn-1; j >= low; j--) {
|
||||
for (int i = low; i <= high; i++) {
|
||||
z = 0.0;
|
||||
for (int k = low; k <= std::min(j, high); k++) {
|
||||
z = z + V(i, k) * H(k, j);
|
||||
}
|
||||
V(i, j) = z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A) {
|
||||
KALDI_ASSERT(A.NumCols() == A.NumRows() && A.NumCols() >= 1);
|
||||
n_ = A.NumRows();
|
||||
V_ = new Real[n_*n_];
|
||||
d_ = new Real[n_];
|
||||
e_ = new Real[n_];
|
||||
H_ = NULL;
|
||||
ort_ = NULL;
|
||||
if (A.IsSymmetric(0.0)) {
|
||||
|
||||
for (int i = 0; i < n_; i++)
|
||||
for (int j = 0; j < n_; j++)
|
||||
V(i, j) = A(i, j); // Note that V(i, j) is a member function; A(i, j) is an operator
|
||||
// of the matrix A.
|
||||
// Tridiagonalize.
|
||||
Tred2();
|
||||
|
||||
// Diagonalize.
|
||||
Tql2();
|
||||
} else {
|
||||
H_ = new Real[n_*n_];
|
||||
ort_ = new Real[n_];
|
||||
for (int i = 0; i < n_; i++)
|
||||
for (int j = 0; j < n_; j++)
|
||||
H(i, j) = A(i, j); // as before: H is member function, A(i, j) is operator of matrix.
|
||||
|
||||
// Reduce to Hessenberg form.
|
||||
Orthes();
|
||||
|
||||
// Reduce Hessenberg to real Schur form.
|
||||
Hqr2();
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
EigenvalueDecomposition<Real>::~EigenvalueDecomposition() {
|
||||
delete [] d_;
|
||||
delete [] e_;
|
||||
delete [] V_;
|
||||
delete [] H_;
|
||||
delete [] ort_;
|
||||
}
|
||||
|
||||
// see function MatrixBase<Real>::Eig in kaldi-matrix.cc
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
#endif // KALDI_MATRIX_JAMA_EIG_H_
|
@ -1,531 +0,0 @@
|
||||
// matrix/jama-svd.h
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// This file consists of a port and modification of materials from
|
||||
// JAMA: A Java Matrix Package
|
||||
// under the following notice: This software is a cooperative product of
|
||||
// The MathWorks and the National Institute of Standards and Technology (NIST)
|
||||
// which has been released to the public. This notice and the original code are
|
||||
// available at http://math.nist.gov/javanumerics/jama/domain.notice
|
||||
|
||||
|
||||
#ifndef KALDI_MATRIX_JAMA_SVD_H_
|
||||
#define KALDI_MATRIX_JAMA_SVD_H_ 1
|
||||
|
||||
|
||||
#include "matrix/kaldi-matrix.h"
|
||||
#include "matrix/sp-matrix.h"
|
||||
#include "matrix/cblas-wrappers.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
#if defined(HAVE_ATLAS) || defined(USE_KALDI_SVD)
|
||||
// using ATLAS as our math library, which doesn't have SVD -> need
|
||||
// to implement it.
|
||||
|
||||
// This routine is a modified form of jama_svd.h which is part of the TNT distribution.
|
||||
// (originally comes from JAMA).
|
||||
|
||||
/** Singular Value Decomposition.
|
||||
* <P>
|
||||
* For an m-by-n matrix A with m >= n, the singular value decomposition is
|
||||
* an m-by-n orthogonal matrix U, an n-by-n diagonal matrix S, and
|
||||
* an n-by-n orthogonal matrix V so that A = U*S*V'.
|
||||
* <P>
|
||||
* The singular values, sigma[k] = S(k, k), are ordered so that
|
||||
* sigma[0] >= sigma[1] >= ... >= sigma[n-1].
|
||||
* <P>
|
||||
* The singular value decomposition always exists, so the constructor will
|
||||
* never fail. The matrix condition number and the effective numerical
|
||||
* rank can be computed from this decomposition.
|
||||
|
||||
* <p>
|
||||
* (Adapted from JAMA, a Java Matrix Library, developed jointly
|
||||
* by the Mathworks and NIST; see http://math.nist.gov/javanumerics/jama).
|
||||
*/
|
||||
|
||||
|
||||
template<typename Real>
|
||||
bool MatrixBase<Real>::JamaSvd(VectorBase<Real> *s_in,
|
||||
MatrixBase<Real> *U_in,
|
||||
MatrixBase<Real> *V_in) { // Destructive!
|
||||
KALDI_ASSERT(s_in != NULL && U_in != this && V_in != this);
|
||||
int wantu = (U_in != NULL), wantv = (V_in != NULL);
|
||||
Matrix<Real> Utmp, Vtmp;
|
||||
MatrixBase<Real> &U = (U_in ? *U_in : Utmp), &V = (V_in ? *V_in : Vtmp);
|
||||
VectorBase<Real> &s = *s_in;
|
||||
|
||||
int m = num_rows_, n = num_cols_;
|
||||
KALDI_ASSERT(m>=n && m != 0 && n != 0);
|
||||
if (wantu) KALDI_ASSERT((int)U.num_rows_ == m && (int)U.num_cols_ == n);
|
||||
if (wantv) KALDI_ASSERT((int)V.num_rows_ == n && (int)V.num_cols_ == n);
|
||||
KALDI_ASSERT((int)s.Dim() == n); // n<=m so n is min.
|
||||
|
||||
int nu = n;
|
||||
U.SetZero(); // make sure all zero.
|
||||
Vector<Real> e(n);
|
||||
Vector<Real> work(m);
|
||||
MatrixBase<Real> &A(*this);
|
||||
Real *adata = A.Data(), *workdata = work.Data(), *edata = e.Data(),
|
||||
*udata = U.Data(), *vdata = V.Data();
|
||||
int astride = static_cast<int>(A.Stride()),
|
||||
ustride = static_cast<int>(U.Stride()),
|
||||
vstride = static_cast<int>(V.Stride());
|
||||
int i = 0, j = 0, k = 0;
|
||||
|
||||
// Reduce A to bidiagonal form, storing the diagonal elements
|
||||
// in s and the super-diagonal elements in e.
|
||||
|
||||
int nct = std::min(m-1, n);
|
||||
int nrt = std::max(0, std::min(n-2, m));
|
||||
for (k = 0; k < std::max(nct, nrt); k++) {
|
||||
if (k < nct) {
|
||||
|
||||
// Compute the transformation for the k-th column and
|
||||
// place the k-th diagonal in s(k).
|
||||
// Compute 2-norm of k-th column without under/overflow.
|
||||
s(k) = 0;
|
||||
for (i = k; i < m; i++) {
|
||||
s(k) = hypot(s(k), A(i, k));
|
||||
}
|
||||
if (s(k) != 0.0) {
|
||||
if (A(k, k) < 0.0) {
|
||||
s(k) = -s(k);
|
||||
}
|
||||
for (i = k; i < m; i++) {
|
||||
A(i, k) /= s(k);
|
||||
}
|
||||
A(k, k) += 1.0;
|
||||
}
|
||||
s(k) = -s(k);
|
||||
}
|
||||
for (j = k+1; j < n; j++) {
|
||||
if ((k < nct) && (s(k) != 0.0)) {
|
||||
|
||||
// Apply the transformation.
|
||||
|
||||
Real t = cblas_Xdot(m - k, adata + astride*k + k, astride,
|
||||
adata + astride*k + j, astride);
|
||||
/*for (i = k; i < m; i++) {
|
||||
t += adata[i*astride + k]*adata[i*astride + j]; // A(i, k)*A(i, j); // 3
|
||||
}*/
|
||||
t = -t/A(k, k);
|
||||
cblas_Xaxpy(m - k, t, adata + k*astride + k, astride,
|
||||
adata + k*astride + j, astride);
|
||||
/*for (i = k; i < m; i++) {
|
||||
adata[i*astride + j] += t*adata[i*astride + k]; // A(i, j) += t*A(i, k); // 5
|
||||
}*/
|
||||
}
|
||||
|
||||
// Place the k-th row of A into e for the
|
||||
// subsequent calculation of the row transformation.
|
||||
|
||||
e(j) = A(k, j);
|
||||
}
|
||||
if (wantu & (k < nct)) {
|
||||
|
||||
// Place the transformation in U for subsequent back
|
||||
// multiplication.
|
||||
|
||||
for (i = k; i < m; i++) {
|
||||
U(i, k) = A(i, k);
|
||||
}
|
||||
}
|
||||
if (k < nrt) {
|
||||
|
||||
// Compute the k-th row transformation and place the
|
||||
// k-th super-diagonal in e(k).
|
||||
// Compute 2-norm without under/overflow.
|
||||
e(k) = 0;
|
||||
for (i = k+1; i < n; i++) {
|
||||
e(k) = hypot(e(k), e(i));
|
||||
}
|
||||
if (e(k) != 0.0) {
|
||||
if (e(k+1) < 0.0) {
|
||||
e(k) = -e(k);
|
||||
}
|
||||
for (i = k+1; i < n; i++) {
|
||||
e(i) /= e(k);
|
||||
}
|
||||
e(k+1) += 1.0;
|
||||
}
|
||||
e(k) = -e(k);
|
||||
if ((k+1 < m) & (e(k) != 0.0)) {
|
||||
|
||||
// Apply the transformation.
|
||||
|
||||
for (i = k+1; i < m; i++) {
|
||||
work(i) = 0.0;
|
||||
}
|
||||
for (j = k+1; j < n; j++) {
|
||||
for (i = k+1; i < m; i++) {
|
||||
workdata[i] += edata[j] * adata[i*astride + j]; // work(i) += e(j)*A(i, j); // 5
|
||||
}
|
||||
}
|
||||
for (j = k+1; j < n; j++) {
|
||||
Real t(-e(j)/e(k+1));
|
||||
cblas_Xaxpy(m - (k+1), t, workdata + (k+1), 1,
|
||||
adata + (k+1)*astride + j, astride);
|
||||
/*
|
||||
for (i = k+1; i < m; i++) {
|
||||
adata[i*astride + j] += t*workdata[i]; // A(i, j) += t*work(i); // 5
|
||||
}*/
|
||||
}
|
||||
}
|
||||
if (wantv) {
|
||||
|
||||
// Place the transformation in V for subsequent
|
||||
// back multiplication.
|
||||
|
||||
for (i = k+1; i < n; i++) {
|
||||
V(i, k) = e(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set up the final bidiagonal matrix or order p.
|
||||
|
||||
int p = std::min(n, m+1);
|
||||
if (nct < n) {
|
||||
s(nct) = A(nct, nct);
|
||||
}
|
||||
if (m < p) {
|
||||
s(p-1) = 0.0;
|
||||
}
|
||||
if (nrt+1 < p) {
|
||||
e(nrt) = A(nrt, p-1);
|
||||
}
|
||||
e(p-1) = 0.0;
|
||||
|
||||
// If required, generate U.
|
||||
|
||||
if (wantu) {
|
||||
for (j = nct; j < nu; j++) {
|
||||
for (i = 0; i < m; i++) {
|
||||
U(i, j) = 0.0;
|
||||
}
|
||||
U(j, j) = 1.0;
|
||||
}
|
||||
for (k = nct-1; k >= 0; k--) {
|
||||
if (s(k) != 0.0) {
|
||||
for (j = k+1; j < nu; j++) {
|
||||
Real t = cblas_Xdot(m - k, udata + k*ustride + k, ustride, udata + k*ustride + j, ustride);
|
||||
//for (i = k; i < m; i++) {
|
||||
// t += udata[i*ustride + k]*udata[i*ustride + j]; // t += U(i, k)*U(i, j); // 8
|
||||
// }
|
||||
t = -t/U(k, k);
|
||||
cblas_Xaxpy(m - k, t, udata + ustride*k + k, ustride,
|
||||
udata + k*ustride + j, ustride);
|
||||
/*for (i = k; i < m; i++) {
|
||||
udata[i*ustride + j] += t*udata[i*ustride + k]; // U(i, j) += t*U(i, k); // 4
|
||||
}*/
|
||||
}
|
||||
for (i = k; i < m; i++ ) {
|
||||
U(i, k) = -U(i, k);
|
||||
}
|
||||
U(k, k) = 1.0 + U(k, k);
|
||||
for (i = 0; i < k-1; i++) {
|
||||
U(i, k) = 0.0;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < m; i++) {
|
||||
U(i, k) = 0.0;
|
||||
}
|
||||
U(k, k) = 1.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If required, generate V.
|
||||
|
||||
if (wantv) {
|
||||
for (k = n-1; k >= 0; k--) {
|
||||
if ((k < nrt) & (e(k) != 0.0)) {
|
||||
for (j = k+1; j < nu; j++) {
|
||||
Real t = cblas_Xdot(n - (k+1), vdata + (k+1)*vstride + k, vstride,
|
||||
vdata + (k+1)*vstride + j, vstride);
|
||||
/*Real t (0.0);
|
||||
for (i = k+1; i < n; i++) {
|
||||
t += vdata[i*vstride + k]*vdata[i*vstride + j]; // t += V(i, k)*V(i, j); // 7
|
||||
}*/
|
||||
t = -t/V(k+1, k);
|
||||
cblas_Xaxpy(n - (k+1), t, vdata + (k+1)*vstride + k, vstride,
|
||||
vdata + (k+1)*vstride + j, vstride);
|
||||
/*for (i = k+1; i < n; i++) {
|
||||
vdata[i*vstride + j] += t*vdata[i*vstride + k]; // V(i, j) += t*V(i, k); // 7
|
||||
}*/
|
||||
}
|
||||
}
|
||||
for (i = 0; i < n; i++) {
|
||||
V(i, k) = 0.0;
|
||||
}
|
||||
V(k, k) = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
// Main iteration loop for the singular values.
|
||||
|
||||
int pp = p-1;
|
||||
int iter = 0;
|
||||
// note: -52.0 is from Jama code; the -23 is the extension
|
||||
// to float, because mantissa length in (double, float)
|
||||
// is (52, 23) bits respectively.
|
||||
Real eps(pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0));
|
||||
// Note: the -966 was taken from Jama code, but the -120 is a guess
|
||||
// of how to extend this to float... the exponent in double goes
|
||||
// from -1022 .. 1023, and in float from -126..127. I'm not sure
|
||||
// what the significance of 966 is, so -120 just represents a number
|
||||
// that's a bit less negative than -126. If we get convergence
|
||||
// failure in float only, this may mean that we have to make the
|
||||
// -120 value less negative.
|
||||
Real tiny(pow(2.0, sizeof(Real) == 4 ? -120.0: -966.0 ));
|
||||
|
||||
while (p > 0) {
|
||||
int k = 0;
|
||||
int kase = 0;
|
||||
|
||||
if (iter == 500 || iter == 750) {
|
||||
KALDI_WARN << "Svd taking a long time: making convergence criterion less exact.";
|
||||
eps = pow(static_cast<Real>(0.8), eps);
|
||||
tiny = pow(static_cast<Real>(0.8), tiny);
|
||||
}
|
||||
if (iter > 1000) {
|
||||
KALDI_WARN << "Svd not converging on matrix of size " << m << " by " <<n;
|
||||
return false;
|
||||
}
|
||||
|
||||
// This section of the program inspects for
|
||||
// negligible elements in the s and e arrays. On
|
||||
// completion the variables kase and k are set as follows.
|
||||
|
||||
// kase = 1 if s(p) and e(k-1) are negligible and k < p
|
||||
// kase = 2 if s(k) is negligible and k < p
|
||||
// kase = 3 if e(k-1) is negligible, k < p, and
|
||||
// s(k), ..., s(p) are not negligible (qr step).
|
||||
// kase = 4 if e(p-1) is negligible (convergence).
|
||||
|
||||
for (k = p-2; k >= -1; k--) {
|
||||
if (k == -1) {
|
||||
break;
|
||||
}
|
||||
if (std::abs(e(k)) <=
|
||||
tiny + eps*(std::abs(s(k)) + std::abs(s(k+1)))) {
|
||||
e(k) = 0.0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (k == p-2) {
|
||||
kase = 4;
|
||||
} else {
|
||||
int ks;
|
||||
for (ks = p-1; ks >= k; ks--) {
|
||||
if (ks == k) {
|
||||
break;
|
||||
}
|
||||
Real t( (ks != p ? std::abs(e(ks)) : 0.) +
|
||||
(ks != k+1 ? std::abs(e(ks-1)) : 0.));
|
||||
if (std::abs(s(ks)) <= tiny + eps*t) {
|
||||
s(ks) = 0.0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ks == k) {
|
||||
kase = 3;
|
||||
} else if (ks == p-1) {
|
||||
kase = 1;
|
||||
} else {
|
||||
kase = 2;
|
||||
k = ks;
|
||||
}
|
||||
}
|
||||
k++;
|
||||
|
||||
// Perform the task indicated by kase.
|
||||
|
||||
switch (kase) {
|
||||
|
||||
// Deflate negligible s(p).
|
||||
|
||||
case 1: {
|
||||
Real f(e(p-2));
|
||||
e(p-2) = 0.0;
|
||||
for (j = p-2; j >= k; j--) {
|
||||
Real t( hypot(s(j), f));
|
||||
Real cs(s(j)/t);
|
||||
Real sn(f/t);
|
||||
s(j) = t;
|
||||
if (j != k) {
|
||||
f = -sn*e(j-1);
|
||||
e(j-1) = cs*e(j-1);
|
||||
}
|
||||
if (wantv) {
|
||||
for (i = 0; i < n; i++) {
|
||||
t = cs*V(i, j) + sn*V(i, p-1);
|
||||
V(i, p-1) = -sn*V(i, j) + cs*V(i, p-1);
|
||||
V(i, j) = t;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
// Split at negligible s(k).
|
||||
|
||||
case 2: {
|
||||
Real f(e(k-1));
|
||||
e(k-1) = 0.0;
|
||||
for (j = k; j < p; j++) {
|
||||
Real t(hypot(s(j), f));
|
||||
Real cs( s(j)/t);
|
||||
Real sn(f/t);
|
||||
s(j) = t;
|
||||
f = -sn*e(j);
|
||||
e(j) = cs*e(j);
|
||||
if (wantu) {
|
||||
for (i = 0; i < m; i++) {
|
||||
t = cs*U(i, j) + sn*U(i, k-1);
|
||||
U(i, k-1) = -sn*U(i, j) + cs*U(i, k-1);
|
||||
U(i, j) = t;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
// Perform one qr step.
|
||||
|
||||
case 3: {
|
||||
|
||||
// Calculate the shift.
|
||||
|
||||
Real scale = std::max(std::max(std::max(std::max(
|
||||
std::abs(s(p-1)), std::abs(s(p-2))), std::abs(e(p-2))),
|
||||
std::abs(s(k))), std::abs(e(k)));
|
||||
Real sp = s(p-1)/scale;
|
||||
Real spm1 = s(p-2)/scale;
|
||||
Real epm1 = e(p-2)/scale;
|
||||
Real sk = s(k)/scale;
|
||||
Real ek = e(k)/scale;
|
||||
Real b = ((spm1 + sp)*(spm1 - sp) + epm1*epm1)/2.0;
|
||||
Real c = (sp*epm1)*(sp*epm1);
|
||||
Real shift = 0.0;
|
||||
if ((b != 0.0) || (c != 0.0)) {
|
||||
shift = std::sqrt(b*b + c);
|
||||
if (b < 0.0) {
|
||||
shift = -shift;
|
||||
}
|
||||
shift = c/(b + shift);
|
||||
}
|
||||
Real f = (sk + sp)*(sk - sp) + shift;
|
||||
Real g = sk*ek;
|
||||
|
||||
// Chase zeros.
|
||||
|
||||
for (j = k; j < p-1; j++) {
|
||||
Real t = hypot(f, g);
|
||||
Real cs = f/t;
|
||||
Real sn = g/t;
|
||||
if (j != k) {
|
||||
e(j-1) = t;
|
||||
}
|
||||
f = cs*s(j) + sn*e(j);
|
||||
e(j) = cs*e(j) - sn*s(j);
|
||||
g = sn*s(j+1);
|
||||
s(j+1) = cs*s(j+1);
|
||||
if (wantv) {
|
||||
cblas_Xrot(n, vdata + j, vstride, vdata + j+1, vstride, cs, sn);
|
||||
/*for (i = 0; i < n; i++) {
|
||||
t = cs*vdata[i*vstride + j] + sn*vdata[i*vstride + j+1]; // t = cs*V(i, j) + sn*V(i, j+1); // 13
|
||||
vdata[i*vstride + j+1] = -sn*vdata[i*vstride + j] + cs*vdata[i*vstride + j+1]; // V(i, j+1) = -sn*V(i, j) + cs*V(i, j+1); // 5
|
||||
vdata[i*vstride + j] = t; // V(i, j) = t; // 4
|
||||
}*/
|
||||
}
|
||||
t = hypot(f, g);
|
||||
cs = f/t;
|
||||
sn = g/t;
|
||||
s(j) = t;
|
||||
f = cs*e(j) + sn*s(j+1);
|
||||
s(j+1) = -sn*e(j) + cs*s(j+1);
|
||||
g = sn*e(j+1);
|
||||
e(j+1) = cs*e(j+1);
|
||||
if (wantu && (j < m-1)) {
|
||||
cblas_Xrot(m, udata + j, ustride, udata + j+1, ustride, cs, sn);
|
||||
/*for (i = 0; i < m; i++) {
|
||||
t = cs*udata[i*ustride + j] + sn*udata[i*ustride + j+1]; // t = cs*U(i, j) + sn*U(i, j+1); // 7
|
||||
udata[i*ustride + j+1] = -sn*udata[i*ustride + j] +cs*udata[i*ustride + j+1]; // U(i, j+1) = -sn*U(i, j) + cs*U(i, j+1); // 8
|
||||
udata[i*ustride + j] = t; // U(i, j) = t; // 1
|
||||
}*/
|
||||
}
|
||||
}
|
||||
e(p-2) = f;
|
||||
iter = iter + 1;
|
||||
}
|
||||
break;
|
||||
|
||||
// Convergence.
|
||||
|
||||
case 4: {
|
||||
|
||||
// Make the singular values positive.
|
||||
|
||||
if (s(k) <= 0.0) {
|
||||
s(k) = (s(k) < 0.0 ? -s(k) : 0.0);
|
||||
if (wantv) {
|
||||
for (i = 0; i <= pp; i++) {
|
||||
V(i, k) = -V(i, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Order the singular values.
|
||||
|
||||
while (k < pp) {
|
||||
if (s(k) >= s(k+1)) {
|
||||
break;
|
||||
}
|
||||
Real t = s(k);
|
||||
s(k) = s(k+1);
|
||||
s(k+1) = t;
|
||||
if (wantv && (k < n-1)) {
|
||||
for (i = 0; i < n; i++) {
|
||||
t = V(i, k+1); V(i, k+1) = V(i, k); V(i, k) = t;
|
||||
}
|
||||
}
|
||||
if (wantu && (k < m-1)) {
|
||||
for (i = 0; i < m; i++) {
|
||||
t = U(i, k+1); U(i, k+1) = U(i, k); U(i, k) = t;
|
||||
}
|
||||
}
|
||||
k++;
|
||||
}
|
||||
iter = 0;
|
||||
p--;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif // defined(HAVE_ATLAS) || defined(USE_KALDI_SVD)
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
#endif // KALDI_MATRIX_JAMA_SVD_H_
|
@ -1,139 +0,0 @@
|
||||
// matrix/kaldi-blas.h
|
||||
|
||||
// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#ifndef KALDI_MATRIX_KALDI_BLAS_H_
#define KALDI_MATRIX_KALDI_BLAS_H_

// This file handles the #includes for BLAS, LAPACK and so on.
// It manipulates the declarations into a common format that kaldi can handle.
// However, the kaldi code will check whether HAVE_ATLAS is defined as that
// code is called a bit differently from CLAPACK that comes from other sources.

// There are four alternatives:
//   (i) you have ATLAS, which includes the ATLAS implementation of CBLAS
//   plus a subset of CLAPACK (but with clapack_ in the function declarations).
//   In this case, define HAVE_ATLAS and make sure the relevant directories are
//   in the include path.

//   (ii) you have CBLAS (some implementation thereof) plus CLAPACK.
//   In this case, define HAVE_CLAPACK.
//   [Since CLAPACK depends on BLAS, the presence of BLAS is implicit].

//   (iii) you have the MKL library, which includes CLAPACK and CBLAS.
//   In this case, define HAVE_MKL.

//   (iv) you have OpenBLAS (cblas.h plus lapacke.h).  In this case, define
//   HAVE_OPENBLAS; this is also the default when nothing else is selected.

// Note that if we are using ATLAS, no Svd implementation is supplied,
// so we define HAVE_Svd to be zero and this directs our implementation to
// supply its own "by hand" implementation which is based on TNT code.


// Fall back to OpenBLAS only when the build did not pick a backend.
// Defining it unconditionally would leave two backends enabled whenever
// HAVE_ATLAS / HAVE_CLAPACK / HAVE_MKL is set (giving duplicate, possibly
// conflicting KaldiBlasInt typedefs below) and would redefine the macro
// when the build already passes -DHAVE_OPENBLAS.
#if !defined(HAVE_ATLAS) && !defined(HAVE_CLAPACK) && \
    !defined(HAVE_MKL) && !defined(HAVE_OPENBLAS)
#define HAVE_OPENBLAS
#endif

#if (defined(HAVE_CLAPACK) && (defined(HAVE_ATLAS) || defined(HAVE_MKL))) \
    || (defined(HAVE_ATLAS) && defined(HAVE_MKL))
#error "Do not define more than one of HAVE_CLAPACK, HAVE_ATLAS and HAVE_MKL"
#endif

#ifdef HAVE_ATLAS
  extern "C" {
    #include "cblas.h"
    #include "clapack.h"
  }
#elif defined(HAVE_CLAPACK)
  #ifdef __APPLE__
    #ifndef __has_extension
    #define __has_extension(x) 0
    #endif
    // Pre-define the vImage include guards so that Accelerate.h does not
    // pull those headers in.
    #define vImage_Utilities_h
    #define vImage_CVUtilities_h
    #include <Accelerate/Accelerate.h>
    typedef __CLPK_integer          integer;
    typedef __CLPK_logical          logical;
    typedef __CLPK_real             real;
    typedef __CLPK_doublereal       doublereal;
    typedef __CLPK_complex          complex;
    typedef __CLPK_doublecomplex    doublecomplex;
    typedef __CLPK_ftnlen           ftnlen;
  #else
    extern "C" {
      // May be in /usr/[local]/include if installed; else this uses the one
      // from the tools/CLAPACK_include directory.
      #include <cblas.h>
      #include <f2c.h>
      #include <clapack.h>

      // get rid of macros from f2c.h -- these are dangerous.
      #undef abs
      #undef dabs
      #undef min
      #undef max
      #undef dmin
      #undef dmax
      #undef bit_test
      #undef bit_clear
      #undef bit_set
    }
  #endif
#elif defined(HAVE_MKL)
  extern "C" {
    #include <mkl.h>
  }
#elif defined(HAVE_OPENBLAS)
  // getting cblas.h and lapacke.h from <openblas-install-dir>/.
  // putting in "" not <> to search -I before system libraries.
  #if defined(_MSC_VER)
    #include <complex.h>
    #define LAPACK_COMPLEX_CUSTOM
    #define lapack_complex_float _Fcomplex
    #define lapack_complex_double _Dcomplex
  #endif
  #include "cblas.h"
  #include "lapacke.h"
  #undef I
  #undef complex
  // get rid of macros from f2c.h -- these are dangerous.
  #undef abs
  #undef dabs
  #undef min
  #undef max
  #undef dmin
  #undef dmax
  #undef bit_test
  #undef bit_clear
  #undef bit_set
#else
  #error "You need to define (using the preprocessor) either HAVE_CLAPACK or HAVE_ATLAS or HAVE_MKL (but not more than one)"
#endif

// Integer type used when calling into the selected LAPACK implementation.
#ifdef HAVE_OPENBLAS
typedef int KaldiBlasInt;  // try int.
#endif
#ifdef HAVE_CLAPACK
typedef integer KaldiBlasInt;
#endif
#ifdef HAVE_MKL
typedef MKL_INT KaldiBlasInt;
#endif

#ifdef HAVE_ATLAS
// in this case there is no need for KaldiBlasInt-- this typedef is only needed
// for Svd code which is not included in ATLAS (we re-implement it).
#endif


#endif  // KALDI_MATRIX_KALDI_BLAS_H_
|
@ -1,612 +0,0 @@
|
||||
// matrix/kaldi-vector.h
|
||||
|
||||
// Copyright 2009-2012 Ondrej Glembek; Microsoft Corporation; Lukas Burget;
|
||||
// Saarland University (Author: Arnab Ghoshal);
|
||||
// Ariya Rastrow; Petr Schwarz; Yanmin Qian;
|
||||
// Karel Vesely; Go Vivace Inc.; Arnab Ghoshal
|
||||
// Wei Shi;
|
||||
// 2015 Guoguo Chen
|
||||
// 2017 Daniel Galvez
|
||||
// 2019 Yiwen Shao
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_MATRIX_KALDI_VECTOR_H_
|
||||
#define KALDI_MATRIX_KALDI_VECTOR_H_ 1
|
||||
|
||||
#include "matrix/matrix-common.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/// \addtogroup matrix_group
|
||||
/// @{
|
||||
|
||||
/// Provides a vector abstraction class.
|
||||
/// This class provides a way to work with vectors in kaldi.
|
||||
/// It encapsulates basic operations and memory optimizations.
|
||||
template<typename Real>
|
||||
class VectorBase {
|
||||
public:
|
||||
/// Set vector to all zeros.
|
||||
void SetZero();
|
||||
|
||||
/// Returns true if vector is all zeros.
|
||||
bool IsZero(Real cutoff = 1.0e-06) const; // replace magic number
|
||||
|
||||
/// Set all members of a vector to a specified value.
|
||||
void Set(Real f);
|
||||
|
||||
/// Set vector to random normally-distributed noise.
|
||||
void SetRandn();
|
||||
|
||||
/// Sets to numbers uniformly distributed on (0,1)
|
||||
void SetRandUniform();
|
||||
|
||||
/// This function returns a random index into this vector,
|
||||
/// chosen with probability proportional to the corresponding
|
||||
/// element. Requires that this->Min() >= 0 and this->Sum() > 0.
|
||||
MatrixIndexT RandCategorical() const;
|
||||
|
||||
/// Returns the dimension of the vector.
|
||||
inline MatrixIndexT Dim() const { return dim_; }
|
||||
|
||||
/// Returns the size in memory of the vector, in bytes.
|
||||
inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }
|
||||
|
||||
/// Returns a pointer to the start of the vector's data.
|
||||
inline Real* Data() { return data_; }
|
||||
|
||||
/// Returns a pointer to the start of the vector's data (const).
|
||||
inline const Real* Data() const { return data_; }
|
||||
|
||||
/// Indexing operator (const).
|
||||
inline Real operator() (MatrixIndexT i) const {
|
||||
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
|
||||
static_cast<UnsignedMatrixIndexT>(dim_));
|
||||
return *(data_ + i);
|
||||
}
|
||||
|
||||
/// Indexing operator (non-const).
|
||||
inline Real & operator() (MatrixIndexT i) {
|
||||
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
|
||||
static_cast<UnsignedMatrixIndexT>(dim_));
|
||||
return *(data_ + i);
|
||||
}
|
||||
|
||||
/** @brief Returns a sub-vector of a vector (a range of elements).
|
||||
* @param o [in] Origin, 0 < o < Dim()
|
||||
* @param l [in] Length 0 < l < Dim()-o
|
||||
* @return A SubVector object that aliases the data of the Vector object.
|
||||
* See @c SubVector class for details */
|
||||
SubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
|
||||
return SubVector<Real>(*this, o, l);
|
||||
}
|
||||
|
||||
/** @brief Returns a const sub-vector of a vector (a range of elements).
|
||||
* @param o [in] Origin, 0 < o < Dim()
|
||||
* @param l [in] Length 0 < l < Dim()-o
|
||||
* @return A SubVector object that aliases the data of the Vector object.
|
||||
* See @c SubVector class for details */
|
||||
const SubVector<Real> Range(const MatrixIndexT o,
|
||||
const MatrixIndexT l) const {
|
||||
return SubVector<Real>(*this, o, l);
|
||||
}
|
||||
|
||||
/// Copy data from another vector (must match own size).
|
||||
void CopyFromVec(const VectorBase<Real> &v);
|
||||
|
||||
/// Copy data from a SpMatrix or TpMatrix (must match own size).
|
||||
template<typename OtherReal>
|
||||
void CopyFromPacked(const PackedMatrix<OtherReal> &M);
|
||||
|
||||
/// Copy data from another vector of different type (double vs. float)
|
||||
template<typename OtherReal>
|
||||
void CopyFromVec(const VectorBase<OtherReal> &v);
|
||||
|
||||
/// Copy from CuVector. This is defined in ../cudamatrix/cu-vector.h
|
||||
template<typename OtherReal>
|
||||
void CopyFromVec(const CuVectorBase<OtherReal> &v);
|
||||
|
||||
/// Applies floor to all elements. Returns number of elements
|
||||
/// floored in floored_count if it is non-null.
|
||||
void Floor(const VectorBase<Real> &v, Real floor_val, MatrixIndexT *floored_count = nullptr);
|
||||
|
||||
/// Applies ceiling to all elements. Returns number of elements
|
||||
/// changed in ceiled_count if it is non-null.
|
||||
void Ceiling(const VectorBase<Real> &v, Real ceil_val, MatrixIndexT *ceiled_count = nullptr);
|
||||
|
||||
void Pow(const VectorBase<Real> &v, Real power);
|
||||
|
||||
/// Apply natural log to all elements. Throw if any element of
|
||||
/// the vector is negative (but doesn't complain about zero; the
|
||||
/// log will be -infinity
|
||||
void ApplyLog();
|
||||
|
||||
/// Apply natural log to another vector and put result in *this.
|
||||
void ApplyLogAndCopy(const VectorBase<Real> &v);
|
||||
|
||||
/// Apply exponential to each value in vector.
|
||||
void ApplyExp();
|
||||
|
||||
/// Take absolute value of each of the elements
|
||||
void ApplyAbs();
|
||||
|
||||
/// Applies floor to all elements. Returns number of elements
|
||||
/// floored in floored_count if it is non-null.
|
||||
inline void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = nullptr) {
|
||||
this->Floor(*this, floor_val, floored_count);
|
||||
};
|
||||
|
||||
/// Applies ceiling to all elements. Returns number of elements
|
||||
/// changed in ceiled_count if it is non-null.
|
||||
inline void ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count = nullptr) {
|
||||
this->Ceiling(*this, ceil_val, ceiled_count);
|
||||
};
|
||||
|
||||
/// Applies floor to all elements. Returns number of elements floored.
|
||||
MatrixIndexT ApplyFloor(const VectorBase<Real> &floor_vec);
|
||||
|
||||
/// Apply soft-max to vector and return normalizer (log sum of exponentials).
|
||||
/// This is the same as: \f$ x(i) = exp(x(i)) / \sum_i exp(x(i)) \f$
|
||||
Real ApplySoftMax();
|
||||
|
||||
/// Applies log soft-max to vector and returns normalizer (log sum of
|
||||
/// exponentials).
|
||||
/// This is the same as: \f$ x(i) = x(i) - log(\sum_i exp(x(i))) \f$
|
||||
Real ApplyLogSoftMax();
|
||||
|
||||
/// Sets each element of *this to the tanh of the corresponding element of "src".
|
||||
void Tanh(const VectorBase<Real> &src);
|
||||
|
||||
/// Sets each element of *this to the sigmoid function of the corresponding
|
||||
/// element of "src".
|
||||
void Sigmoid(const VectorBase<Real> &src);
|
||||
|
||||
/// Take all elements of vector to a power.
|
||||
inline void ApplyPow(Real power) {
|
||||
this->Pow(*this, power);
|
||||
};
|
||||
|
||||
/// Take the absolute value of all elements of a vector to a power.
|
||||
/// Include the sign of the input element if include_sign == true.
|
||||
/// If power is negative and the input value is zero, the output is set zero.
|
||||
void ApplyPowAbs(Real power, bool include_sign=false);
|
||||
|
||||
/// Compute the p-th norm of the vector.
|
||||
Real Norm(Real p) const;
|
||||
|
||||
/// Returns true if ((*this)-other).Norm(2.0) <= tol * (*this).Norm(2.0).
|
||||
bool ApproxEqual(const VectorBase<Real> &other, float tol = 0.01) const;
|
||||
|
||||
/// Invert all elements.
|
||||
void InvertElements();
|
||||
|
||||
/// Add vector : *this = *this + alpha * rv (with casting between floats and
|
||||
/// doubles)
|
||||
template<typename OtherReal>
|
||||
void AddVec(const Real alpha, const VectorBase<OtherReal> &v);
|
||||
|
||||
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring].
|
||||
void AddVec2(const Real alpha, const VectorBase<Real> &v);
|
||||
|
||||
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring],
|
||||
/// with casting between floats and doubles.
|
||||
template<typename OtherReal>
|
||||
void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
|
||||
|
||||
/// Add matrix times vector : this <-- beta*this + alpha*M*v.
|
||||
/// Calls BLAS GEMV.
|
||||
void AddMatVec(const Real alpha, const MatrixBase<Real> &M,
|
||||
const MatrixTransposeType trans, const VectorBase<Real> &v,
|
||||
const Real beta); // **beta previously defaulted to 0.0**
|
||||
|
||||
/// This is as AddMatVec, except optimized for where v contains a lot
|
||||
/// of zeros.
|
||||
void AddMatSvec(const Real alpha, const MatrixBase<Real> &M,
|
||||
const MatrixTransposeType trans, const VectorBase<Real> &v,
|
||||
const Real beta); // **beta previously defaulted to 0.0**
|
||||
|
||||
|
||||
/// Add symmetric positive definite matrix times vector:
|
||||
/// this <-- beta*this + alpha*M*v. Calls BLAS SPMV.
|
||||
void AddSpVec(const Real alpha, const SpMatrix<Real> &M,
|
||||
const VectorBase<Real> &v, const Real beta); // **beta previously defaulted to 0.0**
|
||||
|
||||
/// Add triangular matrix times vector: this <-- beta*this + alpha*M*v.
|
||||
/// Works even if rv == *this.
|
||||
void AddTpVec(const Real alpha, const TpMatrix<Real> &M,
|
||||
const MatrixTransposeType trans, const VectorBase<Real> &v,
|
||||
const Real beta); // **beta previously defaulted to 0.0**
|
||||
|
||||
/// Set each element to y = (x == orig ? changed : x).
|
||||
void ReplaceValue(Real orig, Real changed);
|
||||
|
||||
/// Multiply element-by-element by another vector.
|
||||
void MulElements(const VectorBase<Real> &v);
|
||||
/// Multiply element-by-element by another vector of different type.
|
||||
template<typename OtherReal>
|
||||
void MulElements(const VectorBase<OtherReal> &v);
|
||||
|
||||
/// Divide element-by-element by a vector.
|
||||
void DivElements(const VectorBase<Real> &v);
|
||||
/// Divide element-by-element by a vector of different type.
|
||||
template<typename OtherReal>
|
||||
void DivElements(const VectorBase<OtherReal> &v);
|
||||
|
||||
/// Add a constant to each element of a vector.
|
||||
void Add(Real c);
|
||||
|
||||
/// Add element-by-element product of vectors:
|
||||
// this <-- alpha * v .* r + beta*this .
|
||||
void AddVecVec(Real alpha, const VectorBase<Real> &v,
|
||||
const VectorBase<Real> &r, Real beta);
|
||||
|
||||
/// Add element-by-element quotient of two vectors.
|
||||
/// this <---- alpha*v/r + beta*this
|
||||
void AddVecDivVec(Real alpha, const VectorBase<Real> &v,
|
||||
const VectorBase<Real> &r, Real beta);
|
||||
|
||||
/// Multiplies all elements by this constant.
|
||||
void Scale(Real alpha);
|
||||
|
||||
/// Multiplies this vector by lower-triangular matrix: *this <-- *this *M
|
||||
void MulTp(const TpMatrix<Real> &M, const MatrixTransposeType trans);
|
||||
|
||||
/// If trans == kNoTrans, solves M x = b, where b is the value of *this at input
|
||||
/// and x is the value of *this at output.
|
||||
/// If trans == kTrans, solves M' x = b.
|
||||
/// Does not test for M being singular or near-singular, so test it before
|
||||
/// calling this routine.
|
||||
void Solve(const TpMatrix<Real> &M, const MatrixTransposeType trans);
|
||||
|
||||
/// Performs a row stack of the matrix M
|
||||
void CopyRowsFromMat(const MatrixBase<Real> &M);
|
||||
template<typename OtherReal>
|
||||
void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
|
||||
|
||||
/// The following is implemented in ../cudamatrix/cu-matrix.cc
|
||||
void CopyRowsFromMat(const CuMatrixBase<Real> &M);
|
||||
|
||||
/// Performs a column stack of the matrix M
|
||||
void CopyColsFromMat(const MatrixBase<Real> &M);
|
||||
|
||||
/// Extracts a row of the matrix M. Could also do this with
|
||||
/// this->Copy(M[row]).
|
||||
void CopyRowFromMat(const MatrixBase<Real> &M, MatrixIndexT row);
|
||||
/// Extracts a row of the matrix M with type conversion.
|
||||
template<typename OtherReal>
|
||||
void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);
|
||||
|
||||
/// Extracts a row of the symmetric matrix S.
|
||||
template<typename OtherReal>
|
||||
void CopyRowFromSp(const SpMatrix<OtherReal> &S, MatrixIndexT row);
|
||||
|
||||
/// Extracts a column of the matrix M.
|
||||
template<typename OtherReal>
|
||||
void CopyColFromMat(const MatrixBase<OtherReal> &M , MatrixIndexT col);
|
||||
|
||||
/// Extracts the diagonal of the matrix M.
|
||||
void CopyDiagFromMat(const MatrixBase<Real> &M);
|
||||
|
||||
/// Extracts the diagonal of a packed matrix M; works for Sp or Tp.
|
||||
void CopyDiagFromPacked(const PackedMatrix<Real> &M);
|
||||
|
||||
|
||||
/// Extracts the diagonal of a symmetric matrix.
|
||||
inline void CopyDiagFromSp(const SpMatrix<Real> &M) { CopyDiagFromPacked(M); }
|
||||
|
||||
/// Extracts the diagonal of a triangular matrix.
|
||||
inline void CopyDiagFromTp(const TpMatrix<Real> &M) { CopyDiagFromPacked(M); }
|
||||
|
||||
/// Returns the maximum value of any element, or -infinity for the empty vector.
|
||||
Real Max() const;
|
||||
|
||||
/// Returns the maximum value of any element, and the associated index.
|
||||
/// Error if vector is empty.
|
||||
Real Max(MatrixIndexT *index) const;
|
||||
|
||||
/// Returns the minimum value of any element, or +infinity for the empty vector.
|
||||
Real Min() const;
|
||||
|
||||
/// Returns the minimum value of any element, and the associated index.
|
||||
/// Error if vector is empty.
|
||||
Real Min(MatrixIndexT *index) const;
|
||||
|
||||
/// Returns sum of the elements
|
||||
Real Sum() const;
|
||||
|
||||
/// Returns sum of the logs of the elements. More efficient than
|
||||
/// just taking log of each. Will return NaN if any elements are
|
||||
/// negative.
|
||||
Real SumLog() const;
|
||||
|
||||
/// Does *this = alpha * (sum of rows of M) + beta * *this.
|
||||
void AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);
|
||||
|
||||
/// Does *this = alpha * (sum of columns of M) + beta * *this.
|
||||
void AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);
|
||||
|
||||
/// Add the diagonal of a matrix times itself:
|
||||
/// *this = diag(M M^T) + beta * *this (if trans == kNoTrans), or
|
||||
/// *this = diag(M^T M) + beta * *this (if trans == kTrans).
|
||||
void AddDiagMat2(Real alpha, const MatrixBase<Real> &M,
|
||||
MatrixTransposeType trans = kNoTrans, Real beta = 1.0);
|
||||
|
||||
/// Add the diagonal of a matrix product: *this = diag(M N), assuming the
|
||||
/// "trans" arguments are both kNoTrans; for transpose arguments, it behaves
|
||||
/// as you would expect.
|
||||
void AddDiagMatMat(Real alpha, const MatrixBase<Real> &M, MatrixTransposeType transM,
|
||||
const MatrixBase<Real> &N, MatrixTransposeType transN,
|
||||
Real beta = 1.0);
|
||||
|
||||
/// Returns log(sum(exp())) without exp overflow
|
||||
/// If prune > 0.0, ignores terms less than the max - prune.
|
||||
/// [Note: in future, if prune = 0.0, it will take the max.
|
||||
/// For now, use -1 if you don't want it to prune.]
|
||||
Real LogSumExp(Real prune = -1.0) const;
|
||||
|
||||
/// Reads from C++ stream (option to add to existing contents).
|
||||
/// Throws exception on failure
|
||||
void Read(std::istream &in, bool binary, bool add = false);
|
||||
|
||||
/// Writes to C++ stream (option to write in binary).
|
||||
void Write(std::ostream &Out, bool binary) const;
|
||||
|
||||
friend class VectorBase<double>;
|
||||
friend class VectorBase<float>;
|
||||
friend class CuVectorBase<Real>;
|
||||
friend class CuVector<Real>;
|
||||
protected:
|
||||
/// Destructor; does not deallocate memory, this is handled by child classes.
|
||||
/// This destructor is protected so this object can only be
|
||||
/// deleted via a child.
|
||||
~VectorBase() {}
|
||||
|
||||
/// Empty initializer, corresponds to vector of zero size.
|
||||
explicit VectorBase(): data_(NULL), dim_(0) {
|
||||
KALDI_ASSERT_IS_FLOATING_TYPE(Real);
|
||||
}
|
||||
|
||||
// Took this out since it is not currently used, and it is possible to create
|
||||
// objects where the allocated memory is not the same size as dim_ : Arnab
|
||||
// /// Initializer from a pointer and a size; keeps the pointer internally
|
||||
// /// (ownership or non-ownership depends on the child class).
|
||||
// explicit VectorBase(Real* data, MatrixIndexT dim)
|
||||
// : data_(data), dim_(dim) {}
|
||||
|
||||
// Arnab : made this protected since it is unsafe too.
|
||||
/// Load data into the vector: sz must match own size.
|
||||
void CopyFromPtr(const Real* Data, MatrixIndexT sz);
|
||||
|
||||
/// data memory area
|
||||
Real* data_;
|
||||
/// dimension of vector
|
||||
MatrixIndexT dim_;
|
||||
KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
|
||||
}; // class VectorBase
|
||||
|
||||
/** @brief A class representing a vector.
|
||||
*
|
||||
* This class provides a way to work with vectors in kaldi.
|
||||
* It encapsulates basic operations and memory optimizations. */
|
||||
template<typename Real>
|
||||
class Vector: public VectorBase<Real> {
|
||||
public:
|
||||
/// Constructor that takes no arguments. Initializes to empty.
|
||||
Vector(): VectorBase<Real>() {}
|
||||
|
||||
/// Constructor with specific size. Sets to all-zero by default
|
||||
/// if set_zero == false, memory contents are undefined.
|
||||
explicit Vector(const MatrixIndexT s,
|
||||
MatrixResizeType resize_type = kSetZero)
|
||||
: VectorBase<Real>() { Resize(s, resize_type); }
|
||||
|
||||
/// Copy constructor from CUDA vector
|
||||
/// This is defined in ../cudamatrix/cu-vector.h
|
||||
template<typename OtherReal>
|
||||
explicit Vector(const CuVectorBase<OtherReal> &cu);
|
||||
|
||||
/// Copy constructor. The need for this is controversial.
|
||||
Vector(const Vector<Real> &v) : VectorBase<Real>() { // (cannot be explicit)
|
||||
Resize(v.Dim(), kUndefined);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
/// Copy-constructor from base-class, needed to copy from SubVector.
|
||||
explicit Vector(const VectorBase<Real> &v) : VectorBase<Real>() {
|
||||
Resize(v.Dim(), kUndefined);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
/// Type conversion constructor.
|
||||
template<typename OtherReal>
|
||||
explicit Vector(const VectorBase<OtherReal> &v): VectorBase<Real>() {
|
||||
Resize(v.Dim(), kUndefined);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
// Took this out since it is unsafe : Arnab
|
||||
// /// Constructor from a pointer and a size; copies the data to a location
|
||||
// /// it owns.
|
||||
// Vector(const Real* Data, const MatrixIndexT s): VectorBase<Real>() {
|
||||
// Resize(s);
|
||||
// CopyFromPtr(Data, s);
|
||||
// }
|
||||
|
||||
|
||||
/// Swaps the contents of *this and *other. Shallow swap.
|
||||
void Swap(Vector<Real> *other);
|
||||
|
||||
/// Destructor. Deallocates memory.
|
||||
~Vector() { Destroy(); }
|
||||
|
||||
/// Read function using C++ streams. Can also add to existing contents
|
||||
/// of matrix.
|
||||
void Read(std::istream &in, bool binary, bool add = false);
|
||||
|
||||
/// Set vector to a specified size (can be zero).
|
||||
/// The value of the new data depends on resize_type:
|
||||
/// -if kSetZero, the new data will be zero
|
||||
/// -if kUndefined, the new data will be undefined
|
||||
/// -if kCopyData, the new data will be the same as the old data in any
|
||||
/// shared positions, and zero elsewhere.
|
||||
/// This function takes time proportional to the number of data elements.
|
||||
void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero);
|
||||
|
||||
/// Remove one element and shifts later elements down.
|
||||
void RemoveElement(MatrixIndexT i);
|
||||
|
||||
/// Assignment operator.
|
||||
Vector<Real> &operator = (const Vector<Real> &other) {
|
||||
Resize(other.Dim(), kUndefined);
|
||||
this->CopyFromVec(other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// Assignment operator that takes VectorBase.
|
||||
Vector<Real> &operator = (const VectorBase<Real> &other) {
|
||||
Resize(other.Dim(), kUndefined);
|
||||
this->CopyFromVec(other);
|
||||
return *this;
|
||||
}
|
||||
private:
|
||||
/// Init assumes the current contents of the class are invalid (i.e. junk or
|
||||
/// has already been freed), and it sets the vector to newly allocated memory
|
||||
/// with the specified dimension. dim == 0 is acceptable. The memory contents
|
||||
/// pointed to by data_ will be undefined.
|
||||
void Init(const MatrixIndexT dim);
|
||||
|
||||
/// Destroy function, called internally.
|
||||
void Destroy();
|
||||
|
||||
};
|
||||
|
||||
|
||||
/// Represents a non-allocating general vector which can be defined
|
||||
/// as a sub-vector of higher-level vector [or as the row of a matrix].
|
||||
template<typename Real>
|
||||
class SubVector : public VectorBase<Real> {
|
||||
public:
|
||||
/// Constructor from a Vector or SubVector.
|
||||
/// SubVectors are not const-safe and it's very hard to make them
|
||||
/// so for now we just give up. This function contains const_cast.
|
||||
SubVector(const VectorBase<Real> &t, const MatrixIndexT origin,
|
||||
const MatrixIndexT length) : VectorBase<Real>() {
|
||||
// following assert equiv to origin>=0 && length>=0 &&
|
||||
// origin+length <= rt.dim_
|
||||
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
|
||||
static_cast<UnsignedMatrixIndexT>(length) <=
|
||||
static_cast<UnsignedMatrixIndexT>(t.Dim()));
|
||||
VectorBase<Real>::data_ = const_cast<Real*> (t.Data()+origin);
|
||||
VectorBase<Real>::dim_ = length;
|
||||
}
|
||||
|
||||
/// This constructor initializes the vector to point at the contents
|
||||
/// of this packed matrix (SpMatrix or TpMatrix).
|
||||
SubVector(const PackedMatrix<Real> &M) {
|
||||
VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
|
||||
VectorBase<Real>::dim_ = (M.NumRows()*(M.NumRows()+1))/2;
|
||||
}
|
||||
|
||||
/// Copy constructor
|
||||
SubVector(const SubVector &other) : VectorBase<Real> () {
|
||||
// this copy constructor needed for Range() to work in base class.
|
||||
VectorBase<Real>::data_ = other.data_;
|
||||
VectorBase<Real>::dim_ = other.dim_;
|
||||
}
|
||||
|
||||
/// Constructor from a pointer to memory and a length. Keeps a pointer
|
||||
/// to the data but does not take ownership (will never delete).
|
||||
/// Caution: this constructor enables you to evade const constraints.
|
||||
SubVector(const Real *data, MatrixIndexT length) : VectorBase<Real> () {
|
||||
VectorBase<Real>::data_ = const_cast<Real*>(data);
|
||||
VectorBase<Real>::dim_ = length;
|
||||
}
|
||||
|
||||
/// This operation does not preserve const-ness, so be careful.
|
||||
SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row) {
|
||||
VectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
|
||||
VectorBase<Real>::dim_ = matrix.NumCols();
|
||||
}
|
||||
|
||||
~SubVector() {} ///< Destructor (does nothing; no pointers are owned here).
|
||||
|
||||
private:
|
||||
/// Disallow assignment operator.
|
||||
SubVector & operator = (const SubVector &other) {}
|
||||
};
|
||||
|
||||
/// @} end of "addtogroup matrix_group"
|
||||
/// \addtogroup matrix_funcs_io
|
||||
/// @{
|
||||
/// Output to a C++ stream. Non-binary by default (use Write for
|
||||
/// binary output).
|
||||
template<typename Real>
|
||||
std::ostream & operator << (std::ostream & out, const VectorBase<Real> & v);
|
||||
|
||||
/// Input from a C++ stream. Will automatically read text or
|
||||
/// binary data from the stream.
|
||||
template<typename Real>
|
||||
std::istream & operator >> (std::istream & in, VectorBase<Real> & v);
|
||||
|
||||
/// Input from a C++ stream. Will automatically read text or
|
||||
/// binary data from the stream.
|
||||
template<typename Real>
|
||||
std::istream & operator >> (std::istream & in, Vector<Real> & v);
|
||||
/// @} end of \addtogroup matrix_funcs_io
|
||||
|
||||
/// \addtogroup matrix_funcs_scalar
|
||||
/// @{
|
||||
|
||||
|
||||
template<typename Real>
|
||||
bool ApproxEqual(const VectorBase<Real> &a,
|
||||
const VectorBase<Real> &b, Real tol = 0.01) {
|
||||
return a.ApproxEqual(b, tol);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
|
||||
float tol = 0.01) {
|
||||
KALDI_ASSERT(a.ApproxEqual(b, tol));
|
||||
}
|
||||
|
||||
|
||||
/// Returns dot product between v1 and v2.
|
||||
template<typename Real>
|
||||
Real VecVec(const VectorBase<Real> &v1, const VectorBase<Real> &v2);
|
||||
|
||||
template<typename Real, typename OtherReal>
|
||||
Real VecVec(const VectorBase<Real> &v1, const VectorBase<OtherReal> &v2);
|
||||
|
||||
|
||||
/// Returns \f$ v_1^T M v_2 \f$ .
|
||||
/// Not as efficient as it could be where v1 == v2.
|
||||
template<typename Real>
|
||||
Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
|
||||
const VectorBase<Real> &v2);
|
||||
|
||||
/// @} End of "addtogroup matrix_funcs_scalar"
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
// we need to include the implementation
|
||||
#include "matrix/kaldi-vector-inl.h"
|
||||
|
||||
|
||||
|
||||
#endif // KALDI_MATRIX_KALDI_VECTOR_H_
|
@ -1,56 +0,0 @@
|
||||
// matrix/matrix-functions-inl.h
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation
|
||||
//
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// (*) incorporates, with permission, FFT code from his book
|
||||
// "Signal Processing with Lapped Transforms", Artech, 1992.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
|
||||
#define KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
//! ComplexMul performs, inline, the in-place complex product b *= a, where
//! a = (a_re, a_im) and b = (*b_re, *b_im).
template<typename Real>
inline void ComplexMul(const Real &a_re, const Real &a_im,
                       Real *b_re, Real *b_im) {
  // Snapshot all four operands before writing anything, so the result is
  // right even when a and b refer to the same storage.
  const Real lhs_re = *b_re, lhs_im = *b_im;
  const Real rhs_re = a_re, rhs_im = a_im;
  *b_re = (lhs_re * rhs_re) - (lhs_im * rhs_im);
  *b_im = lhs_re * rhs_im + lhs_im * rhs_re;
}
|
||||
|
||||
//! ComplexAddProduct performs, inline, the complex multiply-accumulate
//! c += a * b, where a = (a_re, a_im), b = (b_re, b_im), c = (*c_re, *c_im).
template<typename Real>
inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
                              const Real &b_re, const Real &b_im,
                              Real *c_re, Real *c_im) {
  // The real part is accumulated before the imaginary part is computed.
  const Real prod_re = b_re*a_re - b_im*a_im;
  *c_re += prod_re;
  const Real prod_im = b_re*a_im + b_im*a_re;
  *c_im += prod_im;
}
|
||||
|
||||
|
||||
//! ComplexImExp writes cos(x) to *a_re and sin(x) to *a_im, i.e. the real
//! and imaginary parts of exp(i x).
template<typename Real>
inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
  const Real cos_x = std::cos(x);
  const Real sin_x = std::sin(x);
  *a_re = cos_x;
  *a_im = sin_x;
}
|
||||
|
||||
|
||||
} // end namespace kaldi
|
||||
|
||||
|
||||
#endif // KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
|
||||
|
@ -1,773 +0,0 @@
|
||||
// matrix/matrix-functions.cc
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.; Jan Silovsky
|
||||
// Yanmin Qian; Saarland University; Johns Hopkins University (Author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// (*) incorporates, with permission, FFT code from his book
|
||||
// "Signal Processing with Lapped Transforms", Artech, 1992.
|
||||
|
||||
#include "matrix/matrix-functions.h"
|
||||
#include "matrix/sp-matrix.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<typename Real> void ComplexFt (const VectorBase<Real> &in,
|
||||
VectorBase<Real> *out, bool forward) {
|
||||
int exp_sign = (forward ? -1 : 1);
|
||||
KALDI_ASSERT(out != NULL);
|
||||
KALDI_ASSERT(in.Dim() == out->Dim());
|
||||
KALDI_ASSERT(in.Dim() % 2 == 0);
|
||||
int twoN = in.Dim(), N = twoN / 2;
|
||||
const Real *data_in = in.Data();
|
||||
Real *data_out = out->Data();
|
||||
|
||||
Real exp1N_re, exp1N_im; // forward -> exp(-2pi / N), backward -> exp(2pi / N).
|
||||
Real fraction = exp_sign * M_2PI / static_cast<Real>(N); // forward -> -2pi/N, backward->-2pi/N
|
||||
ComplexImExp(fraction, &exp1N_re, &exp1N_im);
|
||||
|
||||
Real expm_re = 1.0, expm_im = 0.0; // forward -> exp(-2pi m / N).
|
||||
|
||||
for (int two_m = 0; two_m < twoN; two_m+=2) { // For each output component.
|
||||
Real expmn_re = 1.0, expmn_im = 0.0; // forward -> exp(-2pi m n / N).
|
||||
Real sum_re = 0.0, sum_im = 0.0; // complex output for index m (the sum expression)
|
||||
for (int two_n = 0; two_n < twoN; two_n+=2) {
|
||||
ComplexAddProduct(data_in[two_n], data_in[two_n+1],
|
||||
expmn_re, expmn_im,
|
||||
&sum_re, &sum_im);
|
||||
ComplexMul(expm_re, expm_im, &expmn_re, &expmn_im);
|
||||
}
|
||||
data_out[two_m] = sum_re;
|
||||
data_out[two_m + 1] = sum_im;
|
||||
|
||||
|
||||
if (two_m % 10 == 0) { // occasionally renew "expm" from scratch to avoid
|
||||
// loss of precision.
|
||||
int nextm = 1 + two_m/2;
|
||||
Real fraction_mult = fraction * nextm;
|
||||
ComplexImExp(fraction_mult, &expm_re, &expm_im);
|
||||
} else {
|
||||
ComplexMul(exp1N_re, exp1N_im, &expm_re, &expm_im);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template
|
||||
void ComplexFt (const VectorBase<float> &in,
|
||||
VectorBase<float> *out, bool forward);
|
||||
template
|
||||
void ComplexFt (const VectorBase<double> &in,
|
||||
VectorBase<double> *out, bool forward);
|
||||
|
||||
|
||||
#define KALDI_COMPLEXFFT_BLOCKSIZE 8192
|
||||
// This #define affects how we recurse in ComplexFftRecursive.
|
||||
// We assume that memory-caching happens on a scale at
|
||||
// least as small as this.
|
||||
|
||||
|
||||
//! ComplexFftRecursive is a recursive function that computes the
|
||||
//! complex FFT of size N. The "nffts" arguments specifies how many
|
||||
//! separate FFTs to compute in parallel (we assume the data for
|
||||
//! each one is consecutive in memory). The "forward argument"
|
||||
//! specifies whether to do the FFT (true) or IFFT (false), although
|
||||
//! note that we do not include the factor of 1/N (the user should
|
||||
//! do this if required).  The iterators factor_begin and factor_end
|
||||
//! point to the beginning and end (i.e. one past the last element)
|
||||
//! of an array of small factors of N (typically prime factors).
|
||||
//! See the comments below this code for the detailed equations
|
||||
//! of the recursion.
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void ComplexFftRecursive (Real *data, int nffts, int N,
|
||||
const int *factor_begin,
|
||||
const int *factor_end, bool forward,
|
||||
Vector<Real> *tmp_vec) {
|
||||
if (factor_begin == factor_end) {
|
||||
KALDI_ASSERT(N == 1);
|
||||
return;
|
||||
}
|
||||
|
||||
{ // an optimization: compute in smaller blocks.
|
||||
// this block of code could be removed and it would still work.
|
||||
MatrixIndexT size_perblock = N * 2 * sizeof(Real);
|
||||
if (nffts > 1 && size_perblock*nffts > KALDI_COMPLEXFFT_BLOCKSIZE) { // can break it up...
|
||||
// Break up into multiple blocks. This is an optimization. We make
|
||||
// no progress on the FFT when we do this.
|
||||
int block_skip = KALDI_COMPLEXFFT_BLOCKSIZE / size_perblock; // n blocks per call
|
||||
if (block_skip == 0) block_skip = 1;
|
||||
if (block_skip < nffts) {
|
||||
int blocks_left = nffts;
|
||||
while (blocks_left > 0) {
|
||||
int skip_now = std::min(blocks_left, block_skip);
|
||||
ComplexFftRecursive(data, skip_now, N, factor_begin, factor_end, forward, tmp_vec);
|
||||
blocks_left -= skip_now;
|
||||
data += skip_now * N*2;
|
||||
}
|
||||
return;
|
||||
} // else do the actual algorithm.
|
||||
} // else do the actual algorithm.
|
||||
}
|
||||
|
||||
int P = *factor_begin;
|
||||
KALDI_ASSERT(P > 1);
|
||||
int Q = N / P;
|
||||
|
||||
|
||||
if (P > 1 && Q > 1) { // Do the rearrangement. C.f. eq. (8) below. Transform
|
||||
// (a) to (b).
|
||||
Real *data_thisblock = data;
|
||||
if (tmp_vec->Dim() < (MatrixIndexT)N) tmp_vec->Resize(N);
|
||||
Real *data_tmp = tmp_vec->Data();
|
||||
for (int thisfft = 0; thisfft < nffts; thisfft++, data_thisblock+=N*2) {
|
||||
for (int offset = 0; offset < 2; offset++) { // 0 == real, 1 == im.
|
||||
for (int p = 0; p < P; p++) {
|
||||
for (int q = 0; q < Q; q++) {
|
||||
int aidx = q*P + p, bidx = p*Q + q;
|
||||
data_tmp[bidx] = data_thisblock[2*aidx+offset];
|
||||
}
|
||||
}
|
||||
for (int n = 0;n < P*Q;n++) data_thisblock[2*n+offset] = data_tmp[n];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{ // Recurse.
|
||||
ComplexFftRecursive(data, nffts*P, Q, factor_begin+1, factor_end, forward, tmp_vec);
|
||||
}
|
||||
|
||||
int exp_sign = (forward ? -1 : 1);
|
||||
Real rootN_re, rootN_im; // Nth root of unity.
|
||||
ComplexImExp(static_cast<Real>(exp_sign * M_2PI / N), &rootN_re, &rootN_im);
|
||||
|
||||
Real rootP_re, rootP_im; // Pth root of unity.
|
||||
ComplexImExp(static_cast<Real>(exp_sign * M_2PI / P), &rootP_re, &rootP_im);
|
||||
|
||||
{ // Do the multiplication
|
||||
// could avoid a bunch of complex multiplies by moving the loop over data_thisblock
|
||||
// inside.
|
||||
if (tmp_vec->Dim() < (MatrixIndexT)(P*2)) tmp_vec->Resize(P*2);
|
||||
Real *temp_a = tmp_vec->Data();
|
||||
|
||||
Real *data_thisblock = data, *data_end = data+(N*2*nffts);
|
||||
for (; data_thisblock != data_end; data_thisblock += N*2) { // for each separate fft.
|
||||
Real qd_re = 1.0, qd_im = 0.0; // 1^(q'/N)
|
||||
for (int qd = 0; qd < Q; qd++) {
|
||||
Real pdQ_qd_re = qd_re, pdQ_qd_im = qd_im; // 1^((p'Q+q') / N) == 1^((p'/P) + (q'/N))
|
||||
// Initialize to q'/N, corresponding to p' == 0.
|
||||
for (int pd = 0; pd < P; pd++) { // pd == p'
|
||||
{ // This is the p = 0 case of the loop below [an optimization].
|
||||
temp_a[pd*2] = data_thisblock[qd*2];
|
||||
temp_a[pd*2 + 1] = data_thisblock[qd*2 + 1];
|
||||
}
|
||||
{ // This is the p = 1 case of the loop below [an optimization]
|
||||
// **** MOST OF THE TIME (>60% I think) gets spent here. ***
|
||||
ComplexAddProduct(pdQ_qd_re, pdQ_qd_im,
|
||||
data_thisblock[(qd+Q)*2], data_thisblock[(qd+Q)*2 + 1],
|
||||
&(temp_a[pd*2]), &(temp_a[pd*2 + 1]));
|
||||
}
|
||||
if (P > 2) {
|
||||
Real p_pdQ_qd_re = pdQ_qd_re, p_pdQ_qd_im = pdQ_qd_im; // 1^(p(p'Q+q')/N)
|
||||
for (int p = 2; p < P; p++) {
|
||||
ComplexMul(pdQ_qd_re, pdQ_qd_im, &p_pdQ_qd_re, &p_pdQ_qd_im); // p_pdQ_qd *= pdQ_qd.
|
||||
int data_idx = p*Q + qd;
|
||||
ComplexAddProduct(p_pdQ_qd_re, p_pdQ_qd_im,
|
||||
data_thisblock[data_idx*2], data_thisblock[data_idx*2 + 1],
|
||||
&(temp_a[pd*2]), &(temp_a[pd*2 + 1]));
|
||||
}
|
||||
}
|
||||
if (pd != P-1)
|
||||
ComplexMul(rootP_re, rootP_im, &pdQ_qd_re, &pdQ_qd_im); // pdQ_qd *= (rootP == 1^{1/P})
|
||||
// (using 1/P == Q/N)
|
||||
}
|
||||
for (int pd = 0; pd < P; pd++) {
|
||||
data_thisblock[(pd*Q + qd)*2] = temp_a[pd*2];
|
||||
data_thisblock[(pd*Q + qd)*2 + 1] = temp_a[pd*2 + 1];
|
||||
}
|
||||
ComplexMul(rootN_re, rootN_im, &qd_re, &qd_im); // qd *= rootN.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Equations for ComplexFftRecursive.
|
||||
We consider here one of the "nffts" separate ffts; it's just a question of
|
||||
doing them all in parallel. We also write all equations in terms of
|
||||
complex math (the conversion to real arithmetic is not hard, and anyway
|
||||
takes place inside function calls).
|
||||
|
||||
|
||||
Let the input (i.e. "data" at start) be a_n, n = 0..N-1, and
|
||||
the output (Fourier transform) be d_k, k = 0..N-1. We use these letters because
|
||||
there will be two intermediate variables b and c.
|
||||
We want to compute:
|
||||
|
||||
d_k = \sum_n a_n 1^(kn/N) (1)
|
||||
|
||||
where we use 1^x as shorthand for exp(-2 pi i x) for the forward algorithm
|
||||
and exp(2 pi i x) for the backward one.
|
||||
|
||||
We factorize N = P Q (P small, Q usually large).
|
||||
With p = 0..P-1 and q = 0..Q-1, and also p'=0..P-1 and q'=0..Q-1, we let:
|
||||
|
||||
k == p'Q + q' (2)
|
||||
n == qP + p (3)
|
||||
|
||||
That is, we let p, q, p', q' range over these indices and observe that this way we
|
||||
can cover all n, k. Expanding (1) using (2) and (3), we can write:
|
||||
|
||||
d_k = \sum_{p, q} a_n 1^((p'Q+q')(qP+p)/N)
|
||||
= \sum_{p, q} a_n 1^(p'pQ/N) 1^(q'qP/N) 1^(q'p/N) (4)
|
||||
|
||||
using 1^(PQ/N) = 1 to get rid of the terms with PQ in them. Rearranging (4),
|
||||
|
||||
d_k = \sum_p 1^(p'pQ/N) 1^(q'p/N) \sum_q 1^(q'qP/N) a_n (5)
|
||||
|
||||
The point here is to separate the index q. Now we can expand out the remaining
|
||||
instances of k and n using (2) and (3):
|
||||
|
||||
d_(p'Q+q') = \sum_p 1^(p'pQ/N) 1^(q'p/N) \sum_q 1^(q'qP/N) a_(qP+p) (6)
|
||||
|
||||
The expression \sum_q varies with the indices p and q'. Let us define
|
||||
|
||||
C_{p, q'} = \sum_q 1^(q'qP/N) a_(qP+p) (7)
|
||||
|
||||
Here, C_{p, q'}, viewed as a sequence in q', is just the DFT of the points
|
||||
a_(qP+p) for q = 0..Q-1. These points are not consecutive in memory though,
|
||||
they jump by P each time. Let us define b as a rearranged version of a,
|
||||
so that
|
||||
|
||||
b_(pQ+q) = a_(qP+p) (8)
|
||||
|
||||
How to do this rearrangement? The code does it via a temporary array (see the pseudocode further below).
|
||||
|
||||
We can rearrange (7) to be written in terms of the b's, using (8), so that
|
||||
|
||||
C_{p, q'} = \sum_q 1^(q'q (P/N)) b_(pQ+q) (9)
|
||||
|
||||
Here, the sequence of C_{p, q'} over q'=0..Q-1, is just the DFT of the sequence
|
||||
of b_(pQ) .. b_((p+1)Q-1). Let's arrange the C_{p, q'} in a single array in
|
||||
memory in the same way as the b's, i.e. we define
|
||||
c_(pQ+q') == C_{p, q'}. (10)
|
||||
Note that we could have written (10) with q in place of q', as there is only
|
||||
one index of type q present, but q' is just a more natural variable name to use
|
||||
since we use q' elsewhere to subscript c and C.
|
||||
|
||||
Rewriting (9), we have:
|
||||
c_(pQ+q') = \sum_q 1^(q'q (P/N)) b_(pQ+q) (11)
|
||||
which is the DFT computed by the recursive call to this function [after computing
|
||||
the b's by rearranging the a's]. From the c's we want to compute the d's.
|
||||
Taking (6), substituting in the sum (7), and using (10) to write it as an array,
|
||||
we have:
|
||||
d_(p'Q+q') = \sum_p 1^(p'pQ/N) 1^(q'p/N) c_(pQ+q') (12)
|
||||
This sum is independent for different values of q'. Note that d overwrites c
|
||||
in memory. We compute this in a direct way, using a little array of size P to
|
||||
store the computed d values for one value of q' (we reuse the array for each value
|
||||
of q').
|
||||
|
||||
So the overall picture is this:
|
||||
We get a call to compute DFT on size N.
|
||||
|
||||
- If N == 1 we return (nothing to do).
|
||||
- We factor N = P Q (typically, P is small).
|
||||
- Using (8), we rearrange the data in memory so that we have b not a in memory
|
||||
(this is the block "do the rearrangement").
|
||||
The pseudocode for this is as follows. For simplicity we use a temporary array.
|
||||
|
||||
for p = 0..P-1
|
||||
for q = 0..Q-1
|
||||
bidx = pQ + q
|
||||
aidx = qP + p
|
||||
tmp[bidx] = data[aidx].
|
||||
end
|
||||
end
|
||||
data <-- tmp
|
||||
else
|
||||
|
||||
endif
|
||||
|
||||
|
||||
The reason this accomplishes (8) is that we want pQ+q and qP+p to be swapped
|
||||
over for each p, q, and the "if m > n" is a convenient way of ensuring that
|
||||
this swapping happens only once (otherwise it would happen twice, since pQ+q
|
||||
and qP+p both range over the entire set of numbers 0..N-1).
|
||||
|
||||
- We do the DFT on the smaller block size to compute c from b (this is eq. (11)).
|
||||
Note that this is actually multiple DFTs, one for each value of p, but this
|
||||
goes to the "nffts" argument of the function call, which we have ignored up to now.
|
||||
|
||||
-We compute eq. (12) via a loop, as follows
|
||||
allocate temporary array e of size P.
|
||||
For q' = 0..Q-1:
|
||||
for p' = 0..P-1:
|
||||
set sum to zero [this will go in e[p']]
|
||||
for p = 0..P-1:
|
||||
sum += 1^(p'pQ/N) 1^(q'p/N) c_(pQ+q')
|
||||
end
|
||||
e[p'] = sum
|
||||
end
|
||||
for p' = 0..P-1:
|
||||
d_(p'Q+q') = e[p']
|
||||
end
|
||||
end
|
||||
delete temporary array e
|
||||
|
||||
*/
|
||||
|
||||
// This is the outer-layer calling code for ComplexFftRecursive.
|
||||
// It factorizes the dimension and then calls the FFT routine.
|
||||
template<typename Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
|
||||
KALDI_ASSERT(v != NULL);
|
||||
|
||||
if (v->Dim()<=1) return;
|
||||
KALDI_ASSERT(v->Dim() % 2 == 0); // complex input.
|
||||
int N = v->Dim() / 2;
|
||||
std::vector<int> factors;
|
||||
Factorize(N, &factors);
|
||||
int *factor_beg = NULL;
|
||||
if (factors.size() > 0)
|
||||
factor_beg = &(factors[0]);
|
||||
Vector<Real> tmp; // allocated in ComplexFftRecursive.
|
||||
ComplexFftRecursive(v->Data(), 1, N, factor_beg, factor_beg+factors.size(), forward, (tmp_in?tmp_in:&tmp));
|
||||
}
|
||||
|
||||
//! Inefficient version of Fourier transform, for testing purposes.
|
||||
template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward) {
|
||||
KALDI_ASSERT(v != NULL);
|
||||
MatrixIndexT N = v->Dim();
|
||||
KALDI_ASSERT(N%2 == 0);
|
||||
if (N == 0) return;
|
||||
Vector<Real> vtmp(N*2); // store as complex.
|
||||
if (forward) {
|
||||
for (MatrixIndexT i = 0; i < N; i++) vtmp(i*2) = (*v)(i);
|
||||
ComplexFft(&vtmp, forward); // this is already tested so we can use this.
|
||||
v->CopyFromVec( vtmp.Range(0, N) );
|
||||
(*v)(1) = vtmp(N); // Copy the N/2'th fourier component, which is real,
|
||||
// to the imaginary part of the 1st complex output.
|
||||
} else {
|
||||
// reverse the transformation above to get the complex spectrum.
|
||||
vtmp(0) = (*v)(0); // copy F_0 which is real
|
||||
vtmp(N) = (*v)(1); // copy F_{N/2} which is real
|
||||
for (MatrixIndexT i = 1; i < N/2; i++) {
|
||||
// Copy i'th to i'th fourier component
|
||||
vtmp(2*i) = (*v)(2*i);
|
||||
vtmp(2*i+1) = (*v)(2*i+1);
|
||||
// Copy i'th to N-i'th, conjugated.
|
||||
vtmp(2*(N-i)) = (*v)(2*i);
|
||||
vtmp(2*(N-i)+1) = -(*v)(2*i+1);
|
||||
}
|
||||
ComplexFft(&vtmp, forward); // actually backward since forward == false
|
||||
// Copy back real part. Complex part should be zero.
|
||||
for (MatrixIndexT i = 0; i < N; i++)
|
||||
(*v)(i) = vtmp(i*2);
|
||||
}
|
||||
}
|
||||
|
||||
template void RealFftInefficient (VectorBase<float> *v, bool forward);
|
||||
template void RealFftInefficient (VectorBase<double> *v, bool forward);
|
||||
|
||||
template
|
||||
void ComplexFft(VectorBase<float> *v, bool forward, Vector<float> *tmp_in);
|
||||
template
|
||||
void ComplexFft(VectorBase<double> *v, bool forward, Vector<double> *tmp_in);
|
||||
|
||||
|
||||
// See the long comment below for the math behind this.
|
||||
template<typename Real> void RealFft (VectorBase<Real> *v, bool forward) {
|
||||
KALDI_ASSERT(v != NULL);
|
||||
MatrixIndexT N = v->Dim(), N2 = N/2;
|
||||
KALDI_ASSERT(N%2 == 0);
|
||||
if (N == 0) return;
|
||||
|
||||
if (forward) ComplexFft(v, true);
|
||||
|
||||
Real *data = v->Data();
|
||||
Real rootN_re, rootN_im; // exp(-2pi/N), forward; exp(2pi/N), backward
|
||||
int forward_sign = forward ? -1 : 1;
|
||||
ComplexImExp(static_cast<Real>(M_2PI/N *forward_sign), &rootN_re, &rootN_im);
|
||||
Real kN_re = -forward_sign, kN_im = 0.0; // exp(-2pik/N), forward; exp(-2pik/N), backward
|
||||
// kN starts out as 1.0 for forward algorithm but -1.0 for backward.
|
||||
for (MatrixIndexT k = 1; 2*k <= N2; k++) {
|
||||
ComplexMul(rootN_re, rootN_im, &kN_re, &kN_im);
|
||||
|
||||
Real Ck_re, Ck_im, Dk_re, Dk_im;
|
||||
// C_k = 1/2 (B_k + B_{N/2 - k}^*) :
|
||||
Ck_re = 0.5 * (data[2*k] + data[N - 2*k]);
|
||||
Ck_im = 0.5 * (data[2*k + 1] - data[N - 2*k + 1]);
|
||||
// re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})):
|
||||
Dk_re = 0.5 * (data[2*k + 1] + data[N - 2*k + 1]);
|
||||
// im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k}))
|
||||
Dk_im =-0.5 * (data[2*k] - data[N - 2*k]);
|
||||
// A_k = C_k + 1^(k/N) D_k:
|
||||
data[2*k] = Ck_re; // A_k <-- C_k
|
||||
data[2*k+1] = Ck_im;
|
||||
// now A_k += D_k 1^(k/N)
|
||||
ComplexAddProduct(Dk_re, Dk_im, kN_re, kN_im, &(data[2*k]), &(data[2*k+1]));
|
||||
|
||||
MatrixIndexT kdash = N2 - k;
|
||||
if (kdash != k) {
|
||||
// Next we handle the index k' = N/2 - k. This is necessary
|
||||
// to do now, to avoid invalidating data that we will later need.
|
||||
// The quantities C_{k'} and D_{k'} are just the conjugates of C_k
|
||||
// and D_k, so the equations are simple modifications of the above,
|
||||
// replacing Ck_im and Dk_im with their negatives.
|
||||
data[2*kdash] = Ck_re; // A_k' <-- C_k'
|
||||
data[2*kdash+1] = -Ck_im;
|
||||
// now A_k' += D_k' 1^(k'/N)
|
||||
// We use 1^(k'/N) = 1^((N/2 - k) / N) = 1^(1/2) 1^(-k/N) = -1 * (1^(k/N))^*
|
||||
// so it's the same as 1^(k/N) but with the real part negated.
|
||||
ComplexAddProduct(Dk_re, -Dk_im, -kN_re, kN_im, &(data[2*kdash]), &(data[2*kdash+1]));
|
||||
}
|
||||
}
|
||||
|
||||
{ // Now handle k = 0.
|
||||
// In simple terms: after the complex fft, data[0] becomes the sum of real
|
||||
// parts input[0], input[2]... and data[1] becomes the sum of imaginary
|
||||
// pats input[1], input[3]...
|
||||
// "zeroth" [A_0] is just the sum of input[0]+input[1]+input[2]..
|
||||
// and "n2th" [A_{N/2}] is input[0]-input[1]+input[2]... .
|
||||
Real zeroth = data[0] + data[1],
|
||||
n2th = data[0] - data[1];
|
||||
data[0] = zeroth;
|
||||
data[1] = n2th;
|
||||
if (!forward) {
|
||||
data[0] /= 2;
|
||||
data[1] /= 2;
|
||||
}
|
||||
}
|
||||
|
||||
if (!forward) {
|
||||
ComplexFft(v, false);
|
||||
v->Scale(2.0); // This is so we get a factor of N increase, rather than N/2 which we would
|
||||
// otherwise get from [ComplexFft, forward] + [ComplexFft, backward] in dimension N/2.
|
||||
// It's for consistency with our normal FFT convensions.
|
||||
}
|
||||
}
|
||||
|
||||
template void RealFft (VectorBase<float> *v, bool forward);
|
||||
template void RealFft (VectorBase<double> *v, bool forward);
|
||||
|
||||
/* Notes for real FFTs.
|
||||
We are using the same convention as above, 1^x to mean exp(-2\pi x) for the forward transform.
|
||||
Actually, in a slight abuse of notation, we use this meaning for 1^x in both the forward and
|
||||
backward cases because it's more convenient in this section.
|
||||
|
||||
Suppose we have real data a[0...N-1], with N even, and want to compute its Fourier transform.
|
||||
We can make do with the first N/2 points of the transform, since the remaining ones are complex
|
||||
conjugates of the first. We want to compute:
|
||||
for k = 0...N/2-1,
|
||||
A_k = \sum_{n = 0}^{N-1} a_n 1^(kn/N) (1)
|
||||
|
||||
We treat a[0..N-1] as a complex sequence of length N/2, i.e. a sequence b[0..N/2 - 1].
|
||||
Viewed as sequences of length N/2, we have:
|
||||
b = c + i d,
|
||||
where c = a_0, a_2 ... and d = a_1, a_3 ...
|
||||
|
||||
We can recover the length-N/2 Fourier transforms of c and d by doing FT on b and
|
||||
then doing the equations below. Derivation is marked by (*) in a comment below (search
|
||||
for it). Let B, C, D be the FTs.
|
||||
We have
|
||||
C_k = 1/2 (B_k + B_{N/2 - k}^*) (z0)
|
||||
D_k =-1/2i (B_k - B_{N/2 - k}^*) (z1)
|
||||
so: re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})) (z2)
|
||||
im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k})) (z3)
|
||||
|
||||
To recover the FT A from C and D, we write, rearranging (1):
|
||||
|
||||
A_k = \sum_{n = 0, 2, ..., N-2} a_n 1^(kn/N)
|
||||
+\sum_{n = 1, 3, ..., N-1} a_n 1^(kn/N)
|
||||
= \sum_{n = 0, 1, ..., N/2-1} a_n 1^(2kn/N) + a_{n+1} 1^(2kn/N) 1^(k/N)
|
||||
= \sum_{n = 0, 1, ..., N/2-1} c_n 1^(2kn/N) + d_n 1^(2kn/N) 1^(k/N)
|
||||
A_k = C_k + 1^(k/N) D_k (a0)
|
||||
|
||||
This equation is valid for k = 0...N/2-1, which is the range of the sequences B_k and
|
||||
C_k. We don't use it for k = 0, which is a special case considered below. For
|
||||
0 < k < N/2, it's convenient to consider the pair k, k', where k' = N/2 - k.
|
||||
Remember that C_k' = C_k^* and D_k' = D_k^* [where * is conjugation]. Also,
|
||||
1^(N/2 / N) = -1. So we have:
|
||||
A_k' = C_k^* - 1^(k/N) D_k^* (a0b)
|
||||
We do (a0) and (a0b) together.
|
||||
|
||||
|
||||
|
||||
By symmetry this gives us the Fourier components for N/2+1, ... N, if we want
|
||||
them. However, it doesn't give us the value for exactly k = N/2. For k = 0 and k = N/2, it
|
||||
is easiest to argue directly about the meaning of the A_k, B_k and C_k in terms of
|
||||
sums of points.
|
||||
A_0 and A_{N/2} are both real, with A_0=\sum_n a_n, and A_{N/2} an alternating sum
A_{N/2} = a_0 - a_1 + a_2 ...
|
||||
It's easy to show that
|
||||
A_0 = B_0 + C_0 (a1)
|
||||
A_{N/2} = B_0 - C_0. (a2)
|
||||
Since B_0 and C_0 are both real, B_0 is the real coefficient of D_0 and C_0 is the
|
||||
imaginary coefficient.
|
||||
|
||||
*REVERSING THE PROCESS*
|
||||
|
||||
Next we want to reverse this process. We just need to work out C_k and D_k from the
|
||||
sequence A_k. Then we do the inverse complex fft and we get back where we started.
|
||||
For 0 and N/2, working from (a1) and (a2) above, we can see that:
|
||||
B_0 = 1/2 (A_0 + A_{N/2}) (y0)
|
||||
C_0 = 1/2 (A_0 - A_{N/2}) (y1)
|
||||
and we use
|
||||
D_0 = B_0 + i C_0
|
||||
to get the 1st complex coefficient of D. This is exactly the same as the forward process
|
||||
except with an extra factor of 1/2.
|
||||
|
||||
Consider equations (a0) and (a0b). We want to work out C_k and D_k from A_k and A_k'. Remember
|
||||
k' = N/2 - k.
|
||||
|
||||
Write down
|
||||
A_k = C_k + 1^(k/N) D_k (copying a0)
|
||||
A_k'^* = C_k - 1^(k/N) D_k (conjugate of a0b)
|
||||
So
|
||||
C_k = 0.5 (A_k + A_k'^*) (p0)
|
||||
D_k = 1^(-k/N) . 0.5 (A_k - A_k'^*) (p1)
|
||||
Next, we want to compute B_k and B_k' from C_k and D_k. C.f. (z0)..(z3), and remember
|
||||
that k' = N/2-k. We can see
|
||||
that
|
||||
B_k = C_k + i D_k (p2)
|
||||
B_k' = C_k - i D_k (p3)
|
||||
|
||||
We would like to make the equations (p0) ... (p3) look like the forward equations (z0), (z1),
|
||||
(a0) and (a0b) so we can reuse the code. Define E_k = -i 1^(k/N) D_k. Then write down (p0)..(p3).
|
||||
We have
|
||||
C_k = 0.5 (A_k + A_k'^*) (p0')
|
||||
E_k = -0.5 i (A_k - A_k'^*) (p1')
|
||||
B_k = C_k - 1^(-k/N) E_k (p2')
|
||||
B_k' = C_k + 1^(-k/N) E_k (p3')
|
||||
So these are exactly the same as (z0), (z1), (a0), (a0b) except replacing 1^(k/N) with
|
||||
-1^(-k/N) . Remember that we defined 1^x above to be exp(-2pi x/N), so the signs here
|
||||
might be opposite to what you see in the code.
|
||||
|
||||
MODIFICATION: we need to take care of a factor of two. The complex FFT we implemented
|
||||
does not divide by N in the reverse case. So upon inversion we get larger by N/2.
|
||||
However, this is not consistent with normal FFT conventions where you get a factor of N.
|
||||
For this reason we multiply by two after the process described above.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
(*) [this token is referred to in a comment above].
|
||||
|
||||
Notes for separating 2 real transforms from one complex one. Note that the
|
||||
letters here (A, B, C and N) are all distinct from the same letters used in the
|
||||
place where this comment is used.
|
||||
Suppose we
|
||||
have two sequences a_n and b_n, n = 0..N-1. We combine them into a complex
|
||||
number,
|
||||
c_n = a_n + i b_n.
|
||||
Then we take the fourier transform to get
|
||||
C_k = \sum_{n = 0}^{N-1} c_n 1^(kn/N) .
|
||||
Then we use symmetry. Define A_k and B_k as the DFTs of a and b.
|
||||
We use A_k = A_{N-k}^*, and B_k = B_{N-k}^*, since a and b are real. Using
|
||||
C_k = A_k + i B_k,
|
||||
C_{N-k} = A_k^* + i B_k^*
|
||||
= A_k^* - (i B_k)^*
|
||||
So:
|
||||
A_k = 1/2 (C_k + C_{N-k}^*)
|
||||
i B_k = 1/2 (C_k - C_{N-k}^*)
|
||||
-> B_k =-1/2i (C_k - C_{N-k}^*)
|
||||
-> re(B_k) = 1/2 (im(C_k) + im(C_{N-k}))
|
||||
im(B_k) =-1/2 (re(C_k) - re(C_{N-k}))
|
||||
|
||||
*/
|
||||
|
||||
// Fills *M (K rows by N columns, both of which must be positive) with DCT
// coefficients:
//   row 0:      sqrt(1/N) in every column;
//   row k >= 1: sqrt(2/N) * cos(pi/N * (n + 0.5) * k) in column n.
// M is not required to be square; the original square-shape assertion is
// kept below, commented out.
template<typename Real> void ComputeDctMatrix(Matrix<Real> *M) {
  //KALDI_ASSERT(M->NumRows() == M->NumCols());
  const MatrixIndexT num_rows = M->NumRows();
  const MatrixIndexT num_cols = M->NumCols();

  KALDI_ASSERT(num_rows > 0);
  KALDI_ASSERT(num_cols > 0);

  // Row 0 (the X_0 coefficients) uses its own normalizer.
  Real normalizer = std::sqrt(1.0 / static_cast<Real>(num_cols));
  for (MatrixIndexT col = 0; col < num_cols; ++col) {
    (*M)(0, col) = normalizer;
  }

  // All remaining rows share this normalizer.
  normalizer = std::sqrt(2.0 / static_cast<Real>(num_cols));
  for (MatrixIndexT row = 1; row < num_rows; ++row) {
    for (MatrixIndexT col = 0; col < num_cols; ++col) {
      (*M)(row, col) = normalizer
          * std::cos( static_cast<double>(M_PI)/num_cols * (col + 0.5) * row );
    }
  }
}
|
||||
|
||||
|
||||
template void ComputeDctMatrix(Matrix<float> *M);
|
||||
template void ComputeDctMatrix(Matrix<double> *M);
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void ComputePca(const MatrixBase<Real> &X,
|
||||
MatrixBase<Real> *U,
|
||||
MatrixBase<Real> *A,
|
||||
bool print_eigs,
|
||||
bool exact) {
|
||||
// Note that some of these matrices may be transposed w.r.t. the
|
||||
// way it's most natural to describe them in math... it's the rows
|
||||
// of X and U that correspond to the (data-points, basis elements).
|
||||
MatrixIndexT N = X.NumRows(), D = X.NumCols();
|
||||
// N = #points, D = feature dim.
|
||||
KALDI_ASSERT(U != NULL && U->NumCols() == D);
|
||||
MatrixIndexT G = U->NumRows(); // # of retained basis elements.
|
||||
KALDI_ASSERT(A == NULL || (A->NumRows() == N && A->NumCols() == G));
|
||||
KALDI_ASSERT(G <= N && G <= D);
|
||||
if (D < N) { // Do conventional PCA.
|
||||
SpMatrix<Real> Msp(D); // Matrix of outer products.
|
||||
Msp.AddMat2(1.0, X, kTrans, 0.0); // M <-- X^T X
|
||||
Matrix<Real> Utmp;
|
||||
Vector<Real> l;
|
||||
if (exact) {
|
||||
Utmp.Resize(D, D);
|
||||
l.Resize(D);
|
||||
//Matrix<Real> M(Msp);
|
||||
//M.DestructiveSvd(&l, &Utmp, NULL);
|
||||
Msp.Eig(&l, &Utmp);
|
||||
} else {
|
||||
Utmp.Resize(D, G);
|
||||
l.Resize(G);
|
||||
Msp.TopEigs(&l, &Utmp);
|
||||
}
|
||||
SortSvd(&l, &Utmp);
|
||||
|
||||
for (MatrixIndexT g = 0; g < G; g++)
|
||||
U->Row(g).CopyColFromMat(Utmp, g);
|
||||
if (print_eigs)
|
||||
KALDI_LOG << (exact ? "" : "Retained ")
|
||||
<< "PCA eigenvalues are " << l;
|
||||
if (A != NULL)
|
||||
A->AddMatMat(1.0, X, kNoTrans, *U, kTrans, 0.0);
|
||||
} else { // Do inner-product PCA.
|
||||
SpMatrix<Real> Nsp(N); // Matrix of inner products.
|
||||
Nsp.AddMat2(1.0, X, kNoTrans, 0.0); // M <-- X X^T
|
||||
|
||||
Matrix<Real> Vtmp;
|
||||
Vector<Real> l;
|
||||
if (exact) {
|
||||
Vtmp.Resize(N, N);
|
||||
l.Resize(N);
|
||||
Matrix<Real> Nmat(Nsp);
|
||||
Nmat.DestructiveSvd(&l, &Vtmp, NULL);
|
||||
} else {
|
||||
Vtmp.Resize(N, G);
|
||||
l.Resize(G);
|
||||
Nsp.TopEigs(&l, &Vtmp);
|
||||
}
|
||||
|
||||
MatrixIndexT num_zeroed = 0;
|
||||
for (MatrixIndexT g = 0; g < G; g++) {
|
||||
if (l(g) < 0.0) {
|
||||
KALDI_WARN << "In PCA, setting element " << l(g) << " to zero.";
|
||||
l(g) = 0.0;
|
||||
num_zeroed++;
|
||||
}
|
||||
}
|
||||
SortSvd(&l, &Vtmp); // Make sure zero elements are last, this
|
||||
// is necessary for Orthogonalize() to work properly later.
|
||||
|
||||
Vtmp.Transpose(); // So eigenvalues are the rows.
|
||||
|
||||
for (MatrixIndexT g = 0; g < G; g++) {
|
||||
Real sqrtlg = sqrt(l(g));
|
||||
if (l(g) != 0.0) {
|
||||
U->Row(g).AddMatVec(1.0 / sqrtlg, X, kTrans, Vtmp.Row(g), 0.0);
|
||||
} else {
|
||||
U->Row(g).SetZero();
|
||||
(*U)(g, g) = 1.0; // arbitrary direction. Will later orthogonalize.
|
||||
}
|
||||
if (A != NULL)
|
||||
for (MatrixIndexT n = 0; n < N; n++)
|
||||
(*A)(n, g) = sqrtlg * Vtmp(g, n);
|
||||
}
|
||||
// Now orthogonalize. This is mainly useful in
|
||||
// case there were zero eigenvalues, but we do it
|
||||
// for all of them.
|
||||
U->OrthogonalizeRows();
|
||||
if (print_eigs)
|
||||
KALDI_LOG << "(inner-product) PCA eigenvalues are " << l;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template
|
||||
void ComputePca(const MatrixBase<float> &X,
|
||||
MatrixBase<float> *U,
|
||||
MatrixBase<float> *A,
|
||||
bool print_eigs,
|
||||
bool exact);
|
||||
|
||||
template
|
||||
void ComputePca(const MatrixBase<double> &X,
|
||||
MatrixBase<double> *U,
|
||||
MatrixBase<double> *A,
|
||||
bool print_eigs,
|
||||
bool exact);
|
||||
|
||||
|
||||
// Added by Dan, Feb. 13 2012.
|
||||
// This function does: *plus += max(0, a b^T),
|
||||
// *minus += max(0, -(a b^T)).
|
||||
template<typename Real>
|
||||
void AddOuterProductPlusMinus(Real alpha,
|
||||
const VectorBase<Real> &a,
|
||||
const VectorBase<Real> &b,
|
||||
MatrixBase<Real> *plus,
|
||||
MatrixBase<Real> *minus) {
|
||||
KALDI_ASSERT(a.Dim() == plus->NumRows() && b.Dim() == plus->NumCols()
|
||||
&& a.Dim() == minus->NumRows() && b.Dim() == minus->NumCols());
|
||||
int32 nrows = a.Dim(), ncols = b.Dim(), pskip = plus->Stride() - ncols,
|
||||
mskip = minus->Stride() - ncols;
|
||||
const Real *adata = a.Data(), *bdata = b.Data();
|
||||
Real *plusdata = plus->Data(), *minusdata = minus->Data();
|
||||
|
||||
for (int32 i = 0; i < nrows; i++) {
|
||||
const Real *btmp = bdata;
|
||||
Real multiple = alpha * *adata;
|
||||
if (multiple > 0.0) {
|
||||
for (int32 j = 0; j < ncols; j++, plusdata++, minusdata++, btmp++) {
|
||||
if (*btmp > 0.0) *plusdata += multiple * *btmp;
|
||||
else *minusdata -= multiple * *btmp;
|
||||
}
|
||||
} else {
|
||||
for (int32 j = 0; j < ncols; j++, plusdata++, minusdata++, btmp++) {
|
||||
if (*btmp < 0.0) *plusdata += multiple * *btmp;
|
||||
else *minusdata -= multiple * *btmp;
|
||||
}
|
||||
}
|
||||
plusdata += pskip;
|
||||
minusdata += mskip;
|
||||
adata++;
|
||||
}
|
||||
}
|
||||
|
||||
// Instantiate template
|
||||
template
|
||||
void AddOuterProductPlusMinus<float>(float alpha,
|
||||
const VectorBase<float> &a,
|
||||
const VectorBase<float> &b,
|
||||
MatrixBase<float> *plus,
|
||||
MatrixBase<float> *minus);
|
||||
template
|
||||
void AddOuterProductPlusMinus<double>(double alpha,
|
||||
const VectorBase<double> &a,
|
||||
const VectorBase<double> &b,
|
||||
MatrixBase<double> *plus,
|
||||
MatrixBase<double> *minus);
|
||||
|
||||
|
||||
} // end namespace kaldi
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue