From 29af6bde4108e7357607140aa3a0f39aeace1ad1 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 28 Feb 2023 02:30:52 +0000
Subject: [PATCH] opt to compile asr,cls,vad; add vad; format code

---
 runtime/.gitignore                            |    3 +
 runtime/CMakeLists.txt                        |  177 +-
 runtime/build.sh                              |    2 +-
 runtime/cmake/fastdeploy.cmake                |   15 +-
 runtime/engine/CMakeLists.txt                 |   19 +-
 .../engine/asr/recognizer/u2_recognizer.cc    |    3 +-
 runtime/engine/common/CMakeLists.txt          |    2 +-
 runtime/engine/common/base/CMakeLists.txt     |   20 +
 .../common/base/{flags.h => flags.h.in}       |    2 +-
 .../engine/common/base/{log.h => log.h.in}    |    2 +-
 runtime/engine/common/frontend/cmvn.cc        |    2 +-
 .../engine/common/frontend/feature-fbank.h    |    1 +
 .../engine/common/frontend/feature-window.cc  |    1 +
 runtime/engine/common/frontend/rfft.cc        |    4 +-
 .../engine/common/matrix/kaldi-matrix-inl.h   |   37 +-
 runtime/engine/common/matrix/kaldi-matrix.cc  | 1396 ++++++++-------
 runtime/engine/common/matrix/kaldi-matrix.h   | 1443 ++++++++-------
 .../engine/common/matrix/kaldi-vector-inl.h   |   39 +-
 runtime/engine/common/matrix/kaldi-vector.cc  | 1592 +++++++++--------
 runtime/engine/common/matrix/kaldi-vector.h   |  533 +++---
 runtime/engine/common/matrix/matrix-common.h  |   72 +-
 runtime/engine/kaldi/CMakeLists.txt           |   15 +-
 runtime/engine/kaldi/base/kaldi-types.h       |   12 +
 runtime/engine/vad/CMakeLists.txt             |   18 +
 runtime/engine/vad/README.md                  |  121 ++
 runtime/engine/vad/README_CN.md               |  119 ++
 runtime/engine/vad/infer_onnx_silero_vad.cc   |   65 +
 runtime/engine/vad/vad.cc                     |  306 ++++
 runtime/engine/vad/vad.h                      |  124 ++
 runtime/engine/vad/wav.h                      |  197 ++
 30 files changed, 3811 insertions(+), 2531 deletions(-)
 create mode 100644 runtime/engine/common/base/CMakeLists.txt
 rename runtime/engine/common/base/{flags.h => flags.h.in} (96%)
 rename runtime/engine/common/base/{log.h => log.h.in} (96%)
 create mode 100644 runtime/engine/vad/CMakeLists.txt
 create mode 100644 runtime/engine/vad/README.md
 create mode 100644 runtime/engine/vad/README_CN.md
 create mode 100644 runtime/engine/vad/infer_onnx_silero_vad.cc
 create mode 100644 runtime/engine/vad/vad.cc
 create mode 100644 runtime/engine/vad/vad.h
 create mode 100644 runtime/engine/vad/wav.h

diff --git a/runtime/.gitignore b/runtime/.gitignore
index 0783b138c..9aa98ef7d 100644
--- a/runtime/.gitignore
+++ b/runtime/.gitignore
@@ -1,3 +1,6 @@
+engine/common/base/flags.h
+engine/common/base/log.h
+
 tools/valgrind*
 *log
 fc_patch/*
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 44ee3a589..015a1088d 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -20,8 +20,7 @@ project(paddlespeech VERSION 0.1)
 
 set(CMAKE_VERBOSE_MAKEFILE on)
 
-# set std-14
-set(CMAKE_CXX_STANDARD 14)
+
 
 include(FetchContent)
 include(ExternalProject)
@@ -31,15 +30,28 @@ set(FETCHCONTENT_QUIET off)
 get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
 set(FETCHCONTENT_BASE_DIR ${fc_patch})
 
+set(CMAKE_CXX_FLAGS)
+set(CMAKE_CXX_FLAGS_DEBUG)
+set(CMAKE_CXX_FLAGS_RELEASE)
+
+# set std-14
+set(CMAKE_CXX_STANDARD 14)
+
 # compiler option
 # Keep the same with openfst, -fPIC or -fpic
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14 -pthread -fPIC -O0 -Wall -g -ldl")
 SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O0 -Wall -g -ggdb")
 SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall")
 
+
+add_compile_options(-fPIC)
 ###############################################################################
 # Option Configurations
 ###############################################################################
+option(WITH_ASR "build asr" ON)
+option(WITH_CLS "build cls" ON)
+option(WITH_VAD "build vad" ON)
+
 option(TEST_DEBUG "option for debug" OFF)
 option(USE_PROFILING "enable c++ profling" OFF)
 option(WITH_TESTING "unit test" ON)
@@ -47,102 +59,117 @@ option(WITH_TESTING "unit test" ON)
 option(USING_GPU "u2 compute on GPU." OFF)
 
 ###############################################################################
-# Include third party
+# Include Third Party
 ###############################################################################
 include(gflags)
 
 include(glog)
 
-# openfst
-include(openfst)
-add_dependencies(openfst gflags glog)
-
-# paddle lib
-include(paddleinference)
-
 # gtest
 if(WITH_TESTING)
     include(gtest) # download, build, install gtest
 endif()
 
+
+# fastdeploy
+include(fastdeploy)
+
+if(WITH_ASR)
+    # openfst
+    include(openfst)
+    add_dependencies(openfst gflags glog)
+endif()
+
+###############################################################################
+# Find Package
+###############################################################################
+
 # python/pybind11/threads
 find_package(Threads REQUIRED)
 # https://cmake.org/cmake/help/latest/module/FindPython3.html#module:FindPython3
 find_package(Python3 COMPONENTS Interpreter Development)
 find_package(pybind11 CONFIG)
 
-if(Python3_FOUND)
-  message(STATUS "Python3_FOUND = ${Python3_FOUND}")
-  message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}")
-  message(STATUS "Python3_LIBRARIES = ${Python3_LIBRARIES}")
-  message(STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}")
-  message(STATUS "Python3_LINK_OPTIONS = ${Python3_LINK_OPTIONS}")
-  set(PYTHON_LIBRARIES ${Python3_LIBRARIES} CACHE STRING "python lib" FORCE)
-  set(PYTHON_INCLUDE_DIR ${Python3_INCLUDE_DIRS} CACHE STRING "python inc" FORCE)
-endif()
-
-message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}")
-message(STATUS "PYTHON_INCLUDE_DIR = ${PYTHON_INCLUDE_DIR}")
 
-if(pybind11_FOUND)
-  message(STATUS "pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}")
-  message(STATUS "pybind11_LIBRARIES=${pybind11_LIBRARIES}")
-  message(STATUS "pybind11_DEFINITIONS=${pybind11_DEFINITIONS}")
+if(WITH_ASR)
+    if(Python3_FOUND)
+    message(STATUS "Python3_FOUND = ${Python3_FOUND}")
+    message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}")
+    message(STATUS "Python3_LIBRARIES = ${Python3_LIBRARIES}")
+    message(STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}")
+    message(STATUS "Python3_LINK_OPTIONS = ${Python3_LINK_OPTIONS}")
+    set(PYTHON_LIBRARIES ${Python3_LIBRARIES} CACHE STRING "python lib" FORCE)
+    set(PYTHON_INCLUDE_DIR ${Python3_INCLUDE_DIRS} CACHE STRING "python inc" FORCE)
+    endif()
+
+    message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}")
+    message(STATUS "PYTHON_INCLUDE_DIR = ${PYTHON_INCLUDE_DIR}")
+
+    if(pybind11_FOUND)
+    message(STATUS "pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}")
+    message(STATUS "pybind11_LIBRARIES=${pybind11_LIBRARIES}")
+    message(STATUS "pybind11_DEFINITIONS=${pybind11_DEFINITIONS}")
+    endif()
+
+
+    # paddle libpaddle.so
+    # paddle include and link option
+    # -L/workspace/DeepSpeech-2.x/engine/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so
+    execute_process(
+        COMMAND python -c "\
+    import os;\
+    import paddle;\
+    include_dir=paddle.sysconfig.get_include();\
+    paddle_dir=os.path.split(include_dir)[0];\
+    libs_dir=os.path.join(paddle_dir, 'libs');\
+    fluid_dir=os.path.join(paddle_dir, 'fluid');\
+    out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]);\
+    out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);\
+        "
+        OUTPUT_VARIABLE PADDLE_LINK_FLAGS
+        RESULT_VARIABLE SUCESS)
+
+    message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS})
+    string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS)
+
+    # paddle compile option
+    # -I/workspace/DeepSpeech-2.x/engine/venv/lib/python3.7/site-packages/paddle/include
+    execute_process(
+        COMMAND python -c "\
+    import paddle; \
+    include_dir = paddle.sysconfig.get_include(); \
+    print(f\"-I{include_dir}\"); \
+        "
+        OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS)
+    message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS})
+    string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS)
+
+
+    # for LD_LIBRARY_PATH
+    # set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/)
+    execute_process(
+        COMMAND python -c "\
+    import os; \
+    import paddle; \
+    include_dir=paddle.sysconfig.get_include(); \
+    paddle_dir=os.path.split(include_dir)[0]; \
+    libs_dir=os.path.join(paddle_dir, 'libs'); \
+    fluid_dir=os.path.join(paddle_dir, 'fluid'); \
+    out=':'.join([libs_dir, fluid_dir]); print(out); \
+        "
+        OUTPUT_VARIABLE PADDLE_LIB_DIRS)
+    message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS})
 endif()
 
 
-# paddle libpaddle.so
-# paddle include and link option
-# -L/workspace/DeepSpeech-2.x/engine/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so
-execute_process(
-    COMMAND python -c "\
-import os;\
-import paddle;\
-include_dir=paddle.sysconfig.get_include();\
-paddle_dir=os.path.split(include_dir)[0];\
-libs_dir=os.path.join(paddle_dir, 'libs');\
-fluid_dir=os.path.join(paddle_dir, 'fluid');\
-out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]);\
-out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);\
-    "
-    OUTPUT_VARIABLE PADDLE_LINK_FLAGS
-    RESULT_VARIABLE SUCESS)
-
-message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS})
-string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS)
-
-# paddle compile option
-# -I/workspace/DeepSpeech-2.x/engine/venv/lib/python3.7/site-packages/paddle/include
-execute_process(
-    COMMAND python -c "\
-import paddle; \
-include_dir = paddle.sysconfig.get_include(); \
-print(f\"-I{include_dir}\"); \
-    "
-    OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS)
-message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS})
-string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS)
-
-
-# for LD_LIBRARY_PATH
-# set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/)
-execute_process(
-    COMMAND python -c "\
-import os; \
-import paddle; \
-include_dir=paddle.sysconfig.get_include(); \
-paddle_dir=os.path.split(include_dir)[0]; \
-libs_dir=os.path.join(paddle_dir, 'libs'); \
-fluid_dir=os.path.join(paddle_dir, 'fluid'); \
-out=':'.join([libs_dir, fluid_dir]); print(out); \
-    "
-    OUTPUT_VARIABLE PADDLE_LIB_DIRS)
-message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS})
-
-add_compile_options(-fPIC)
 ###############################################################################
 # Add local library
 ###############################################################################
 set(ENGINE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/engine)
 
+message(STATUS "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
+message(STATUS "CMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}")
+message(STATUS "CMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}")
+
+
 add_subdirectory(engine)
diff --git a/runtime/build.sh b/runtime/build.sh
index 94d250f5a..131fb7f1a 100755
--- a/runtime/build.sh
+++ b/runtime/build.sh
@@ -4,5 +4,5 @@ set -xe
 # the build script had verified in the paddlepaddle docker image.
 # please follow the instruction below to install PaddlePaddle image.
 # https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html 
-cmake -B build
+cmake -B build -DWITH_ASR=OFF -DWITH_CLS=OFF
 cmake --build build -j
diff --git a/runtime/cmake/fastdeploy.cmake b/runtime/cmake/fastdeploy.cmake
index 773414c19..cb9ceacda 100644
--- a/runtime/cmake/fastdeploy.cmake
+++ b/runtime/cmake/fastdeploy.cmake
@@ -8,11 +8,11 @@ windows_x86")
 set(CMAKE_VERBOSE_MAKEFILE ON)
 
 set(FASTDEPLOY_DIR ${CMAKE_SOURCE_DIR}/fc_patch/fastdeploy)
-if(NOT EXISTS ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.2.tgz)
+if(NOT EXISTS ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.4.tgz)
     exec_program("mkdir -p ${FASTDEPLOY_DIR} &&
-    wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.2.tgz -P ${FASTDEPLOY_DIR} &&
-    tar xzvf ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.2.tgz -C ${FASTDEPLOY_DIR} &&
-    mv ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.2 ${FASTDEPLOY_DIR}/linux-x64")
+    wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.4.tgz -P ${FASTDEPLOY_DIR} &&
+    tar xzvf ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.4.tgz -C ${FASTDEPLOY_DIR} &&
+    mv ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.4 ${FASTDEPLOY_DIR}/linux-x64")
 endif()
 
 if(NOT EXISTS ${FASTDEPLOY_DIR}/fastdeploy-android-1.0.0-shared.tgz)
@@ -36,4 +36,9 @@ elseif (ARCH STREQUAL "android_armv7")
 endif()
 
 include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
-include_directories(${FASTDEPLOY_INCS})
\ No newline at end of file
+
+# fix compiler flags conflict, since fastdeploy using c++11 for project
+set(CMAKE_CXX_STANDARD 14)
+
+include_directories(${FASTDEPLOY_INCS})
+message(STATUS "FASTDEPLOY_INCS=${FASTDEPLOY_INCS}")
\ No newline at end of file
diff --git a/runtime/engine/CMakeLists.txt b/runtime/engine/CMakeLists.txt
index 42399fe92..242a579b5 100644
--- a/runtime/engine/CMakeLists.txt
+++ b/runtime/engine/CMakeLists.txt
@@ -6,8 +6,19 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/kaldi)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/common)
 
-add_subdirectory(asr)
-add_subdirectory(common)
 add_subdirectory(kaldi)
-add_subdirectory(codelab)
-add_subdirectory(cls)
\ No newline at end of file
+add_subdirectory(common)
+
+if(WITH_ASR)
+    add_subdirectory(asr)
+endif()
+
+if(WITH_CLS)
+    add_subdirectory(cls)
+endif()
+
+if(WITH_VAD)
+    add_subdirectory(vad)
+endif()
+
+add_subdirectory(codelab)
\ No newline at end of file
diff --git a/runtime/engine/asr/recognizer/u2_recognizer.cc b/runtime/engine/asr/recognizer/u2_recognizer.cc
index da1348f55..36fecb0a2 100644
--- a/runtime/engine/asr/recognizer/u2_recognizer.cc
+++ b/runtime/engine/asr/recognizer/u2_recognizer.cc
@@ -38,7 +38,8 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource)
         decoder_ = std::make_unique<CTCPrefixBeamSearch>(
             resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts);
     } else {
-        decoder_ = std::make_unique<TLGDecoder>(resource.decoder_opts.tlg_decoder_opts);
+        decoder_ = std::make_unique<TLGDecoder>(
+            resource.decoder_opts.tlg_decoder_opts);
     }
 
     symbol_table_ = decoder_->WordSymbolTable();
diff --git a/runtime/engine/common/CMakeLists.txt b/runtime/engine/common/CMakeLists.txt
index 5e0a7d573..4f399eea8 100644
--- a/runtime/engine/common/CMakeLists.txt
+++ b/runtime/engine/common/CMakeLists.txt
@@ -3,7 +3,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}
 ${CMAKE_CURRENT_SOURCE_DIR}/../
 )
 add_subdirectory(utils)
-
+add_subdirectory(base)
 add_subdirectory(matrix)
 
 include_directories(
diff --git a/runtime/engine/common/base/CMakeLists.txt b/runtime/engine/common/base/CMakeLists.txt
new file mode 100644
index 000000000..ab7108748
--- /dev/null
+++ b/runtime/engine/common/base/CMakeLists.txt
@@ -0,0 +1,20 @@
+if(WITH_ASR)
+  add_compile_options(-DWITH_ASR)
+  set(PPS_FLAGS_LIB "fst/flags.h")
+  set(PPS_GLOB_LIB "fst/log.h")
+else()
+  set(PPS_FLAGS_LIB "gflags/gflags.h")
+  set(PPS_GLOB_LIB "glog/logging.h")
+endif()
+
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/flags.h.in
+    ${CMAKE_CURRENT_SOURCE_DIR}/flags.h @ONLY
+  )
+message(STATUS "Generated ${CMAKE_CURRENT_SOURCE_DIR}/flags.h")
+
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/log.h.in
+    ${CMAKE_CURRENT_SOURCE_DIR}/log.h @ONLY
+  )
+message(STATUS "Generated ${CMAKE_CURRENT_SOURCE_DIR}/log.h")
\ No newline at end of file
diff --git a/runtime/engine/common/base/flags.h b/runtime/engine/common/base/flags.h.in
similarity index 96%
rename from runtime/engine/common/base/flags.h
rename to runtime/engine/common/base/flags.h.in
index 41df0d452..161366e8a 100644
--- a/runtime/engine/common/base/flags.h
+++ b/runtime/engine/common/base/flags.h.in
@@ -14,4 +14,4 @@
 
 #pragma once
 
-#include "fst/flags.h"
+#include "@PPS_FLAGS_LIB@"
\ No newline at end of file
diff --git a/runtime/engine/common/base/log.h b/runtime/engine/common/base/log.h.in
similarity index 96%
rename from runtime/engine/common/base/log.h
rename to runtime/engine/common/base/log.h.in
index c613b98c3..0dd588bc4 100644
--- a/runtime/engine/common/base/log.h
+++ b/runtime/engine/common/base/log.h.in
@@ -14,4 +14,4 @@
 
 #pragma once
 
-#include "fst/log.h"
+#include "@PPS_GLOB_LIB@"
diff --git a/runtime/engine/common/frontend/cmvn.cc b/runtime/engine/common/frontend/cmvn.cc
index 8375d3d16..0f1108208 100644
--- a/runtime/engine/common/frontend/cmvn.cc
+++ b/runtime/engine/common/frontend/cmvn.cc
@@ -33,7 +33,7 @@ CMVN::CMVN(std::string cmvn_file, unique_ptr<FrontendInterface> base_extractor)
     dim_ = mean_stats_.size() - 1;
 }
 
-void CMVN::ReadCMVNFromJson(string cmvn_file) {
+void CMVN::ReadCMVNFromJson(std::string cmvn_file) {
     std::string json_str = ppspeech::ReadFile2String(cmvn_file);
     picojson::value value;
     std::string err;
diff --git a/runtime/engine/common/frontend/feature-fbank.h b/runtime/engine/common/frontend/feature-fbank.h
index 30085245d..3dab793f3 100644
--- a/runtime/engine/common/frontend/feature-fbank.h
+++ b/runtime/engine/common/frontend/feature-fbank.h
@@ -21,6 +21,7 @@
 #ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
 #define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
 
+#include <limits>
 #include <map>
 
 #include "frontend/feature-window.h"
diff --git a/runtime/engine/common/frontend/feature-window.cc b/runtime/engine/common/frontend/feature-window.cc
index 1c474ccb5..43c736e01 100644
--- a/runtime/engine/common/frontend/feature-window.cc
+++ b/runtime/engine/common/frontend/feature-window.cc
@@ -7,6 +7,7 @@
 #include "frontend/feature-window.h"
 
 #include <cmath>
+#include <limits>
 #include <vector>
 
 #ifndef M_2PI
diff --git a/runtime/engine/common/frontend/rfft.cc b/runtime/engine/common/frontend/rfft.cc
index f0a3ebc7a..8cdb634ff 100644
--- a/runtime/engine/common/frontend/rfft.cc
+++ b/runtime/engine/common/frontend/rfft.cc
@@ -17,12 +17,12 @@
  */
 
 #include "frontend/rfft.h"
+#include "base/log.h"
 
 #include <cmath>
+#include <memory>
 #include <vector>
 
-#include "base/log.h"
-
 // see fftsg.c
 #ifdef __cplusplus
 extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);
diff --git a/runtime/engine/common/matrix/kaldi-matrix-inl.h b/runtime/engine/common/matrix/kaldi-matrix-inl.h
index eafbc6fb4..ed18859de 100644
--- a/runtime/engine/common/matrix/kaldi-matrix-inl.h
+++ b/runtime/engine/common/matrix/kaldi-matrix-inl.h
@@ -25,40 +25,41 @@
 namespace kaldi {
 
 /// Empty constructor
-template<typename Real>
-Matrix<Real>::Matrix(): MatrixBase<Real>(NULL, 0, 0, 0) { }
+template <typename Real>
+Matrix<Real>::Matrix() : MatrixBase<Real>(NULL, 0, 0, 0) {}
 
 /*
 template<>
 template<>
-void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float> &ra, const VectorBase<float> &rb);
+void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float>
+&ra, const VectorBase<float> &rb);
 
 template<>
 template<>
-void MatrixBase<double>::AddVecVec(const double alpha, const VectorBase<double> &ra, const VectorBase<double> &rb);
+void MatrixBase<double>::AddVecVec(const double alpha, const VectorBase<double>
+&ra, const VectorBase<double> &rb);
 */
 
-template<typename Real>
-inline std::ostream & operator << (std::ostream & os, const MatrixBase<Real> & M) {
-  M.Write(os, false);
-  return os;
+template <typename Real>
+inline std::ostream& operator<<(std::ostream& os, const MatrixBase<Real>& M) {
+    M.Write(os, false);
+    return os;
 }
 
-template<typename Real>
-inline std::istream & operator >> (std::istream & is, Matrix<Real> & M) {
-  M.Read(is, false);
-  return is;
+template <typename Real>
+inline std::istream& operator>>(std::istream& is, Matrix<Real>& M) {
+    M.Read(is, false);
+    return is;
 }
 
 
-template<typename Real>
-inline std::istream & operator >> (std::istream & is, MatrixBase<Real> & M) {
-  M.Read(is, false);
-  return is;
+template <typename Real>
+inline std::istream& operator>>(std::istream& is, MatrixBase<Real>& M) {
+    M.Read(is, false);
+    return is;
 }
 
-}// namespace kaldi
+}  // namespace kaldi
 
 
 #endif  // KALDI_MATRIX_KALDI_MATRIX_INL_H_
-
diff --git a/runtime/engine/common/matrix/kaldi-matrix.cc b/runtime/engine/common/matrix/kaldi-matrix.cc
index e446a6bf7..6f65fb0a0 100644
--- a/runtime/engine/common/matrix/kaldi-matrix.cc
+++ b/runtime/engine/common/matrix/kaldi-matrix.cc
@@ -166,14 +166,19 @@ void MatrixBase<Real>::AddMatMat(const Real alpha,
                                   const MatrixBase<Real>& B,
                                   MatrixTransposeType transB,
                                   const Real beta) {
-  KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ == B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_)
-               || (transA == kTrans && transB == kNoTrans && A.num_rows_ == B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_)
-               || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
-               || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
+  KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ ==
+B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_)
+               || (transA == kTrans && transB == kNoTrans && A.num_rows_ ==
+B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_)
+               || (transA == kNoTrans && transB == kTrans && A.num_cols_ ==
+B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
+               || (transA == kTrans && transB == kTrans && A.num_rows_ ==
+B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
   KALDI_ASSERT(&A !=  this && &B != this);
   if (num_rows_ == 0) return;
   cblas_Xgemm(alpha, transA, A.data_, A.num_rows_, A.num_cols_, A.stride_,
-              transB, B.data_, B.stride_, beta, data_, num_rows_, num_cols_, stride_);
+              transB, B.data_, B.stride_, beta, data_, num_rows_, num_cols_,
+stride_);
 
 }
 
@@ -191,7 +196,8 @@ void MatrixBase<Real>::SetMatMatDivMat(const MatrixBase<Real>& A,
         id = od * (o / i); /// o / i is either zero or "scale".
       } else {
         id = od; /// Just imagine the scale was 1.0.  This is somehow true in
-        /// expectation; anyway, this case should basically never happen so it doesn't
+        /// expectation; anyway, this case should basically never happen so it
+doesn't
         /// really matter.
       }
       (*this)(r, c) = id;
@@ -200,25 +206,25 @@ void MatrixBase<Real>::SetMatMatDivMat(const MatrixBase<Real>& A,
 }
 */
 
-//template<typename Real>
-//void MatrixBase<Real>::CopyLowerToUpper() {
-  //KALDI_ASSERT(num_rows_ == num_cols_);
-  //Real *data = data_;
-  //MatrixIndexT num_rows = num_rows_, stride = stride_;
-  //for (int32 i = 0; i < num_rows; i++)
-    //for (int32 j = 0; j < i; j++)
-      //data[j * stride + i ] = data[i * stride + j];
+// template<typename Real>
+// void MatrixBase<Real>::CopyLowerToUpper() {
+// KALDI_ASSERT(num_rows_ == num_cols_);
+// Real *data = data_;
+// MatrixIndexT num_rows = num_rows_, stride = stride_;
+// for (int32 i = 0; i < num_rows; i++)
+// for (int32 j = 0; j < i; j++)
+// data[j * stride + i ] = data[i * stride + j];
 //}
 
 
-//template<typename Real>
-//void MatrixBase<Real>::CopyUpperToLower() {
-  //KALDI_ASSERT(num_rows_ == num_cols_);
-  //Real *data = data_;
-  //MatrixIndexT num_rows = num_rows_, stride = stride_;
-  //for (int32 i = 0; i < num_rows; i++)
-    //for (int32 j = 0; j < i; j++)
-      //data[i * stride + j] = data[j * stride + i];
+// template<typename Real>
+// void MatrixBase<Real>::CopyUpperToLower() {
+// KALDI_ASSERT(num_rows_ == num_cols_);
+// Real *data = data_;
+// MatrixIndexT num_rows = num_rows_, stride = stride_;
+// for (int32 i = 0; i < num_rows; i++)
+// for (int32 j = 0; j < i; j++)
+// data[i * stride + j] = data[j * stride + i];
 //}
 
 /*
@@ -263,10 +269,14 @@ void MatrixBase<Real>::AddMatSmat(const Real alpha,
                                   const MatrixBase<Real> &B,
                                   MatrixTransposeType transB,
                                   const Real beta) {
-  KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ == B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_)
-               || (transA == kTrans && transB == kNoTrans && A.num_rows_ == B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_)
-               || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
-               || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
+  KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ ==
+B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_)
+               || (transA == kTrans && transB == kNoTrans && A.num_rows_ ==
+B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_)
+               || (transA == kNoTrans && transB == kTrans && A.num_cols_ ==
+B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
+               || (transA == kTrans && transB == kTrans && A.num_rows_ ==
+B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
   KALDI_ASSERT(&A !=  this && &B != this);
 
   // We iterate over the columns of B.
@@ -301,10 +311,14 @@ void MatrixBase<Real>::AddSmatMat(const Real alpha,
                                   const MatrixBase<Real> &B,
                                   MatrixTransposeType transB,
                                   const Real beta) {
-  KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ == B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_)
-               || (transA == kTrans && transB == kNoTrans && A.num_rows_ == B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_)
-               || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
-               || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
+  KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ ==
+B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_)
+               || (transA == kTrans && transB == kNoTrans && A.num_rows_ ==
+B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_)
+               || (transA == kNoTrans && transB == kTrans && A.num_cols_ ==
+B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
+               || (transA == kTrans && transB == kTrans && A.num_rows_ ==
+B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
   KALDI_ASSERT(&A !=  this && &B != this);
 
   MatrixIndexT Astride = A.stride_, Bstride = B.stride_, stride = this->stride_,
@@ -342,7 +356,8 @@ void MatrixBase<Real>::AddSpSp(const Real alpha, const SpMatrix<Real> &A_in,
   // fully (to save work, we used the matrix constructor from SpMatrix).
   // CblasLeft means A is on the left: C <-- alpha A B + beta C
   if (sz == 0) return;
-  cblas_Xsymm(alpha, sz, A.data_, A.stride_, B.data_, B.stride_, beta, data_, stride_);
+  cblas_Xsymm(alpha, sz, A.data_, A.stride_, B.data_, B.stride_, beta, data_,
+stride_);
 }
 
 template<typename Real>
@@ -352,13 +367,15 @@ void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
     if (transA == kNoTrans) {
       Scale(alpha + 1.0);
     } else {
-      KALDI_ASSERT(num_rows_ == num_cols_ && "AddMat: adding to self (transposed): not symmetric.");
+      KALDI_ASSERT(num_rows_ == num_cols_ && "AddMat: adding to self
+(transposed): not symmetric.");
       Real *data = data_;
       if (alpha == 1.0) {  // common case-- handle separately.
         for (MatrixIndexT row = 0; row < num_rows_; row++) {
           for (MatrixIndexT col = 0; col < row; col++) {
             Real *lower = data + (row * stride_) + col, *upper = data + (col
-                                                                          * stride_) + row;
+                                                                          *
+stride_) + row;
             Real sum = *lower + *upper;
             *lower = *upper = sum;
           }
@@ -368,7 +385,8 @@ void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
         for (MatrixIndexT row = 0; row < num_rows_; row++) {
           for (MatrixIndexT col = 0; col < row; col++) {
             Real *lower = data + (row * stride_) + col, *upper = data + (col
-                                                                          * stride_) + row;
+                                                                          *
+stride_) + row;
             Real lower_tmp = *lower;
             *lower += alpha * *upper;
             *upper += alpha * lower_tmp;
@@ -390,7 +408,8 @@ void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
     } else {
       KALDI_ASSERT(A.num_cols_ == num_rows_ && A.num_rows_ == num_cols_);
       if (num_rows_ == 0) return;
-      for (MatrixIndexT row = 0; row < num_rows_; row++, adata++, data += stride)
+      for (MatrixIndexT row = 0; row < num_rows_; row++, adata++, data +=
+stride)
         cblas_Xaxpy(num_cols_, alpha, adata, aStride, data, 1);
     }
   }
@@ -503,7 +522,8 @@ void MatrixBase<Real>::AddMatSmat(Real alpha, const MatrixBase<Real> &A,
         Real alpha_B_kj = alpha * p.second;
         Real *this_col_j = this->Data() + j;
         // Add to entire 'j'th column of *this at once using cblas_Xaxpy.
-        // pass stride to write a colmun as matrices are stored in row major order.
+        // pass stride to write a colmun as matrices are stored in row major
+order.
         cblas_Xaxpy(this_num_rows, alpha_B_kj, a_col_k, A.stride_,
                     this_col_j, this->stride_);
         //for (MatrixIndexT i = 0; i < this_num_rows; ++i)
@@ -529,10 +549,11 @@ void MatrixBase<Real>::AddMatSmat(Real alpha, const MatrixBase<Real> &A,
         Real alpha_B_jk = alpha * p.second;
         const Real *a_col_k = A.Data() + k;
         // Add to entire 'j'th column of *this at once using cblas_Xaxpy.
-        // pass stride to write a column as matrices are stored in row major order.
+        // pass stride to write a column as matrices are stored in row major
+order.
         cblas_Xaxpy(this_num_rows, alpha_B_jk, a_col_k, A.stride_,
                     this_col_j, this->stride_);
-        //for (MatrixIndexT i = 0; i < this_num_rows; ++i) 
+        //for (MatrixIndexT i = 0; i < this_num_rows; ++i)
         // this_col_j[i*this->stride_] +=  alpha_B_jk * a_col_k[i*A.stride_];
       }
     }
@@ -586,7 +607,8 @@ void MatrixBase<Real>::AddDiagVecMat(
   Real *data = data_;
   const Real *Mdata = M.Data(), *vdata = v.Data();
   if (num_rows_ == 0) return;
-  for (MatrixIndexT i = 0; i < num_rows; i++, data += stride, Mdata += M_row_stride, vdata++)
+  for (MatrixIndexT i = 0; i < num_rows; i++, data += stride, Mdata +=
+M_row_stride, vdata++)
     cblas_Xaxpy(num_cols, alpha * *vdata, Mdata, M_col_stride, data, 1);
 }
 
@@ -620,7 +642,8 @@ void MatrixBase<Real>::AddMatDiagVec(
   if (num_rows_ == 0) return;
   for (MatrixIndexT i = 0; i < num_rows; i++){
       for(MatrixIndexT j = 0; j < num_cols; j ++ ){
-          data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride + j*M_col_stride];
+          data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride +
+j*M_col_stride];
       }
   }
 }
@@ -655,8 +678,10 @@ void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
   KALDI_ASSERT(s != NULL && U_in != this && V_in != this);
 
   Matrix<Real> tmpU, tmpV;
-  if (U_in == NULL) tmpU.Resize(this->num_rows_, 1);  // work-space if U_in empty.
-  if (V_in == NULL) tmpV.Resize(1, this->num_cols_);  // work-space if V_in empty.
+  if (U_in == NULL) tmpU.Resize(this->num_rows_, 1);  // work-space if U_in
+empty.
+  if (V_in == NULL) tmpV.Resize(1, this->num_cols_);  // work-space if V_in
+empty.
 
   /// Impementation notes:
   /// Lapack works in column-order, therefore the dimensions of *this are
@@ -690,8 +715,10 @@ void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
   KaldiBlasInt result;
 
   // query for work space
-  char *u_job = const_cast<char*>(U_in ? "s" : "N");  // "s" == skinny, "N" == "none."
-  char *v_job = const_cast<char*>(V_in ? "s" : "N");  // "s" == skinny, "N" == "none."
+  char *u_job = const_cast<char*>(U_in ? "s" : "N");  // "s" == skinny, "N" ==
+"none."
+  char *v_job = const_cast<char*>(V_in ? "s" : "N");  // "s" == skinny, "N" ==
+"none."
   clapack_Xgesvd(v_job, u_job,
                  &M, &N, data_, &LDA,
                  s->Data(),
@@ -700,7 +727,8 @@ void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
                  &work_query, &l_work,
                  &result);
 
-  KALDI_ASSERT(result >= 0 && "Call to CLAPACK dgesvd_ called with wrong arguments");
+  KALDI_ASSERT(result >= 0 && "Call to CLAPACK dgesvd_ called with wrong
+arguments");
 
   l_work = static_cast<KaldiBlasInt>(work_query);
   Real *p_work;
@@ -718,7 +746,8 @@ void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
                  p_work, &l_work,
                  &result);
 
-  KALDI_ASSERT(result >= 0 && "Call to CLAPACK dgesvd_ called with wrong arguments");
+  KALDI_ASSERT(result >= 0 && "Call to CLAPACK dgesvd_ called with wrong
+arguments");
 
   if (result != 0) {
     KALDI_WARN << "CLAPACK sgesvd_ : some weird convergence not satisfied";
@@ -729,167 +758,166 @@ void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
 #endif
 */
 // Copy constructor.  Copies data to newly allocated memory.
-template<typename Real>
-Matrix<Real>::Matrix (const MatrixBase<Real> & M,
-                      MatrixTransposeType trans/*=kNoTrans*/)
+template <typename Real>
+Matrix<Real>::Matrix(const MatrixBase<Real> &M,
+                     MatrixTransposeType trans /*=kNoTrans*/)
     : MatrixBase<Real>() {
-  if (trans == kNoTrans) {
-    Resize(M.num_rows_, M.num_cols_);
-    this->CopyFromMat(M);
-  } else {
-    Resize(M.num_cols_, M.num_rows_);
-    this->CopyFromMat(M, kTrans);
-  }
+    if (trans == kNoTrans) {
+        Resize(M.num_rows_, M.num_cols_);
+        this->CopyFromMat(M);
+    } else {
+        Resize(M.num_cols_, M.num_rows_);
+        this->CopyFromMat(M, kTrans);
+    }
 }
 
 // Copy constructor.  Copies data to newly allocated memory.
-template<typename Real>
-Matrix<Real>::Matrix (const Matrix<Real> & M):
-    MatrixBase<Real>() {
-  Resize(M.num_rows_, M.num_cols_);
-  this->CopyFromMat(M);
+template <typename Real>
+Matrix<Real>::Matrix(const Matrix<Real> &M) : MatrixBase<Real>() {
+    Resize(M.num_rows_, M.num_cols_);
+    this->CopyFromMat(M);
 }
 
 /// Copy constructor from another type.
-template<typename Real>
-template<typename OtherReal>
-Matrix<Real>::Matrix(const MatrixBase<OtherReal> & M,
-                     MatrixTransposeType trans) : MatrixBase<Real>() {
-  if (trans == kNoTrans) {
-    Resize(M.NumRows(), M.NumCols());
-    this->CopyFromMat(M);
-  } else {
-    Resize(M.NumCols(), M.NumRows());
-    this->CopyFromMat(M, kTrans);
-  }
+template <typename Real>
+template <typename OtherReal>
+Matrix<Real>::Matrix(const MatrixBase<OtherReal> &M, MatrixTransposeType trans)
+    : MatrixBase<Real>() {
+    if (trans == kNoTrans) {
+        Resize(M.NumRows(), M.NumCols());
+        this->CopyFromMat(M);
+    } else {
+        Resize(M.NumCols(), M.NumRows());
+        this->CopyFromMat(M, kTrans);
+    }
 }
 
 // Instantiate this constructor for float->double and double->float.
-template
-Matrix<float>::Matrix(const MatrixBase<double> & M,
-                      MatrixTransposeType trans);
-template
-Matrix<double>::Matrix(const MatrixBase<float> & M,
-                       MatrixTransposeType trans);
+template Matrix<float>::Matrix(const MatrixBase<double> &M,
+                               MatrixTransposeType trans);
+template Matrix<double>::Matrix(const MatrixBase<float> &M,
+                                MatrixTransposeType trans);
 
-template<typename Real>
+template <typename Real>
 inline void Matrix<Real>::Init(const MatrixIndexT rows,
                                const MatrixIndexT cols,
                                const MatrixStrideType stride_type) {
-  if (rows * cols == 0) {
-    KALDI_ASSERT(rows == 0 && cols == 0);
-    this->num_rows_ = 0;
-    this->num_cols_ = 0;
-    this->stride_ = 0;
-    this->data_ = NULL;
-    return;
-  }
-  KALDI_ASSERT(rows > 0 && cols > 0);
-  MatrixIndexT skip, stride;
-  size_t size;
-  void *data;  // aligned memory block
-  void *temp;  // memory block to be really freed
-
-  // compute the size of skip and real cols
-  skip = ((16 / sizeof(Real)) - cols % (16 / sizeof(Real)))
-      % (16 / sizeof(Real));
-  stride = cols + skip;
-  size = static_cast<size_t>(rows) * static_cast<size_t>(stride)
-      * sizeof(Real);
-
-  // allocate the memory and set the right dimensions and parameters
-  if (NULL != (data = KALDI_MEMALIGN(16, size, &temp))) {
-    MatrixBase<Real>::data_        = static_cast<Real *> (data);
-    MatrixBase<Real>::num_rows_      = rows;
-    MatrixBase<Real>::num_cols_      = cols;
-    MatrixBase<Real>::stride_  = (stride_type == kDefaultStride ? stride : cols);
-  } else {
-    throw std::bad_alloc();
-  }
+    if (rows * cols == 0) {
+        KALDI_ASSERT(rows == 0 && cols == 0);
+        this->num_rows_ = 0;
+        this->num_cols_ = 0;
+        this->stride_ = 0;
+        this->data_ = NULL;
+        return;
+    }
+    KALDI_ASSERT(rows > 0 && cols > 0);
+    MatrixIndexT skip, stride;
+    size_t size;
+    void *data;  // aligned memory block
+    void *temp;  // memory block to be really freed
+
+    // compute the size of skip and real cols
+    skip = ((16 / sizeof(Real)) - cols % (16 / sizeof(Real))) %
+           (16 / sizeof(Real));
+    stride = cols + skip;
+    size =
+        static_cast<size_t>(rows) * static_cast<size_t>(stride) * sizeof(Real);
+
+    // allocate the memory and set the right dimensions and parameters
+    if (NULL != (data = KALDI_MEMALIGN(16, size, &temp))) {
+        MatrixBase<Real>::data_ = static_cast<Real *>(data);
+        MatrixBase<Real>::num_rows_ = rows;
+        MatrixBase<Real>::num_cols_ = cols;
+        MatrixBase<Real>::stride_ =
+            (stride_type == kDefaultStride ? stride : cols);
+    } else {
+        throw std::bad_alloc();
+    }
 }
 
-template<typename Real>
+template <typename Real>
 void Matrix<Real>::Resize(const MatrixIndexT rows,
                           const MatrixIndexT cols,
                           MatrixResizeType resize_type,
                           MatrixStrideType stride_type) {
-  // the next block uses recursion to handle what we have to do if
-  // resize_type == kCopyData.
-  if (resize_type == kCopyData) {
-    if (this->data_ == NULL || rows == 0) resize_type = kSetZero;  // nothing to copy.
-    else if (rows == this->num_rows_ && cols == this->num_cols_ &&
-	     (stride_type == kDefaultStride || this->stride_ == this->num_cols_)) { return; } // nothing to do.
-    else {
-      // set tmp to a matrix of the desired size; if new matrix
-      // is bigger in some dimension, zero it.
-      MatrixResizeType new_resize_type =
-          (rows > this->num_rows_ || cols > this->num_cols_) ? kSetZero : kUndefined;
-      Matrix<Real> tmp(rows, cols, new_resize_type, stride_type);
-      MatrixIndexT rows_min = std::min(rows, this->num_rows_),
-          cols_min = std::min(cols, this->num_cols_);
-      tmp.Range(0, rows_min, 0, cols_min).
-          CopyFromMat(this->Range(0, rows_min, 0, cols_min));
-      tmp.Swap(this);
-      // and now let tmp go out of scope, deleting what was in *this.
-      return;
+    // the next block uses recursion to handle what we have to do if
+    // resize_type == kCopyData.
+    if (resize_type == kCopyData) {
+        if (this->data_ == NULL || rows == 0)
+            resize_type = kSetZero;  // nothing to copy.
+        else if (rows == this->num_rows_ && cols == this->num_cols_ &&
+                 (stride_type == kDefaultStride ||
+                  this->stride_ == this->num_cols_)) {
+            return;
+        }  // nothing to do.
+        else {
+            // set tmp to a matrix of the desired size; if new matrix
+            // is bigger in some dimension, zero it.
+            MatrixResizeType new_resize_type =
+                (rows > this->num_rows_ || cols > this->num_cols_) ? kSetZero
+                                                                   : kUndefined;
+            Matrix<Real> tmp(rows, cols, new_resize_type, stride_type);
+            MatrixIndexT rows_min = std::min(rows, this->num_rows_),
+                         cols_min = std::min(cols, this->num_cols_);
+            tmp.Range(0, rows_min, 0, cols_min)
+                .CopyFromMat(this->Range(0, rows_min, 0, cols_min));
+            tmp.Swap(this);
+            // and now let tmp go out of scope, deleting what was in *this.
+            return;
+        }
     }
-  }
-  // At this point, resize_type == kSetZero or kUndefined.
+    // At this point, resize_type == kSetZero or kUndefined.
 
-  if (MatrixBase<Real>::data_ != NULL) {
-    if (rows == MatrixBase<Real>::num_rows_
-        && cols == MatrixBase<Real>::num_cols_) {
-      if (resize_type == kSetZero)
-        this->SetZero();
-      return;
+    if (MatrixBase<Real>::data_ != NULL) {
+        if (rows == MatrixBase<Real>::num_rows_ &&
+            cols == MatrixBase<Real>::num_cols_) {
+            if (resize_type == kSetZero) this->SetZero();
+            return;
+        } else
+            Destroy();
     }
-    else
-      Destroy();
-  }
-  Init(rows, cols, stride_type);
-  if (resize_type == kSetZero) MatrixBase<Real>::SetZero();
+    Init(rows, cols, stride_type);
+    if (resize_type == kSetZero) MatrixBase<Real>::SetZero();
 }
 
-template<typename Real>
-template<typename OtherReal>
+template <typename Real>
+template <typename OtherReal>
 void MatrixBase<Real>::CopyFromMat(const MatrixBase<OtherReal> &M,
                                    MatrixTransposeType Trans) {
-  if (sizeof(Real) == sizeof(OtherReal) &&
-      static_cast<const void*>(M.Data()) ==
-      static_cast<const void*>(this->Data())) {
-    // CopyFromMat called on same data.  Nothing to do (except sanity checks).
-    KALDI_ASSERT(Trans == kNoTrans && M.NumRows() == NumRows() &&
-                 M.NumCols() == NumCols() && M.Stride() == Stride());
-    return;
-  }
-  if (Trans == kNoTrans) {
-    KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == M.NumCols());
-    for (MatrixIndexT i = 0; i < num_rows_; i++)
-      (*this).Row(i).CopyFromVec(M.Row(i));
-  } else {
-    KALDI_ASSERT(num_cols_ == M.NumRows() && num_rows_ == M.NumCols());
-    int32 this_stride = stride_, other_stride = M.Stride();
-    Real *this_data = data_;
-    const OtherReal *other_data = M.Data();
-    for (MatrixIndexT i = 0; i < num_rows_; i++)
-      for (MatrixIndexT j = 0; j < num_cols_; j++)
-        this_data[i * this_stride + j] = other_data[j * other_stride + i];
-  }
+    if (sizeof(Real) == sizeof(OtherReal) &&
+        static_cast<const void *>(M.Data()) ==
+            static_cast<const void *>(this->Data())) {
+        // CopyFromMat called on same data.  Nothing to do (except sanity
+        // checks).
+        KALDI_ASSERT(Trans == kNoTrans && M.NumRows() == NumRows() &&
+                     M.NumCols() == NumCols() && M.Stride() == Stride());
+        return;
+    }
+    if (Trans == kNoTrans) {
+        KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == M.NumCols());
+        for (MatrixIndexT i = 0; i < num_rows_; i++)
+            (*this).Row(i).CopyFromVec(M.Row(i));
+    } else {
+        KALDI_ASSERT(num_cols_ == M.NumRows() && num_rows_ == M.NumCols());
+        int32 this_stride = stride_, other_stride = M.Stride();
+        Real *this_data = data_;
+        const OtherReal *other_data = M.Data();
+        for (MatrixIndexT i = 0; i < num_rows_; i++)
+            for (MatrixIndexT j = 0; j < num_cols_; j++)
+                this_data[i * this_stride + j] =
+                    other_data[j * other_stride + i];
+    }
 }
 
 // template instantiations.
-template
-void MatrixBase<float>::CopyFromMat(const MatrixBase<double> & M,
-                                    MatrixTransposeType Trans);
-template
-void MatrixBase<double>::CopyFromMat(const MatrixBase<float> & M,
-                                     MatrixTransposeType Trans);
-template
-void MatrixBase<float>::CopyFromMat(const MatrixBase<float> & M,
-                                    MatrixTransposeType Trans);
-template
-void MatrixBase<double>::CopyFromMat(const MatrixBase<double> & M,
-                                     MatrixTransposeType Trans);
+template void MatrixBase<float>::CopyFromMat(const MatrixBase<double> &M,
+                                             MatrixTransposeType Trans);
+template void MatrixBase<double>::CopyFromMat(const MatrixBase<float> &M,
+                                              MatrixTransposeType Trans);
+template void MatrixBase<float>::CopyFromMat(const MatrixBase<float> &M,
+                                             MatrixTransposeType Trans);
+template void MatrixBase<double>::CopyFromMat(const MatrixBase<double> &M,
+                                              MatrixTransposeType Trans);
 
 /*
 // Specialize the template for CopyFromSp for float, float.
@@ -987,99 +1015,97 @@ void MatrixBase<double>::CopyFromTp(const TpMatrix<double> & M,
                                     MatrixTransposeType trans);
 
 */
-template<typename Real>
+template <typename Real>
 void MatrixBase<Real>::CopyRowsFromVec(const VectorBase<Real> &rv) {
-  if (rv.Dim() == num_rows_*num_cols_) {
-    if (stride_ == num_cols_) {
-      // one big copy operation.
-      const Real *rv_data = rv.Data();
-      std::memcpy(data_, rv_data, sizeof(Real)*num_rows_*num_cols_);
-    } else {
-      const Real *rv_data = rv.Data();
-      for (MatrixIndexT r = 0; r < num_rows_; r++) {
-        Real *row_data = RowData(r);
-        for (MatrixIndexT c = 0; c < num_cols_; c++) {
-          row_data[c] = rv_data[c];
+    if (rv.Dim() == num_rows_ * num_cols_) {
+        if (stride_ == num_cols_) {
+            // one big copy operation.
+            const Real *rv_data = rv.Data();
+            std::memcpy(data_, rv_data, sizeof(Real) * num_rows_ * num_cols_);
+        } else {
+            const Real *rv_data = rv.Data();
+            for (MatrixIndexT r = 0; r < num_rows_; r++) {
+                Real *row_data = RowData(r);
+                for (MatrixIndexT c = 0; c < num_cols_; c++) {
+                    row_data[c] = rv_data[c];
+                }
+                rv_data += num_cols_;
+            }
         }
-        rv_data += num_cols_;
-      }
+    } else if (rv.Dim() == num_cols_) {
+        const Real *rv_data = rv.Data();
+        for (MatrixIndexT r = 0; r < num_rows_; r++)
+            std::memcpy(RowData(r), rv_data, sizeof(Real) * num_cols_);
+    } else {
+        KALDI_ERR << "Wrong sized arguments";
     }
-  } else if (rv.Dim() == num_cols_) {
-    const Real *rv_data = rv.Data();
-    for (MatrixIndexT r = 0; r < num_rows_; r++)
-      std::memcpy(RowData(r), rv_data, sizeof(Real)*num_cols_);
-  } else {
-    KALDI_ERR << "Wrong sized arguments";
-  }
 }
 
-template<typename Real>
-template<typename OtherReal>
+template <typename Real>
+template <typename OtherReal>
 void MatrixBase<Real>::CopyRowsFromVec(const VectorBase<OtherReal> &rv) {
-  if (rv.Dim() == num_rows_*num_cols_) {
-    const OtherReal *rv_data = rv.Data();
-    for (MatrixIndexT r = 0; r < num_rows_; r++) {
-      Real *row_data = RowData(r);
-      for (MatrixIndexT c = 0; c < num_cols_; c++) {
-        row_data[c] = static_cast<Real>(rv_data[c]);
-      }
-      rv_data += num_cols_;
+    if (rv.Dim() == num_rows_ * num_cols_) {
+        const OtherReal *rv_data = rv.Data();
+        for (MatrixIndexT r = 0; r < num_rows_; r++) {
+            Real *row_data = RowData(r);
+            for (MatrixIndexT c = 0; c < num_cols_; c++) {
+                row_data[c] = static_cast<Real>(rv_data[c]);
+            }
+            rv_data += num_cols_;
+        }
+    } else if (rv.Dim() == num_cols_) {
+        const OtherReal *rv_data = rv.Data();
+        Real *first_row_data = RowData(0);
+        for (MatrixIndexT c = 0; c < num_cols_; c++)
+            first_row_data[c] = rv_data[c];
+        for (MatrixIndexT r = 1; r < num_rows_; r++)
+            std::memcpy(RowData(r), first_row_data, sizeof(Real) * num_cols_);
+    } else {
+        KALDI_ERR << "Wrong sized arguments.";
     }
-  } else if (rv.Dim() == num_cols_) {
-    const OtherReal *rv_data = rv.Data();
-    Real *first_row_data = RowData(0);
-    for (MatrixIndexT c = 0; c < num_cols_; c++)
-      first_row_data[c] = rv_data[c];
-    for (MatrixIndexT r = 1; r < num_rows_; r++)
-      std::memcpy(RowData(r), first_row_data, sizeof(Real)*num_cols_);
-  } else {
-    KALDI_ERR << "Wrong sized arguments.";
-  }
 }
 
 
-template
-void MatrixBase<float>::CopyRowsFromVec(const VectorBase<double> &rv);
-template
-void MatrixBase<double>::CopyRowsFromVec(const VectorBase<float> &rv);
+template void MatrixBase<float>::CopyRowsFromVec(const VectorBase<double> &rv);
+template void MatrixBase<double>::CopyRowsFromVec(const VectorBase<float> &rv);
 
-template<typename Real>
+template <typename Real>
 void MatrixBase<Real>::CopyColsFromVec(const VectorBase<Real> &rv) {
-  if (rv.Dim() == num_rows_*num_cols_) {
-    const Real *v_inc_data = rv.Data();
-    Real *m_inc_data = data_;
+    if (rv.Dim() == num_rows_ * num_cols_) {
+        const Real *v_inc_data = rv.Data();
+        Real *m_inc_data = data_;
 
-    for (MatrixIndexT c = 0; c < num_cols_; c++) {
-      for (MatrixIndexT r = 0; r < num_rows_; r++) {
-        m_inc_data[r * stride_] = v_inc_data[r];
-      }
-      v_inc_data += num_rows_;
-      m_inc_data ++;
-    }
-  } else if (rv.Dim() == num_rows_) {
-    const Real *v_inc_data = rv.Data();
-    Real *m_inc_data = data_;
-    for (MatrixIndexT r = 0; r < num_rows_; r++) {
-      Real value = *(v_inc_data++);
-      for (MatrixIndexT c = 0; c < num_cols_; c++)
-        m_inc_data[c] = value;
-      m_inc_data += stride_;
+        for (MatrixIndexT c = 0; c < num_cols_; c++) {
+            for (MatrixIndexT r = 0; r < num_rows_; r++) {
+                m_inc_data[r * stride_] = v_inc_data[r];
+            }
+            v_inc_data += num_rows_;
+            m_inc_data++;
+        }
+    } else if (rv.Dim() == num_rows_) {
+        const Real *v_inc_data = rv.Data();
+        Real *m_inc_data = data_;
+        for (MatrixIndexT r = 0; r < num_rows_; r++) {
+            Real value = *(v_inc_data++);
+            for (MatrixIndexT c = 0; c < num_cols_; c++) m_inc_data[c] = value;
+            m_inc_data += stride_;
+        }
+    } else {
+        KALDI_ERR << "Wrong size of arguments.";
     }
-  } else {
-    KALDI_ERR << "Wrong size of arguments.";
-  }
 }
 
-template<typename Real>
-void MatrixBase<Real>::CopyRowFromVec(const VectorBase<Real> &rv, const MatrixIndexT row) {
-  KALDI_ASSERT(rv.Dim() == num_cols_ &&
-               static_cast<UnsignedMatrixIndexT>(row) <
-               static_cast<UnsignedMatrixIndexT>(num_rows_));
+template <typename Real>
+void MatrixBase<Real>::CopyRowFromVec(const VectorBase<Real> &rv,
+                                      const MatrixIndexT row) {
+    KALDI_ASSERT(rv.Dim() == num_cols_ &&
+                 static_cast<UnsignedMatrixIndexT>(row) <
+                     static_cast<UnsignedMatrixIndexT>(num_rows_));
 
-  const Real *rv_data = rv.Data();
-  Real *row_data = RowData(row);
+    const Real *rv_data = rv.Data();
+    Real *row_data = RowData(row);
 
-  std::memcpy(row_data, rv_data, num_cols_ * sizeof(Real));
+    std::memcpy(row_data, rv_data, num_cols_ * sizeof(Real));
 }
 /*
 template<typename Real>
@@ -1091,40 +1117,40 @@ void MatrixBase<Real>::CopyDiagFromVec(const VectorBase<Real> &rv) {
     *my_data = *rv_data;
 }*/
 
-template<typename Real>
+template <typename Real>
 void MatrixBase<Real>::CopyColFromVec(const VectorBase<Real> &rv,
                                       const MatrixIndexT col) {
-  KALDI_ASSERT(rv.Dim() == num_rows_ &&
-               static_cast<UnsignedMatrixIndexT>(col) <
-               static_cast<UnsignedMatrixIndexT>(num_cols_));
+    KALDI_ASSERT(rv.Dim() == num_rows_ &&
+                 static_cast<UnsignedMatrixIndexT>(col) <
+                     static_cast<UnsignedMatrixIndexT>(num_cols_));
 
-  const Real *rv_data = rv.Data();
-  Real *col_data = data_ + col;
+    const Real *rv_data = rv.Data();
+    Real *col_data = data_ + col;
 
-  for (MatrixIndexT r = 0; r < num_rows_; r++)
-    col_data[r * stride_] = rv_data[r];
+    for (MatrixIndexT r = 0; r < num_rows_; r++)
+        col_data[r * stride_] = rv_data[r];
 }
 
 
-
-template<typename Real>
+template <typename Real>
 void Matrix<Real>::RemoveRow(MatrixIndexT i) {
-  KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
-               static_cast<UnsignedMatrixIndexT>(MatrixBase<Real>::num_rows_)
-               && "Access out of matrix");
-  for (MatrixIndexT j = i + 1; j <  MatrixBase<Real>::num_rows_; j++)
-    MatrixBase<Real>::Row(j-1).CopyFromVec( MatrixBase<Real>::Row(j));
-  MatrixBase<Real>::num_rows_--;
+    KALDI_ASSERT(
+        static_cast<UnsignedMatrixIndexT>(i) <
+            static_cast<UnsignedMatrixIndexT>(MatrixBase<Real>::num_rows_) &&
+        "Access out of matrix");
+    for (MatrixIndexT j = i + 1; j < MatrixBase<Real>::num_rows_; j++)
+        MatrixBase<Real>::Row(j - 1).CopyFromVec(MatrixBase<Real>::Row(j));
+    MatrixBase<Real>::num_rows_--;
 }
 
-template<typename Real>
+template <typename Real>
 void Matrix<Real>::Destroy() {
-  // we need to free the data block if it was defined
-  if (NULL != MatrixBase<Real>::data_)
-    KALDI_MEMALIGN_FREE( MatrixBase<Real>::data_);
-  MatrixBase<Real>::data_ = NULL;
-  MatrixBase<Real>::num_rows_ = MatrixBase<Real>::num_cols_
-      = MatrixBase<Real>::stride_ = 0;
+    // we need to free the data block if it was defined
+    if (NULL != MatrixBase<Real>::data_)
+        KALDI_MEMALIGN_FREE(MatrixBase<Real>::data_);
+    MatrixBase<Real>::data_ = NULL;
+    MatrixBase<Real>::num_rows_ = MatrixBase<Real>::num_cols_ =
+        MatrixBase<Real>::stride_ = 0;
 }
 
 
@@ -1248,7 +1274,8 @@ template<typename Real>
 void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &input,
                                        const MatrixBase<Real> &output,
                                        Real power) {
-  KALDI_ASSERT(input.NumCols() == this->NumCols() && input.NumRows() == this->NumRows());
+  KALDI_ASSERT(input.NumCols() == this->NumCols() && input.NumRows() ==
+this->NumRows());
   KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 &&
                this->NumRows() == output.NumRows());
 
@@ -1320,22 +1347,22 @@ void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) {
 }
 */
 
-template<typename Real>
+template <typename Real>
 void MatrixBase<Real>::SetZero() {
-  if (num_cols_ == stride_)
-    memset(data_, 0, sizeof(Real)*num_rows_*num_cols_);
-  else
-    for (MatrixIndexT row = 0; row < num_rows_; row++)
-      memset(data_ + row*stride_, 0, sizeof(Real)*num_cols_);
+    if (num_cols_ == stride_)
+        memset(data_, 0, sizeof(Real) * num_rows_ * num_cols_);
+    else
+        for (MatrixIndexT row = 0; row < num_rows_; row++)
+            memset(data_ + row * stride_, 0, sizeof(Real) * num_cols_);
 }
 
-template<typename Real>
+template <typename Real>
 void MatrixBase<Real>::Set(Real value) {
-  for (MatrixIndexT row = 0; row < num_rows_; row++) {
-    for (MatrixIndexT col = 0; col < num_cols_; col++) {
-      (*this)(row, col) = value;
+    for (MatrixIndexT row = 0; row < num_rows_; row++) {
+        for (MatrixIndexT col = 0; col < num_cols_; col++) {
+            (*this)(row, col) = value;
+        }
     }
-  }
 }
 
 /*
@@ -1355,7 +1382,8 @@ void MatrixBase<Real>::SetRandn() {
     for (MatrixIndexT col = 0; col < nc; col += 2) {
       kaldi::RandGauss2(row_data + col, row_data + col + 1, &rstate);
     }
-    if (nc != num_cols_) row_data[nc] = static_cast<Real>(kaldi::RandGauss(&rstate));
+    if (nc != num_cols_) row_data[nc] =
+static_cast<Real>(kaldi::RandGauss(&rstate));
   }
 }
 
@@ -1371,273 +1399,302 @@ void MatrixBase<Real>::SetRandUniform() {
 }
 */
 
-template<typename Real>
+template <typename Real>
 void MatrixBase<Real>::Write(std::ostream &os, bool binary) const {
-  if (!os.good()) {
-    KALDI_ERR << "Failed to write matrix to stream: stream not good";
-  }
-  if (binary) {  // Use separate binary and text formats,
-    // since in binary mode we need to know if it's float or double.
-    std::string my_token = (sizeof(Real) == 4 ? "FM" : "DM");
-
-    WriteToken(os, binary, my_token);
-    {
-      int32 rows = this->num_rows_;  // make the size 32-bit on disk.
-      int32 cols = this->num_cols_;
-      KALDI_ASSERT(this->num_rows_ == (MatrixIndexT) rows);
-      KALDI_ASSERT(this->num_cols_ == (MatrixIndexT) cols);
-      WriteBasicType(os, binary, rows);
-      WriteBasicType(os, binary, cols);
-    }
-    if (Stride() == NumCols())
-      os.write(reinterpret_cast<const char*> (Data()), sizeof(Real)
-               * static_cast<size_t>(num_rows_) * static_cast<size_t>(num_cols_));
-    else
-      for (MatrixIndexT i = 0; i < num_rows_; i++)
-        os.write(reinterpret_cast<const char*> (RowData(i)), sizeof(Real)
-                 * num_cols_);
     if (!os.good()) {
-      KALDI_ERR << "Failed to write matrix to stream";
-    }
-  } else {  // text mode.
-    if (num_cols_ == 0) {
-      os << " [ ]\n";
-    } else {
-      os << " [";
-      for (MatrixIndexT i = 0; i < num_rows_; i++) {
-        os << "\n  ";
-        for (MatrixIndexT j = 0; j < num_cols_; j++)
-          os << (*this)(i, j) << " ";
-      }
-      os << "]\n";
+        KALDI_ERR << "Failed to write matrix to stream: stream not good";
+    }
+    if (binary) {  // Use separate binary and text formats,
+        // since in binary mode we need to know if it's float or double.
+        std::string my_token = (sizeof(Real) == 4 ? "FM" : "DM");
+
+        WriteToken(os, binary, my_token);
+        {
+            int32 rows = this->num_rows_;  // make the size 32-bit on disk.
+            int32 cols = this->num_cols_;
+            KALDI_ASSERT(this->num_rows_ == (MatrixIndexT)rows);
+            KALDI_ASSERT(this->num_cols_ == (MatrixIndexT)cols);
+            WriteBasicType(os, binary, rows);
+            WriteBasicType(os, binary, cols);
+        }
+        if (Stride() == NumCols())
+            os.write(reinterpret_cast<const char *>(Data()),
+                     sizeof(Real) * static_cast<size_t>(num_rows_) *
+                         static_cast<size_t>(num_cols_));
+        else
+            for (MatrixIndexT i = 0; i < num_rows_; i++)
+                os.write(reinterpret_cast<const char *>(RowData(i)),
+                         sizeof(Real) * num_cols_);
+        if (!os.good()) {
+            KALDI_ERR << "Failed to write matrix to stream";
+        }
+    } else {  // text mode.
+        if (num_cols_ == 0) {
+            os << " [ ]\n";
+        } else {
+            os << " [";
+            for (MatrixIndexT i = 0; i < num_rows_; i++) {
+                os << "\n  ";
+                for (MatrixIndexT j = 0; j < num_cols_; j++)
+                    os << (*this)(i, j) << " ";
+            }
+            os << "]\n";
+        }
     }
-  }
 }
 
 
-template<typename Real>
-void MatrixBase<Real>::Read(std::istream & is, bool binary) {
-  //  In order to avoid rewriting this, we just declare a Matrix and
-  // use it to read the data, then copy.
-  Matrix<Real> tmp;
-  tmp.Read(is, binary);
-  if (tmp.NumRows() != NumRows() || tmp.NumCols() != NumCols()) {
-    KALDI_ERR << "MatrixBase<Real>::Read, size mismatch "
-              << NumRows() << " x " << NumCols() << " versus "
-              << tmp.NumRows() << " x " << tmp.NumCols();
-  }
-  CopyFromMat(tmp);
+template <typename Real>
+void MatrixBase<Real>::Read(std::istream &is, bool binary) {
+    //  In order to avoid rewriting this, we just declare a Matrix and
+    // use it to read the data, then copy.
+    Matrix<Real> tmp;
+    tmp.Read(is, binary);
+    if (tmp.NumRows() != NumRows() || tmp.NumCols() != NumCols()) {
+        KALDI_ERR << "MatrixBase<Real>::Read, size mismatch " << NumRows()
+                  << " x " << NumCols() << " versus " << tmp.NumRows() << " x "
+                  << tmp.NumCols();
+    }
+    CopyFromMat(tmp);
 }
 
 
-template<typename Real>
-void Matrix<Real>::Read(std::istream & is, bool binary) {
-  // now assume add == false.
-  MatrixIndexT pos_at_start = is.tellg();
-  std::ostringstream specific_error;
-
-  if (binary) {  // Read in binary mode.
-    int peekval = Peek(is, binary);
-    if (peekval == 'C') {
-      // This code enables us to read CompressedMatrix as a regular matrix.
-      //CompressedMatrix compressed_mat;
-      //compressed_mat.Read(is, binary); // at this point, add == false.
-      //this->Resize(compressed_mat.NumRows(), compressed_mat.NumCols());
-      //compressed_mat.CopyToMat(this);
-      return;
-    }
-    const char *my_token =  (sizeof(Real) == 4 ? "FM" : "DM");
-    char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F');
-    if (peekval == other_token_start) {  // need to instantiate the other type to read it.
-      typedef typename OtherReal<Real>::Real OtherType;  // if Real == float, OtherType == double, and vice versa.
-      Matrix<OtherType> other(this->num_rows_, this->num_cols_);
-      other.Read(is, binary);  // add is false at this point anyway.
-      this->Resize(other.NumRows(), other.NumCols());
-      this->CopyFromMat(other);
-      return;
-    }
-    std::string token;
-    ReadToken(is, binary, &token);
-    if (token != my_token) {
-      if (token.length() > 20) token = token.substr(0, 17) + "...";
-      specific_error << ": Expected token " << my_token << ", got " << token;
-      goto bad;
-    }
-    int32 rows, cols;
-    ReadBasicType(is, binary, &rows);  // throws on error.
-    ReadBasicType(is, binary, &cols);  // throws on error.
-    if ((MatrixIndexT)rows != this->num_rows_ || (MatrixIndexT)cols != this->num_cols_) {
-      this->Resize(rows, cols);
-    }
-    if (this->Stride() == this->NumCols() && rows*cols!=0) {
-      is.read(reinterpret_cast<char*>(this->Data()),
-              sizeof(Real)*rows*cols);
-      if (is.fail()) goto bad;
-    } else {
-      for (MatrixIndexT i = 0; i < (MatrixIndexT)rows; i++) {
-        is.read(reinterpret_cast<char*>(this->RowData(i)), sizeof(Real)*cols);
-        if (is.fail()) goto bad;
-      }
-    }
-    if (is.eof()) return;
-    if (is.fail()) goto bad;
-    return;
-  } else {  // Text mode.
-    std::string str;
-    is >> str; // get a token
-    if (is.fail()) { specific_error << ": Expected \"[\", got EOF"; goto bad; }
-    // if ((str.compare("DM") == 0) || (str.compare("FM") == 0)) {  // Back compatibility.
-    // is >> str;  // get #rows
-    //  is >> str;  // get #cols
-    //  is >> str;  // get "["
-    // }
-    if (str == "[]") { Resize(0, 0); return; } // Be tolerant of variants.
-    else if (str != "[") {
-      if (str.length() > 20) str = str.substr(0, 17) + "...";
-      specific_error << ": Expected \"[\", got \"" << str << '"';
-      goto bad;
-    }
-    // At this point, we have read "[".
-    std::vector<std::vector<Real>* > data;
-    std::vector<Real> *cur_row = new std::vector<Real>;
-    while (1) {
-      int i = is.peek();
-      if (i == -1) { specific_error << "Got EOF while reading matrix data"; goto cleanup; }
-      else if (static_cast<char>(i) == ']') {  // Finished reading matrix.
-        is.get();  // eat the "]".
-        i = is.peek();
-        if (static_cast<char>(i) == '\r') {
-          is.get();
-          is.get();  // get \r\n (must eat what we wrote)
-        } else if (static_cast<char>(i) == '\n') { is.get(); } // get \n (must eat what we wrote)
-        if (is.fail()) {
-          KALDI_WARN << "After end of matrix data, read error.";
-          // we got the data we needed, so just warn for this error.
+template <typename Real>
+void Matrix<Real>::Read(std::istream &is, bool binary) {
+    // now assume add == false.
+    MatrixIndexT pos_at_start = is.tellg();
+    std::ostringstream specific_error;
+
+    if (binary) {  // Read in binary mode.
+        int peekval = Peek(is, binary);
+        if (peekval == 'C') {
+            // This code enables us to read CompressedMatrix as a regular
+            // matrix.
+            // CompressedMatrix compressed_mat;
+            // compressed_mat.Read(is, binary); // at this point, add == false.
+            // this->Resize(compressed_mat.NumRows(), compressed_mat.NumCols());
+            // compressed_mat.CopyToMat(this);
+            return;
         }
-        // Now process the data.
-        if (!cur_row->empty()) data.push_back(cur_row);
-        else delete(cur_row);
-        cur_row = NULL;
-        if (data.empty()) { this->Resize(0, 0); return; }
-        else {
-          int32 num_rows = data.size(), num_cols = data[0]->size();
-          this->Resize(num_rows, num_cols);
-          for (int32 i = 0; i < num_rows; i++) {
-            if (static_cast<int32>(data[i]->size()) != num_cols) {
-              specific_error << "Matrix has inconsistent #cols: " << num_cols
-                             << " vs." << data[i]->size() << " (processing row"
-                             << i << ")";
-              goto cleanup;
+        const char *my_token = (sizeof(Real) == 4 ? "FM" : "DM");
+        char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F');
+        if (peekval == other_token_start) {  // need to instantiate the other
+                                             // type to read it.
+            typedef typename OtherReal<Real>::Real OtherType;  // if Real ==
+                                                               // float,
+                                                               // OtherType ==
+                                                               // double, and
+                                                               // vice versa.
+            Matrix<OtherType> other(this->num_rows_, this->num_cols_);
+            other.Read(is, binary);  // add is false at this point anyway.
+            this->Resize(other.NumRows(), other.NumCols());
+            this->CopyFromMat(other);
+            return;
+        }
+        std::string token;
+        ReadToken(is, binary, &token);
+        if (token != my_token) {
+            if (token.length() > 20) token = token.substr(0, 17) + "...";
+            specific_error << ": Expected token " << my_token << ", got "
+                           << token;
+            goto bad;
+        }
+        int32 rows, cols;
+        ReadBasicType(is, binary, &rows);  // throws on error.
+        ReadBasicType(is, binary, &cols);  // throws on error.
+        if ((MatrixIndexT)rows != this->num_rows_ ||
+            (MatrixIndexT)cols != this->num_cols_) {
+            this->Resize(rows, cols);
+        }
+        if (this->Stride() == this->NumCols() && rows * cols != 0) {
+            is.read(reinterpret_cast<char *>(this->Data()),
+                    sizeof(Real) * rows * cols);
+            if (is.fail()) goto bad;
+        } else {
+            for (MatrixIndexT i = 0; i < (MatrixIndexT)rows; i++) {
+                is.read(reinterpret_cast<char *>(this->RowData(i)),
+                        sizeof(Real) * cols);
+                if (is.fail()) goto bad;
             }
-            for (int32 j = 0; j < num_cols; j++)
-              (*this)(i, j) = (*(data[i]))[j];
-            delete data[i];
-            data[i] = NULL;
-          }
         }
+        if (is.eof()) return;
+        if (is.fail()) goto bad;
         return;
-      } else if (static_cast<char>(i) == '\n' || static_cast<char>(i) == ';') {
-        // End of matrix row.
-        is.get();
-        if (cur_row->size() != 0) {
-          data.push_back(cur_row);
-          cur_row = new std::vector<Real>;
-          cur_row->reserve(data.back()->size());
-        }
-      } else if ( (i >= '0' && i <= '9') || i == '-' ) {  // A number...
-        Real r;
-        is >> r;
+    } else {  // Text mode.
+        std::string str;
+        is >> str;  // get a token
         if (is.fail()) {
-          specific_error << "Stream failure/EOF while reading matrix data.";
-          goto cleanup;
+            specific_error << ": Expected \"[\", got EOF";
+            goto bad;
         }
-        cur_row->push_back(r);
-      } else if (isspace(i)) {
-        is.get();  // eat the space and do nothing.
-      } else {  // NaN or inf or error.
-        std::string str;
-        is >> str;
-        if (!KALDI_STRCASECMP(str.c_str(), "inf") ||
-            !KALDI_STRCASECMP(str.c_str(), "infinity")) {
-          cur_row->push_back(std::numeric_limits<Real>::infinity());
-          KALDI_WARN << "Reading infinite value into matrix.";
-        } else if (!KALDI_STRCASECMP(str.c_str(), "nan")) {
-          cur_row->push_back(std::numeric_limits<Real>::quiet_NaN());
-          KALDI_WARN << "Reading NaN value into matrix.";
-        } else {
-          if (str.length() > 20) str = str.substr(0, 17) + "...";
-          specific_error << "Expecting numeric matrix data, got " << str;
-          goto cleanup;
+        // if ((str.compare("DM") == 0) || (str.compare("FM") == 0)) {  // Back
+        // compatibility.
+        // is >> str;  // get #rows
+        //  is >> str;  // get #cols
+        //  is >> str;  // get "["
+        // }
+        if (str == "[]") {
+            Resize(0, 0);
+            return;
+        }  // Be tolerant of variants.
+        else if (str != "[") {
+            if (str.length() > 20) str = str.substr(0, 17) + "...";
+            specific_error << ": Expected \"[\", got \"" << str << '"';
+            goto bad;
+        }
+        // At this point, we have read "[".
+        std::vector<std::vector<Real> *> data;
+        std::vector<Real> *cur_row = new std::vector<Real>;
+        while (1) {
+            int i = is.peek();
+            if (i == -1) {
+                specific_error << "Got EOF while reading matrix data";
+                goto cleanup;
+            } else if (static_cast<char>(i) ==
+                       ']') {  // Finished reading matrix.
+                is.get();      // eat the "]".
+                i = is.peek();
+                if (static_cast<char>(i) == '\r') {
+                    is.get();
+                    is.get();  // get \r\n (must eat what we wrote)
+                } else if (static_cast<char>(i) == '\n') {
+                    is.get();
+                }  // get \n (must eat what we wrote)
+                if (is.fail()) {
+                    KALDI_WARN << "After end of matrix data, read error.";
+                    // we got the data we needed, so just warn for this error.
+                }
+                // Now process the data.
+                if (!cur_row->empty())
+                    data.push_back(cur_row);
+                else
+                    delete (cur_row);
+                cur_row = NULL;
+                if (data.empty()) {
+                    this->Resize(0, 0);
+                    return;
+                } else {
+                    int32 num_rows = data.size(), num_cols = data[0]->size();
+                    this->Resize(num_rows, num_cols);
+                    for (int32 i = 0; i < num_rows; i++) {
+                        if (static_cast<int32>(data[i]->size()) != num_cols) {
+                            specific_error
+                                << "Matrix has inconsistent #cols: " << num_cols
+                                << " vs." << data[i]->size()
+                                << " (processing row" << i << ")";
+                            goto cleanup;
+                        }
+                        for (int32 j = 0; j < num_cols; j++)
+                            (*this)(i, j) = (*(data[i]))[j];
+                        delete data[i];
+                        data[i] = NULL;
+                    }
+                }
+                return;
+            } else if (static_cast<char>(i) == '\n' ||
+                       static_cast<char>(i) == ';') {
+                // End of matrix row.
+                is.get();
+                if (cur_row->size() != 0) {
+                    data.push_back(cur_row);
+                    cur_row = new std::vector<Real>;
+                    cur_row->reserve(data.back()->size());
+                }
+            } else if ((i >= '0' && i <= '9') || i == '-') {  // A number...
+                Real r;
+                is >> r;
+                if (is.fail()) {
+                    specific_error
+                        << "Stream failure/EOF while reading matrix data.";
+                    goto cleanup;
+                }
+                cur_row->push_back(r);
+            } else if (isspace(i)) {
+                is.get();  // eat the space and do nothing.
+            } else {       // NaN or inf or error.
+                std::string str;
+                is >> str;
+                if (!KALDI_STRCASECMP(str.c_str(), "inf") ||
+                    !KALDI_STRCASECMP(str.c_str(), "infinity")) {
+                    cur_row->push_back(std::numeric_limits<Real>::infinity());
+                    KALDI_WARN << "Reading infinite value into matrix.";
+                } else if (!KALDI_STRCASECMP(str.c_str(), "nan")) {
+                    cur_row->push_back(std::numeric_limits<Real>::quiet_NaN());
+                    KALDI_WARN << "Reading NaN value into matrix.";
+                } else {
+                    if (str.length() > 20) str = str.substr(0, 17) + "...";
+                    specific_error << "Expecting numeric matrix data, got "
+                                   << str;
+                    goto cleanup;
+                }
+            }
         }
-      }
-    }
     // Note, we never leave the while () loop before this
     // line (we return from it.)
- cleanup: // We only reach here in case of error in the while loop above.
-    if(cur_row != NULL)
-      delete cur_row;
-    for (size_t i = 0; i < data.size(); i++)
-      if(data[i] != NULL)
-        delete data[i];
-    // and then go on to "bad" below, where we print error.
-  }
+    cleanup:  // We only reach here in case of error in the while loop above.
+        if (cur_row != NULL) delete cur_row;
+        for (size_t i = 0; i < data.size(); i++)
+            if (data[i] != NULL) delete data[i];
+        // and then go on to "bad" below, where we print error.
+    }
 bad:
-  KALDI_ERR << "Failed to read matrix from stream.  " << specific_error.str()
-            << " File position at start is "
-            << pos_at_start << ", currently " << is.tellg();
+    KALDI_ERR << "Failed to read matrix from stream.  " << specific_error.str()
+              << " File position at start is " << pos_at_start << ", currently "
+              << is.tellg();
 }
 
 
 // Constructor... note that this is not const-safe as it would
 // be quite complicated to implement a "const SubMatrix" class that
 // would not allow its contents to be changed.
-template<typename Real>
+template <typename Real>
 SubMatrix<Real>::SubMatrix(const MatrixBase<Real> &M,
                            const MatrixIndexT ro,
                            const MatrixIndexT r,
                            const MatrixIndexT co,
                            const MatrixIndexT c) {
-  if (r == 0 || c == 0) {
-    // we support the empty sub-matrix as a special case.
-    KALDI_ASSERT(c == 0 && r == 0);
-    this->data_ = NULL;
-    this->num_cols_ = 0;
-    this->num_rows_ = 0;
-    this->stride_ = 0;
-    return;
-  }
-  KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(ro) <
-               static_cast<UnsignedMatrixIndexT>(M.num_rows_) &&
-               static_cast<UnsignedMatrixIndexT>(co) <
-               static_cast<UnsignedMatrixIndexT>(M.num_cols_) &&
-               static_cast<UnsignedMatrixIndexT>(r) <=
-               static_cast<UnsignedMatrixIndexT>(M.num_rows_ - ro) &&
-               static_cast<UnsignedMatrixIndexT>(c) <=
-               static_cast<UnsignedMatrixIndexT>(M.num_cols_ - co));
-  // point to the begining of window
-  MatrixBase<Real>::num_rows_ = r;
-  MatrixBase<Real>::num_cols_ = c;
-  MatrixBase<Real>::stride_ = M.Stride();
-  MatrixBase<Real>::data_ = M.Data_workaround() +
-      static_cast<size_t>(co) +
-      static_cast<size_t>(ro) * static_cast<size_t>(M.Stride());
+    if (r == 0 || c == 0) {
+        // we support the empty sub-matrix as a special case.
+        KALDI_ASSERT(c == 0 && r == 0);
+        this->data_ = NULL;
+        this->num_cols_ = 0;
+        this->num_rows_ = 0;
+        this->stride_ = 0;
+        return;
+    }
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(ro) <
+                     static_cast<UnsignedMatrixIndexT>(M.num_rows_) &&
+                 static_cast<UnsignedMatrixIndexT>(co) <
+                     static_cast<UnsignedMatrixIndexT>(M.num_cols_) &&
+                 static_cast<UnsignedMatrixIndexT>(r) <=
+                     static_cast<UnsignedMatrixIndexT>(M.num_rows_ - ro) &&
+                 static_cast<UnsignedMatrixIndexT>(c) <=
+                     static_cast<UnsignedMatrixIndexT>(M.num_cols_ - co));
+    // point to the begining of window
+    MatrixBase<Real>::num_rows_ = r;
+    MatrixBase<Real>::num_cols_ = c;
+    MatrixBase<Real>::stride_ = M.Stride();
+    MatrixBase<Real>::data_ =
+        M.Data_workaround() + static_cast<size_t>(co) +
+        static_cast<size_t>(ro) * static_cast<size_t>(M.Stride());
 }
 
 
-template<typename Real>
+template <typename Real>
 SubMatrix<Real>::SubMatrix(Real *data,
                            MatrixIndexT num_rows,
                            MatrixIndexT num_cols,
-                           MatrixIndexT stride):
-    MatrixBase<Real>(data, num_cols, num_rows, stride) { // caution: reversed order!
-  if (data == NULL) {
-    KALDI_ASSERT(num_rows * num_cols == 0);
-    this->num_rows_ = 0;
-    this->num_cols_ = 0;
-    this->stride_ = 0;
-  } else {
-    KALDI_ASSERT(this->stride_ >= this->num_cols_);
-  }
+                           MatrixIndexT stride)
+    : MatrixBase<Real>(
+          data, num_cols, num_rows, stride) {  // caution: reversed order!
+    if (data == NULL) {
+        KALDI_ASSERT(num_rows * num_cols == 0);
+        this->num_rows_ = 0;
+        this->num_cols_ = 0;
+        this->stride_ = 0;
+    } else {
+        KALDI_ASSERT(this->stride_ >= this->num_cols_);
+    }
 }
 
 /*
@@ -1665,9 +1722,11 @@ Real MatrixBase<Real>::Cond() const {
   KALDI_ASSERT(num_rows_ > 0&&num_cols_ > 0);
   Vector<Real> singular_values(std::min(num_rows_, num_cols_));
   Svd(&singular_values);  // Get singular values...
-  Real min = singular_values(0), max = singular_values(0);  // both absolute values...
+  Real min = singular_values(0), max = singular_values(0);  // both absolute
+values...
   for (MatrixIndexT i = 1;i < singular_values.Dim();i++) {
-    min = std::min((Real)std::abs(singular_values(i)), min); max = std::max((Real)std::abs(singular_values(i)), max);
+    min = std::min((Real)std::abs(singular_values(i)), min); max =
+std::max((Real)std::abs(singular_values(i)), max);
   }
   if (min > 0) return max/min;
   else return std::numeric_limits<Real>::infinity();
@@ -1677,7 +1736,8 @@ template<typename Real>
 Real MatrixBase<Real>::Trace(bool check_square) const  {
   KALDI_ASSERT(!check_square || num_rows_ == num_cols_);
   Real ans = 0.0;
-  for (MatrixIndexT r = 0;r < std::min(num_rows_, num_cols_);r++) ans += data_ [r + stride_*r];
+  for (MatrixIndexT r = 0;r < std::min(num_rows_, num_cols_);r++) ans += data_
+[r + stride_*r];
   return ans;
 }
 
@@ -1707,22 +1767,29 @@ Real MatrixBase<Real>::Min() const {
 
 template <typename Real>
 void MatrixBase<Real>::AddMatMatMat(Real alpha,
-                                    const MatrixBase<Real> &A, MatrixTransposeType transA,
-                                    const MatrixBase<Real> &B, MatrixTransposeType transB,
-                                    const MatrixBase<Real> &C, MatrixTransposeType transC,
+                                    const MatrixBase<Real> &A,
+MatrixTransposeType transA,
+                                    const MatrixBase<Real> &B,
+MatrixTransposeType transB,
+                                    const MatrixBase<Real> &C,
+MatrixTransposeType transC,
                                     Real beta) {
-  // Note on time taken with different orders of computation.  Assume not transposed in this /
-  // discussion. Firstly, normalize expressions using A.NumCols == B.NumRows and B.NumCols == C.NumRows, prefer
+  // Note on time taken with different orders of computation.  Assume not
+transposed in this /
+  // discussion. Firstly, normalize expressions using A.NumCols == B.NumRows and
+B.NumCols == C.NumRows, prefer
   // rows where there is a choice.
   // time taken for (AB) is:  A.NumRows*B.NumRows*C.Rows
   // time taken for (AB)C is A.NumRows*C.NumRows*C.Cols
-  // so this order is A.NumRows*B.NumRows*C.NumRows + A.NumRows*C.NumRows*C.NumCols.
+  // so this order is A.NumRows*B.NumRows*C.NumRows +
+A.NumRows*C.NumRows*C.NumCols.
 
   // time taken for (BC) is: B.NumRows*C.NumRows*C.Cols
   // time taken for A(BC) is: A.NumRows*B.NumRows*C.Cols
   // so this order is B.NumRows*C.NumRows*C.NumCols + A.NumRows*B.NumRows*C.Cols
 
-  MatrixIndexT ARows = A.num_rows_, ACols = A.num_cols_, BRows = B.num_rows_, BCols = B.num_cols_,
+  MatrixIndexT ARows = A.num_rows_, ACols = A.num_cols_, BRows = B.num_rows_,
+BCols = B.num_cols_,
       CRows = C.num_rows_, CCols = C.num_cols_;
   if (transA == kTrans) std::swap(ARows, ACols);
   if (transB == kTrans) std::swap(BRows, BCols);
@@ -1746,58 +1813,71 @@ void MatrixBase<Real>::AddMatMatMat(Real alpha,
 
 
 template<typename Real>
-void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) {
+void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
+MatrixBase<Real> *Vt) {
   // Svd, *this = U*diag(s)*Vt.
   // With (*this).num_rows_ == m, (*this).num_cols_ == n,
-  // Support only skinny Svd with m>=n (NumRows>=NumCols), and zero sizes for U and Vt mean
+  // Support only skinny Svd with m>=n (NumRows>=NumCols), and zero sizes for U
+and Vt mean
   // we do not want that output.  We expect that s.Dim() == m,
   // U is either 0 by 0 or m by n, and rv is either 0 by 0 or n by n.
   // Throws exception on error.
 
-  KALDI_ASSERT(num_rows_>=num_cols_ && "Svd requires that #rows by >= #cols.");  // For compatibility with JAMA code.
+  KALDI_ASSERT(num_rows_>=num_cols_ && "Svd requires that #rows by >= #cols.");
+// For compatibility with JAMA code.
   KALDI_ASSERT(s->Dim() == num_cols_);  // s should be the smaller dim.
-  KALDI_ASSERT(U == NULL || (U->num_rows_ == num_rows_&&U->num_cols_ == num_cols_));
-  KALDI_ASSERT(Vt == NULL || (Vt->num_rows_ == num_cols_&&Vt->num_cols_ == num_cols_));
+  KALDI_ASSERT(U == NULL || (U->num_rows_ == num_rows_&&U->num_cols_ ==
+num_cols_));
+  KALDI_ASSERT(Vt == NULL || (Vt->num_rows_ == num_cols_&&Vt->num_cols_ ==
+num_cols_));
 
   Real prescale = 1.0;
-  if ( std::abs((*this)(0, 0) ) < 1.0e-30) {  // Very tiny value... can cause problems in Svd.
+  if ( std::abs((*this)(0, 0) ) < 1.0e-30) {  // Very tiny value... can cause
+problems in Svd.
     Real max_elem = LargestAbsElem();
     if (max_elem != 0) {
       prescale = 1.0 / max_elem;
-      if (std::abs(prescale) == std::numeric_limits<Real>::infinity()) { prescale = 1.0e+40; }
+      if (std::abs(prescale) == std::numeric_limits<Real>::infinity()) {
+prescale = 1.0e+40; }
       (*this).Scale(prescale);
     }
   }
 
 #if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD)
-  // "S" == skinny Svd (only one we support because of compatibility with Jama one which is only skinny),
+  // "S" == skinny Svd (only one we support because of compatibility with Jama
+one which is only skinny),
   // "N"== no eigenvectors wanted.
   LapackGesvd(s, U, Vt);
 #else
   /*  if (num_rows_ > 1 && num_cols_ > 1 && (*this)(0, 0) == (*this)(1, 1)
-      && Max() == Min() && (*this)(0, 0) != 0.0) { // special case that JamaSvd sometimes crashes on.
-      KALDI_WARN << "Jama SVD crashes on this type of matrix, perturbing it to prevent crash.";
+      && Max() == Min() && (*this)(0, 0) != 0.0) { // special case that JamaSvd
+sometimes crashes on.
+      KALDI_WARN << "Jama SVD crashes on this type of matrix, perturbing it to
+prevent crash.";
       for(int32 i = 0; i < NumRows(); i++)
       (*this)(i, i)  *= 1.00001;
       }*/
 //  bool ans = JamaSvd(s, U, Vt);
-  //if (Vt != NULL) Vt->Transpose();  // possibly to do: change this and also the transpose inside the JamaSvd routine.  note, Vt is square.
-  //if (!ans) {
-    //KALDI_ERR << "Error doing Svd";  // This one will be caught.
-  //}
+// if (Vt != NULL) Vt->Transpose();  // possibly to do: change this and also the
+// transpose inside the JamaSvd routine.  note, Vt is square.
+// if (!ans) {
+// KALDI_ERR << "Error doing Svd";  // This one will be caught.
+//}
 //#endif
-  //if (prescale != 1.0) s->Scale(1.0/prescale);
+// if (prescale != 1.0) s->Scale(1.0/prescale);
 //}
 /*
 template<typename Real>
-void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) const {
+void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U,
+MatrixBase<Real> *Vt) const {
   try {
     if (num_rows_ >= num_cols_) {
       Matrix<Real> tmp(*this);
       tmp.DestructiveSvd(s, U, Vt);
     } else {
       Matrix<Real> tmp(*this, kTrans);  // transpose of *this.
-      // rVt will have different dim so cannot transpose in-place --> use a temp matrix.
+      // rVt will have different dim so cannot transpose in-place --> use a temp
+matrix.
       Matrix<Real> Vt_Trans(Vt ? Vt->num_cols_ : 0, Vt ? Vt->num_rows_ : 0);
       // U will be transpose
       tmp.DestructiveSvd(s, Vt ? &Vt_Trans : NULL, U);
@@ -1806,7 +1886,8 @@ void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<
     }
   } catch (...) {
     KALDI_ERR << "Error doing Svd (did not converge), first part of matrix is\n"
-              << SubMatrix<Real>(*this, 0, std::min((MatrixIndexT)10, num_rows_),
+              << SubMatrix<Real>(*this, 0, std::min((MatrixIndexT)10,
+num_rows_),
                                  0, std::min((MatrixIndexT)10, num_cols_))
               << ", min and max are: " << Min() << ", " << Max();
   }
@@ -1819,7 +1900,8 @@ bool MatrixBase<Real>::IsSymmetric(Real cutoff) const {
   Real bad_sum = 0.0, good_sum = 0.0;
   for (MatrixIndexT i = 0;i < R;i++) {
     for (MatrixIndexT j = 0;j < i;j++) {
-      Real a = (*this)(i, j), b = (*this)(j, i), avg = 0.5*(a+b), diff = 0.5*(a-b);
+      Real a = (*this)(i, j), b = (*this)(j, i), avg = 0.5*(a+b), diff =
+0.5*(a-b);
       good_sum += std::abs(avg); bad_sum += std::abs(diff);
     }
     good_sum += std::abs((*this)(i, i));
@@ -1860,7 +1942,8 @@ bool MatrixBase<Real>::IsUnit(Real cutoff) const {
   Real bad_max = 0.0;
   for (MatrixIndexT i = 0; i < R;i++)
     for (MatrixIndexT j = 0; j < C;j++)
-      bad_max = std::max(bad_max, static_cast<Real>(std::abs( (*this)(i, j) - (i == j?1.0:0.0))));
+      bad_max = std::max(bad_max, static_cast<Real>(std::abs( (*this)(i, j) - (i
+== j?1.0:0.0))));
   return (bad_max <= cutoff);
 }
 
@@ -1880,7 +1963,8 @@ Real MatrixBase<Real>::FrobeniusNorm() const{
 }
 
 template<typename Real>
-bool MatrixBase<Real>::ApproxEqual(const MatrixBase<Real> &other, float tol) const {
+bool MatrixBase<Real>::ApproxEqual(const MatrixBase<Real> &other, float tol)
+const {
   if (num_rows_ != other.num_rows_ || num_cols_ != other.num_cols_)
     KALDI_ERR << "ApproxEqual: size mismatch.";
   Matrix<Real> tmp(*this);
@@ -1953,27 +2037,35 @@ void MatrixBase<Real>::OrthogonalizeRows() {
 }
 
 
-// Uses Svd to compute the eigenvalue decomposition of a symmetric positive semidefinite
+// Uses Svd to compute the eigenvalue decomposition of a symmetric positive
+semidefinite
 //   matrix:
-// (*this) = rU * diag(rs) * rU^T, with rU an orthogonal matrix so rU^{-1} = rU^T.
-// Does this by computing svd (*this) = U diag(rs) V^T ... answer is just U diag(rs) U^T.
-// Throws exception if this failed to within supplied precision (typically because *this was not
+// (*this) = rU * diag(rs) * rU^T, with rU an orthogonal matrix so rU^{-1} =
+rU^T.
+// Does this by computing svd (*this) = U diag(rs) V^T ... answer is just U
+diag(rs) U^T.
+// Throws exception if this failed to within supplied precision (typically
+because *this was not
 // symmetric positive definite).
 
 template<typename Real>
-void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *rU, Real check_thresh) // e.g. check_thresh = 0.001
+void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real>
+*rU, Real check_thresh) // e.g. check_thresh = 0.001
 {
   const MatrixIndexT D = num_rows_;
 
   KALDI_ASSERT(num_rows_ == num_cols_);
-  KALDI_ASSERT(IsSymmetric() && "SymPosSemiDefEig: expecting input to be symmetrical.");
+  KALDI_ASSERT(IsSymmetric() && "SymPosSemiDefEig: expecting input to be
+symmetrical.");
   KALDI_ASSERT(rU->num_rows_ == D && rU->num_cols_ == D && rs->Dim() == D);
 
   Matrix<Real>  Vt(D, D);
   Svd(rs, rU, &Vt);
 
-  // First just zero any singular values if the column of U and V do not have +ve dot product--
-  // this may mean we have small negative eigenvalues, and if we zero them the result will be closer to correct.
+  // First just zero any singular values if the column of U and V do not have
++ve dot product--
+  // this may mean we have small negative eigenvalues, and if we zero them the
+result will be closer to correct.
   for (MatrixIndexT i = 0;i < D;i++) {
     Real sum = 0.0;
     for (MatrixIndexT j = 0;j < D;j++) sum += (*rU)(j, i) * Vt(i, j);
@@ -1992,9 +2084,12 @@ void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *
 
     if (!(old_norm == 0 && new_norm == 0)) {
       float diff_norm = tmpThisFull.FrobeniusNorm();
-      if (std::abs(new_norm-old_norm) > old_norm*check_thresh || diff_norm > old_norm*check_thresh) {
-        KALDI_WARN << "SymPosSemiDefEig seems to have failed " << diff_norm << " !<< "
-                   << check_thresh << "*" << old_norm << ", maybe matrix was not "
+      if (std::abs(new_norm-old_norm) > old_norm*check_thresh || diff_norm >
+old_norm*check_thresh) {
+        KALDI_WARN << "SymPosSemiDefEig seems to have failed " << diff_norm << "
+!<< "
+                   << check_thresh << "*" << old_norm << ", maybe matrix was not
+"
                    << "positive semi definite.  Continuing anyway.";
       }
     }
@@ -2006,7 +2101,8 @@ template<typename Real>
 Real MatrixBase<Real>::LogDet(Real *det_sign) const {
   Real log_det;
   Matrix<Real> tmp(*this);
-  tmp.Invert(&log_det, det_sign, false);  // false== output not needed (saves some computation).
+  tmp.Invert(&log_det, det_sign, false);  // false== output not needed (saves
+some computation).
   return log_det;
 }
 
@@ -2022,26 +2118,25 @@ void MatrixBase<Real>::InvertDouble(Real *log_det, Real *det_sign,
 }
 */
 
-//template<class Real>
-//void MatrixBase<Real>::CopyFromMat(const CompressedMatrix &mat) {
-  //mat.CopyToMat(this);
+// template<class Real>
+// void MatrixBase<Real>::CopyFromMat(const CompressedMatrix &mat) {
+// mat.CopyToMat(this);
 //}
 
-//template<class Real>
-//Matrix<Real>::Matrix(const CompressedMatrix &M): MatrixBase<Real>() {
-  //Resize(M.NumRows(), M.NumCols(), kUndefined);
-  //M.CopyToMat(this);
+// template<class Real>
+// Matrix<Real>::Matrix(const CompressedMatrix &M): MatrixBase<Real>() {
+// Resize(M.NumRows(), M.NumCols(), kUndefined);
+// M.CopyToMat(this);
 //}
 
 
-
-template<typename Real>
+template <typename Real>
 void MatrixBase<Real>::InvertElements() {
-  for (MatrixIndexT r = 0; r < num_rows_; r++) {
-    for (MatrixIndexT c = 0; c < num_cols_; c++) {
-      (*this)(r, c) = static_cast<Real>(1.0 / (*this)(r, c));
+    for (MatrixIndexT r = 0; r < num_rows_; r++) {
+        for (MatrixIndexT c = 0; c < num_cols_; c++) {
+            (*this)(r, c) = static_cast<Real>(1.0 / (*this)(r, c));
+        }
     }
-  }
 }
 /*
 template<typename Real>
@@ -2108,7 +2203,8 @@ void MatrixBase<Real>::Pow(const MatrixBase<Real> &src, Real power) {
 }
 
 template<typename Real>
-void MatrixBase<Real>::PowAbs(const MatrixBase<Real> &src, Real power, bool include_sign) {
+void MatrixBase<Real>::PowAbs(const MatrixBase<Real> &src, Real power, bool
+include_sign) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
@@ -2117,9 +2213,9 @@ void MatrixBase<Real>::PowAbs(const MatrixBase<Real> &src, Real power, bool incl
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col ++) {
       if (include_sign == true && src_row_data[col] < 0) {
-	row_data[col] = -pow(std::abs(src_row_data[col]), power);
+    row_data[col] = -pow(std::abs(src_row_data[col]), power);
       } else {
-	row_data[col] = pow(std::abs(src_row_data[col]), power);
+    row_data[col] = pow(std::abs(src_row_data[col]), power);
       }
     }
   }
@@ -2134,7 +2230,8 @@ void MatrixBase<Real>::Floor(const MatrixBase<Real> &src, Real floor_val) {
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++)
-      row_data[col] = (src_row_data[col] < floor_val ? floor_val : src_row_data[col]);
+      row_data[col] = (src_row_data[col] < floor_val ? floor_val :
+src_row_data[col]);
   }
 }
 
@@ -2147,7 +2244,8 @@ void MatrixBase<Real>::Ceiling(const MatrixBase<Real> &src, Real ceiling_val) {
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++)
-      row_data[col] = (src_row_data[col] > ceiling_val ? ceiling_val : src_row_data[col]);
+      row_data[col] = (src_row_data[col] > ceiling_val ? ceiling_val :
+src_row_data[col]);
   }
 }
 
@@ -2173,12 +2271,14 @@ void MatrixBase<Real>::ExpSpecial(const MatrixBase<Real> &src) {
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++)
-      row_data[col] = (src_row_data[col] < Real(0) ? kaldi::Exp(src_row_data[col]) : (src_row_data[col] + Real(1)));
+      row_data[col] = (src_row_data[col] < Real(0) ?
+kaldi::Exp(src_row_data[col]) : (src_row_data[col] + Real(1)));
   }
 }
 
 template<typename Real>
-void MatrixBase<Real>::ExpLimited(const MatrixBase<Real> &src, Real lower_limit, Real upper_limit) {
+void MatrixBase<Real>::ExpLimited(const MatrixBase<Real> &src, Real lower_limit,
+Real upper_limit) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
@@ -2188,11 +2288,11 @@ void MatrixBase<Real>::ExpLimited(const MatrixBase<Real> &src, Real lower_limit,
     for (MatrixIndexT col = 0; col < num_cols; col++) {
       const Real x = src_row_data[col];
       if (!(x >= lower_limit))
-	row_data[col] = kaldi::Exp(lower_limit);
+    row_data[col] = kaldi::Exp(lower_limit);
       else if (x > upper_limit)
-	row_data[col] = kaldi::Exp(upper_limit);
+    row_data[col] = kaldi::Exp(upper_limit);
       else
-	row_data[col] = kaldi::Exp(x);
+    row_data[col] = kaldi::Exp(x);
     }
   }
 }
@@ -2220,12 +2320,12 @@ bool MatrixBase<Real>::Power(Real power) {
   return true;
 }
 */
-template<typename Real>
+template <typename Real>
 void Matrix<Real>::Swap(Matrix<Real> *other) {
-  std::swap(this->data_, other->data_);
-  std::swap(this->num_cols_, other->num_cols_);
-  std::swap(this->num_rows_, other->num_rows_);
-  std::swap(this->stride_, other->stride_);
+    std::swap(this->data_, other->data_);
+    std::swap(this->num_cols_, other->num_cols_);
+    std::swap(this->num_rows_, other->num_rows_);
+    std::swap(this->stride_, other->stride_);
 }
 /*
 // Repeating this comment that appeared in the header:
@@ -2238,12 +2338,14 @@ void Matrix<Real>::Swap(Matrix<Real> *other) {
 // be block diagonal, with 2x2 blocks corresponding to any such pairs.  If a
 // pair is lambda +- i*mu, D will have a corresponding 2x2 block
 // [lambda, mu; -mu, lambda].
-// Note that if the input matrix (*this) is non-invertible, P may not be invertible
+// Note that if the input matrix (*this) is non-invertible, P may not be
+invertible
 // so in this case instead of the equation (*this) = P D P^{-1} holding, we have
 // instead (*this) P = P D.
 //
 // By making the pointer arguments non-NULL or NULL, the user can choose to take
-// not to take the eigenvalues directly, and/or the matrix D which is block-diagonal
+// not to take the eigenvalues directly, and/or the matrix D which is
+block-diagonal
 // with 2x2 blocks.
 template<typename Real>
 void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
@@ -2369,7 +2471,8 @@ template
 bool ReadHtk(std::istream &is, Matrix<double> *M, HtkHeader *header_ptr);
 
 template<typename Real>
-bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr) // header may be derived from a previous call to ReadHtk.  Must be in binary mode.
+bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr) //
+header may be derived from a previous call to ReadHtk.  Must be in binary mode.
 {
   KALDI_ASSERT(M.NumRows() == static_cast<MatrixIndexT>(htk_hdr.mNSamples));
   KALDI_ASSERT(M.NumCols() == static_cast<MatrixIndexT>(htk_hdr.mSampleSize) /
@@ -2471,12 +2574,14 @@ template <typename Real>
 Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
                     const MatrixBase<Real> &B, MatrixTransposeType transB,
                     const MatrixBase<Real> &C, MatrixTransposeType transC) {
-  MatrixIndexT ARows = A.NumRows(), ACols = A.NumCols(), BRows = B.NumRows(), BCols = B.NumCols(),
+  MatrixIndexT ARows = A.NumRows(), ACols = A.NumCols(), BRows = B.NumRows(),
+BCols = B.NumCols(),
       CRows = C.NumRows(), CCols = C.NumCols();
   if (transA == kTrans) std::swap(ARows, ACols);
   if (transB == kTrans) std::swap(BRows, BCols);
   if (transC == kTrans) std::swap(CRows, CCols);
-  KALDI_ASSERT( CCols == ARows && ACols == BRows && BCols == CRows && "TraceMatMatMat: args have mismatched dimensions.");
+  KALDI_ASSERT( CCols == ARows && ACols == BRows && BCols == CRows &&
+"TraceMatMatMat: args have mismatched dimensions.");
   if (ARows*BCols < std::min(BRows*CCols, CRows*ACols)) {
     Matrix<Real> AB(ARows, BCols);
     AB.AddMatMat(1.0, A, transA, B, transB, 0.0);  // AB = A * B.
@@ -2508,13 +2613,16 @@ Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
                        const MatrixBase<Real> &B, MatrixTransposeType transB,
                        const MatrixBase<Real> &C, MatrixTransposeType transC,
                        const MatrixBase<Real> &D, MatrixTransposeType transD) {
-  MatrixIndexT ARows = A.NumRows(), ACols = A.NumCols(), BRows = B.NumRows(), BCols = B.NumCols(),
-      CRows = C.NumRows(), CCols = C.NumCols(), DRows = D.NumRows(), DCols = D.NumCols();
+  MatrixIndexT ARows = A.NumRows(), ACols = A.NumCols(), BRows = B.NumRows(),
+BCols = B.NumCols(),
+      CRows = C.NumRows(), CCols = C.NumCols(), DRows = D.NumRows(), DCols =
+D.NumCols();
   if (transA == kTrans) std::swap(ARows, ACols);
   if (transB == kTrans) std::swap(BRows, BCols);
   if (transC == kTrans) std::swap(CRows, CCols);
   if (transD == kTrans) std::swap(DRows, DCols);
-  KALDI_ASSERT( DCols == ARows && ACols == BRows && BCols == CRows && CCols == DRows && "TraceMatMatMat: args have mismatched dimensions.");
+  KALDI_ASSERT( DCols == ARows && ACols == BRows && BCols == CRows && CCols ==
+DRows && "TraceMatMatMat: args have mismatched dimensions.");
   if (ARows*BCols < std::min(BRows*CCols, std::min(CRows*DCols, DRows*ACols))) {
     Matrix<Real> AB(ARows, BCols);
     AB.AddMatMat(1.0, A, transA, B, transB, 0.0);  // AB = A * B.
@@ -2541,13 +2649,18 @@ float TraceMatMatMatMat(const MatrixBase<float> &A, MatrixTransposeType transA,
                         const MatrixBase<float> &D, MatrixTransposeType transD);
 
 template
-double TraceMatMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA,
-                         const MatrixBase<double> &B, MatrixTransposeType transB,
-                         const MatrixBase<double> &C, MatrixTransposeType transC,
-                         const MatrixBase<double> &D, MatrixTransposeType transD);
+double TraceMatMatMatMat(const MatrixBase<double> &A, MatrixTransposeType
+transA,
+                         const MatrixBase<double> &B, MatrixTransposeType
+transB,
+                         const MatrixBase<double> &C, MatrixTransposeType
+transC,
+                         const MatrixBase<double> &D, MatrixTransposeType
+transD);
 
 template<typename Real> void  SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
-                                   MatrixBase<Real> *Vt, bool sort_on_absolute_value) {
+                                   MatrixBase<Real> *Vt, bool
+sort_on_absolute_value) {
   /// Makes sure the Svd is sorted (from greatest to least absolute value).
   MatrixIndexT num_singval = s->Dim();
   KALDI_ASSERT(U == NULL || U->NumCols() == num_singval);
@@ -2589,7 +2702,8 @@ void SortSvd(VectorBase<double> *s, MatrixBase<double> *U,
              MatrixBase<double> *Vt, bool);
 
 template<typename Real>
-void CreateEigenvalueMatrix(const VectorBase<Real> &re, const VectorBase<Real> &im,
+void CreateEigenvalueMatrix(const VectorBase<Real> &re, const VectorBase<Real>
+&im,
                             MatrixBase<Real> *D) {
   MatrixIndexT n = re.Dim();
   KALDI_ASSERT(im.Dim() == n && D->NumRows() == n && D->NumCols() == n);
@@ -2603,7 +2717,8 @@ void CreateEigenvalueMatrix(const VectorBase<Real> &re, const VectorBase<Real> &
     } else {  // First of a complex pair
       KALDI_ASSERT(j+1 < n && ApproxEqual(im(j+1), -im(j))
                    && ApproxEqual(re(j+1), re(j)));
-      /// if (im(j) < 0.0) KALDI_WARN << "Negative first im part of pair";  // TEMP
+      /// if (im(j) < 0.0) KALDI_WARN << "Negative first im part of pair";  //
+TEMP
       Real lambda = re(j), mu = im(j);
       // create 2x2 block [lambda, mu; -mu, lambda]
       (*D)(j, j) = lambda;
@@ -2616,10 +2731,12 @@ void CreateEigenvalueMatrix(const VectorBase<Real> &re, const VectorBase<Real> &
 }
 
 template
-void CreateEigenvalueMatrix(const VectorBase<float> &re, const VectorBase<float> &im,
+void CreateEigenvalueMatrix(const VectorBase<float> &re, const VectorBase<float>
+&im,
                             MatrixBase<float> *D);
 template
-void CreateEigenvalueMatrix(const VectorBase<double> &re, const VectorBase<double> &im,
+void CreateEigenvalueMatrix(const VectorBase<double> &re, const
+VectorBase<double> &im,
                             MatrixBase<double> *D);
 
 
@@ -2660,7 +2777,8 @@ bool AttemptComplexPower(double *x_re, double *x_im, double power);
 template <typename Real>
 Real TraceMatMat(const MatrixBase<Real> &A,
                   const MatrixBase<Real> &B,
-                  MatrixTransposeType trans) {  // tr(A B), equivalent to sum of each element of A times same element in B'
+                  MatrixTransposeType trans) {  // tr(A B), equivalent to sum of
+each element of A times same element in B'
   MatrixIndexT aStride = A.stride_, bStride = B.stride_;
   if (trans == kNoTrans) {
     KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows());
@@ -2791,29 +2909,32 @@ void MatrixBase<Real>::GroupMax(const MatrixBase<Real> &src) {
   }
 }
 */
-template<typename Real>
+template <typename Real>
 void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
                                 const MatrixIndexT *indices) {
-  KALDI_ASSERT(NumRows() == src.NumRows());
-  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
-      this_stride = stride_, src_stride = src.stride_;
-  Real *this_data = this->data_;
-  const Real *src_data = src.data_;
+    KALDI_ASSERT(NumRows() == src.NumRows());
+    MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+                 this_stride = stride_, src_stride = src.stride_;
+    Real *this_data = this->data_;
+    const Real *src_data = src.data_;
 #ifdef KALDI_PARANOID
-  MatrixIndexT src_cols = src.NumCols();
-  for (MatrixIndexT i = 0; i < num_cols; i++)
-    KALDI_ASSERT(indices[i] >= -1 && indices[i] < src_cols);
+    MatrixIndexT src_cols = src.NumCols();
+    for (MatrixIndexT i = 0; i < num_cols; i++)
+        KALDI_ASSERT(indices[i] >= -1 && indices[i] < src_cols);
 #endif
 
-  // For the sake of memory locality we do this row by row, rather
-  // than doing it column-wise using cublas_Xcopy
-  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
-    const MatrixIndexT *index_ptr = &(indices[0]);
-    for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
-      if (*index_ptr < 0) this_data[c] = 0;
-      else this_data[c] = src_data[*index_ptr];
+    // For the sake of memory locality we do this row by row, rather
+    // than doing it column-wise using cublas_Xcopy
+    for (MatrixIndexT r = 0; r < num_rows;
+         r++, this_data += this_stride, src_data += src_stride) {
+        const MatrixIndexT *index_ptr = &(indices[0]);
+        for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
+            if (*index_ptr < 0)
+                this_data[c] = 0;
+            else
+                this_data[c] = src_data[*index_ptr];
+        }
     }
-  }
 }
 
 /*
@@ -2833,7 +2954,8 @@ void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
 
   // For the sake of memory locality we do this row by row, rather
   // than doing it column-wise using cublas_Xcopy
-  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
+  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data
++= src_stride) {
     const MatrixIndexT *index_ptr = &(indices[0]);
     for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
       if (*index_ptr >= 0)
@@ -2965,7 +3087,8 @@ void MatrixBase<Real>::DiffSigmoid(const MatrixBase<Real> &value,
                                    const MatrixBase<Real> &diff) {
   KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
-      stride = stride_, value_stride = value.stride_, diff_stride = diff.stride_;
+      stride = stride_, value_stride = value.stride_, diff_stride =
+diff.stride_;
   Real *data = data_;
   const Real *value_data = value.data_, *diff_data = diff.data_;
   for (MatrixIndexT r = 0; r < num_rows; r++) {
@@ -2982,7 +3105,8 @@ void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
                                    const MatrixBase<Real> &diff) {
   KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
-      stride = stride_, value_stride = value.stride_, diff_stride = diff.stride_;
+      stride = stride_, value_stride = value.stride_, diff_stride =
+diff.stride_;
   Real *data = data_;
   const Real *value_data = value.data_, *diff_data = diff.data_;
   for (MatrixIndexT r = 0; r < num_rows; r++) {
@@ -2997,7 +3121,8 @@ void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
 /*
 template<typename Real>
 template<typename OtherReal>
-void MatrixBase<Real>::AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v) {
+void MatrixBase<Real>::AddVecToRows(const Real alpha, const
+VectorBase<OtherReal> &v) {
   const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       stride = stride_;
   KALDI_ASSERT(v.Dim() == num_cols);
@@ -3028,7 +3153,8 @@ template void MatrixBase<double>::AddVecToRows(const double alpha,
 
 template<typename Real>
 template<typename OtherReal>
-void MatrixBase<Real>::AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v) {
+void MatrixBase<Real>::AddVecToCols(const Real alpha, const
+VectorBase<OtherReal> &v) {
   const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       stride = stride_;
   KALDI_ASSERT(v.Dim() == num_rows);
@@ -3058,10 +3184,10 @@ template void MatrixBase<double>::AddVecToCols(const double alpha,
 template void MatrixBase<double>::AddVecToCols(const double alpha,
                                                const VectorBase<double> &v);
 */
-//Explicit instantiation of the classes
-//Apparently, it seems to be necessary that the instantiation
-//happens at the end of the file. Otherwise, not all the member
-//functions will get instantiated.
+// Explicit instantiation of the classes
+// Apparently, it seems to be necessary that the instantiation
+// happens at the end of the file. Otherwise, not all the member
+// functions will get instantiated.
 
 template class Matrix<float>;
 template class Matrix<double>;
@@ -3070,4 +3196,4 @@ template class MatrixBase<double>;
 template class SubMatrix<float>;
 template class SubMatrix<double>;
 
-} // namespace kaldi
+}  // namespace kaldi
diff --git a/runtime/engine/common/matrix/kaldi-matrix.h b/runtime/engine/common/matrix/kaldi-matrix.h
index 92274487e..c082a731c 100644
--- a/runtime/engine/common/matrix/kaldi-matrix.h
+++ b/runtime/engine/common/matrix/kaldi-matrix.h
@@ -38,669 +38,715 @@ namespace kaldi {
 /// Base class which provides matrix operations not involving resizing
 /// or allocation.   Classes Matrix and SubMatrix inherit from it and take care
 /// of allocation and resizing.
-template<typename Real>
+template <typename Real>
 class MatrixBase {
- public:
-  // so this child can access protected members of other instances.
-  friend class Matrix<Real>;
-  friend class SubMatrix<Real>;
-  // friend declarations for CUDA matrices (see ../cudamatrix/)
-
-  /// Returns number of rows (or zero for empty matrix).
-  inline MatrixIndexT  NumRows() const { return num_rows_; }
-
-  /// Returns number of columns (or zero for empty matrix).
-  inline MatrixIndexT NumCols() const { return num_cols_; }
-
-  /// Stride (distance in memory between each row).  Will be >= NumCols.
-  inline MatrixIndexT Stride() const {  return stride_; }
-
-  /// Returns size in bytes of the data held by the matrix.
-  size_t  SizeInBytes() const {
-    return static_cast<size_t>(num_rows_) * static_cast<size_t>(stride_) *
-        sizeof(Real);
-  }
-
-  /// Gives pointer to raw data (const).
-  inline const Real* Data() const {
-    return data_;
-  }
-
-  /// Gives pointer to raw data (non-const).
-  inline Real* Data() { return data_; }
-
-  /// Returns pointer to data for one row (non-const)
-  inline  Real* RowData(MatrixIndexT i) {
-    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
-                 static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return data_ + i * stride_;
-  }
-
-  /// Returns pointer to data for one row (const)
-  inline const Real* RowData(MatrixIndexT i) const {
-    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
-                 static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return data_ + i * stride_;
-  }
-
-  /// Indexing operator, non-const
-  /// (only checks sizes if compiled with -DKALDI_PARANOID)
-  inline Real&  operator() (MatrixIndexT r, MatrixIndexT c) {
-    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
-                          static_cast<UnsignedMatrixIndexT>(num_rows_) &&
-                          static_cast<UnsignedMatrixIndexT>(c) <
-                          static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return *(data_ + r * stride_ + c);
-  }
-  /// Indexing operator, provided for ease of debugging (gdb doesn't work
-  /// with parenthesis operator).
-  Real &Index (MatrixIndexT r, MatrixIndexT c) {  return (*this)(r, c); }
-
-  /// Indexing operator, const
-  /// (only checks sizes if compiled with -DKALDI_PARANOID)
-  inline const Real operator() (MatrixIndexT r, MatrixIndexT c) const {
-    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
-                          static_cast<UnsignedMatrixIndexT>(num_rows_) &&
-                          static_cast<UnsignedMatrixIndexT>(c) <
-                          static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return *(data_ + r * stride_ + c);
-  }
-
-  /*   Basic setting-to-special values functions. */
-
-  /// Sets matrix to zero.
-  void SetZero();
-  /// Sets all elements to a specific value.
-  void Set(Real);
-  /// Sets to zero, except ones along diagonal [for non-square matrices too]
-
-  /// Copy given matrix. (no resize is done).
-  template<typename OtherReal>
-  void CopyFromMat(const MatrixBase<OtherReal> & M,
-                   MatrixTransposeType trans = kNoTrans);
-
-  /// Copy from compressed matrix.
-  //void CopyFromMat(const CompressedMatrix &M);
-
-  /// Copy given tpmatrix. (no resize is done).
-  //template<typename OtherReal>
-  //void CopyFromTp(const TpMatrix<OtherReal> &M,
-                  //MatrixTransposeType trans = kNoTrans);
-
-  /// Copy from CUDA matrix.  Implemented in ../cudamatrix/cu-matrix.h
-  //template<typename OtherReal>
-  //void CopyFromMat(const CuMatrixBase<OtherReal> &M,
-                   //MatrixTransposeType trans = kNoTrans);
-
-  /// This function has two modes of operation.  If v.Dim() == NumRows() *
-  /// NumCols(), then treats the vector as a row-by-row concatenation of a
-  /// matrix and copies to *this.
-  /// if v.Dim() == NumCols(), it sets each row of *this to a copy of v.
-  void CopyRowsFromVec(const VectorBase<Real> &v);
-
-  /// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc
-  //void CopyRowsFromVec(const CuVectorBase<Real> &v);
-
-  template<typename OtherReal>
-  void CopyRowsFromVec(const VectorBase<OtherReal> &v);
-
-  /// Copies vector into matrix, column-by-column.
-  /// Note that rv.Dim() must either equal NumRows()*NumCols() or NumRows();
-  /// this has two modes of operation.
-  void CopyColsFromVec(const VectorBase<Real> &v);
-
-  /// Copy vector into specific column of matrix.
-  void CopyColFromVec(const VectorBase<Real> &v, const MatrixIndexT col);
-  /// Copy vector into specific row of matrix.
-  void CopyRowFromVec(const VectorBase<Real> &v, const MatrixIndexT row);
-  /// Copy vector into diagonal of matrix.
-  void CopyDiagFromVec(const VectorBase<Real> &v);
-
-  /* Accessing of sub-parts of the matrix. */
-
-  /// Return specific row of matrix [const].
-  inline const SubVector<Real> Row(MatrixIndexT i) const {
-    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
-                 static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return SubVector<Real>(data_ + (i * stride_), NumCols());
-  }
-
-  /// Return specific row of matrix.
-  inline SubVector<Real> Row(MatrixIndexT i) {
-    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
-                 static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return SubVector<Real>(data_ + (i * stride_), NumCols());
-  }
-
-  /// Return a sub-part of matrix.
-  inline SubMatrix<Real> Range(const MatrixIndexT row_offset,
-                               const MatrixIndexT num_rows,
-                               const MatrixIndexT col_offset,
-                               const MatrixIndexT num_cols) const {
-    return SubMatrix<Real>(*this, row_offset, num_rows,
-                           col_offset, num_cols);
-  }
-  inline SubMatrix<Real> RowRange(const MatrixIndexT row_offset,
-                                  const MatrixIndexT num_rows) const {
-    return SubMatrix<Real>(*this, row_offset, num_rows, 0, num_cols_);
-  }
-  inline SubMatrix<Real> ColRange(const MatrixIndexT col_offset,
-                                  const MatrixIndexT num_cols) const {
-    return SubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
-  }
-
-/*
-  /// Returns sum of all elements in matrix.
-  Real Sum() const;
-  /// Returns trace of matrix.
-  Real Trace(bool check_square = true) const;
-  // If check_square = true, will crash if matrix is not square.
-
-  /// Returns maximum element of matrix.
-  Real Max() const;
-  /// Returns minimum element of matrix.
-  Real Min() const;
-
-  /// Element by element multiplication with a given matrix.
-  void MulElements(const MatrixBase<Real> &A);
-
-  /// Divide each element by the corresponding element of a given matrix.
-  void DivElements(const MatrixBase<Real> &A);
-
-  /// Multiply each element with a scalar value.
-  void Scale(Real alpha);
-
-  /// Set, element-by-element, *this = max(*this, A)
-  void Max(const MatrixBase<Real> &A);
-  /// Set, element-by-element, *this = min(*this, A)
-  void Min(const MatrixBase<Real> &A);
-
-  /// Equivalent to (*this) = (*this) * diag(scale).  Scaling
-  /// each column by a scalar taken from that dimension of the vector.
-  void MulColsVec(const VectorBase<Real> &scale);
-
-  /// Equivalent to (*this) = diag(scale) * (*this).  Scaling
-  /// each row by a scalar taken from that dimension of the vector.
-  void MulRowsVec(const VectorBase<Real> &scale);
-
-  /// Divide each row into src.NumCols() equal groups, and then scale i'th row's
-  /// j'th group of elements by src(i, j).  Requires src.NumRows() ==
-  /// this->NumRows() and this->NumCols() % src.NumCols() == 0.
-  void MulRowsGroupMat(const MatrixBase<Real> &src);
-
-  /// Returns logdet of matrix.
-  Real LogDet(Real *det_sign = NULL) const;
-
-  /// matrix inverse.
-  /// if inverse_needed = false, will fill matrix with garbage.
-  /// (only useful if logdet wanted).
-  void Invert(Real *log_det = NULL, Real *det_sign = NULL,
-              bool inverse_needed = true);
-  /// matrix inverse [double].
-  /// if inverse_needed = false, will fill matrix with garbage
-  /// (only useful if logdet wanted).
-  /// Does inversion in double precision even if matrix was not double.
-  void InvertDouble(Real *LogDet = NULL, Real *det_sign = NULL,
-                      bool inverse_needed = true);
-*/
-  /// Inverts all the elements of the matrix
-  void InvertElements();
-/*
-  /// Transpose the matrix.  This one is only
-  /// applicable to square matrices (the one in the
-  /// Matrix child class works also for non-square.
-  void Transpose();
-
-*/
-  /// Copies column r from column indices[r] of src.
-  /// As a special case, if indexes[i] == -1, sets column i to zero.
-  /// all elements of "indices" must be in [-1, src.NumCols()-1],
-  /// and src.NumRows() must equal this.NumRows()
-  void CopyCols(const MatrixBase<Real> &src,
-                const MatrixIndexT *indices);
-
-  /// Copies row r from row indices[r] of src (does nothing
-  /// As a special case, if indexes[i] == -1, sets row i to zero.
-  /// all elements of "indices" must be in [-1, src.NumRows()-1],
-  /// and src.NumCols() must equal this.NumCols()
-  void CopyRows(const MatrixBase<Real> &src,
-                const MatrixIndexT *indices);
-
-  /// Add column indices[r] of src to column r.
-  /// As a special case, if indexes[i] == -1, skip column i
-  /// indices.size() must equal this->NumCols(),
-  /// all elements of "reorder" must be in [-1, src.NumCols()-1],
-  /// and src.NumRows() must equal this.NumRows()
-  //void AddCols(const MatrixBase<Real> &src,
-   //            const MatrixIndexT *indices);
-
-  /// Copies row r of this matrix from an array of floats at the location given
-  /// by src[r]. If any src[r] is NULL then this.Row(r) will be set to zero.
-  /// Note: we are using "pointer to const pointer to const object" for "src",
-  ///       because we may create "src" by calling Data() of const CuArray
-  void CopyRows(const Real *const *src);
-
-  /// Copies row r of this matrix to the array of floats at the location given
-  /// by dst[r]. If dst[r] is NULL, does not copy anywhere.  Requires that none
-  /// of the memory regions pointed to by the pointers in "dst" overlap (e.g.
-  /// none of the pointers should be the same).
-  void CopyToRows(Real *const *dst) const;
-
-  /// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]).
-  /// If indexes[r] < 0, does not add anything. all elements of "indexes" must
-  /// be in [-1, src.NumRows()-1], and src.NumCols() must equal this.NumCols().
- // void AddRows(Real alpha,
-  //             const MatrixBase<Real> &src,
-   //            const MatrixIndexT *indexes);
-
-  /// Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as the
-  /// beginning of a region of memory representing a vector of floats, of the
-  /// same length as this.NumCols(). If src[r] is NULL, does not add anything.
-  //void AddRows(Real alpha, const Real *const *src);
-
-  /// For each row r of this matrix, adds it (times alpha) to the array of
-  /// floats at the location given by dst[r]. If dst[r] is NULL, does not do
-  /// anything for that row. Requires that none of the memory regions pointed
-  /// to by the pointers in "dst" overlap (e.g. none of the pointers should be
-  /// the same).
-  //void AddToRows(Real alpha, Real *const *dst) const;
-
-  /// For each row i of *this, adds this->Row(i) to
-  /// dst->Row(indexes(i)) if indexes(i) >= 0, else do nothing.
-  /// Requires that all the indexes[i] that are >= 0
-  /// be distinct, otherwise the behavior is undefined.
-  //void AddToRows(Real alpha,
-   //              const MatrixIndexT *indexes,
+  public:
+    // so this child can access protected members of other instances.
+    friend class Matrix<Real>;
+    friend class SubMatrix<Real>;
+    // friend declarations for CUDA matrices (see ../cudamatrix/)
+
+    /// Returns number of rows (or zero for empty matrix).
+    inline MatrixIndexT NumRows() const { return num_rows_; }
+
+    /// Returns number of columns (or zero for empty matrix).
+    inline MatrixIndexT NumCols() const { return num_cols_; }
+
+    /// Stride (distance in memory between each row).  Will be >= NumCols.
+    inline MatrixIndexT Stride() const { return stride_; }
+
+    /// Returns size in bytes of the data held by the matrix.
+    size_t SizeInBytes() const {
+        return static_cast<size_t>(num_rows_) * static_cast<size_t>(stride_) *
+               sizeof(Real);
+    }
+
+    /// Gives pointer to raw data (const).
+    inline const Real *Data() const { return data_; }
+
+    /// Gives pointer to raw data (non-const).
+    inline Real *Data() { return data_; }
+
+    /// Returns pointer to data for one row (non-const)
+    inline Real *RowData(MatrixIndexT i) {
+        KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                     static_cast<UnsignedMatrixIndexT>(num_rows_));
+        return data_ + i * stride_;
+    }
+
+    /// Returns pointer to data for one row (const)
+    inline const Real *RowData(MatrixIndexT i) const {
+        KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                     static_cast<UnsignedMatrixIndexT>(num_rows_));
+        return data_ + i * stride_;
+    }
+
+    /// Indexing operator, non-const
+    /// (only checks sizes if compiled with -DKALDI_PARANOID)
+    inline Real &operator()(MatrixIndexT r, MatrixIndexT c) {
+        KALDI_PARANOID_ASSERT(
+            static_cast<UnsignedMatrixIndexT>(r) <
+                static_cast<UnsignedMatrixIndexT>(num_rows_) &&
+            static_cast<UnsignedMatrixIndexT>(c) <
+                static_cast<UnsignedMatrixIndexT>(num_cols_));
+        return *(data_ + r * stride_ + c);
+    }
+    /// Indexing operator, provided for ease of debugging (gdb doesn't work
+    /// with parenthesis operator).
+    Real &Index(MatrixIndexT r, MatrixIndexT c) { return (*this)(r, c); }
+
+    /// Indexing operator, const
+    /// (only checks sizes if compiled with -DKALDI_PARANOID)
+    inline const Real operator()(MatrixIndexT r, MatrixIndexT c) const {
+        KALDI_PARANOID_ASSERT(
+            static_cast<UnsignedMatrixIndexT>(r) <
+                static_cast<UnsignedMatrixIndexT>(num_rows_) &&
+            static_cast<UnsignedMatrixIndexT>(c) <
+                static_cast<UnsignedMatrixIndexT>(num_cols_));
+        return *(data_ + r * stride_ + c);
+    }
+
+    /*   Basic setting-to-special values functions. */
+
+    /// Sets matrix to zero.
+    void SetZero();
+    /// Sets all elements to a specific value.
+    void Set(Real);
+    /// Sets to zero, except ones along diagonal [for non-square matrices too]
+
+    /// Copy given matrix. (no resize is done).
+    template <typename OtherReal>
+    void CopyFromMat(const MatrixBase<OtherReal> &M,
+                     MatrixTransposeType trans = kNoTrans);
+
+    /// Copy from compressed matrix.
+    // void CopyFromMat(const CompressedMatrix &M);
+
+    /// Copy given tpmatrix. (no resize is done).
+    // template<typename OtherReal>
+    // void CopyFromTp(const TpMatrix<OtherReal> &M,
+    // MatrixTransposeType trans = kNoTrans);
+
+    /// Copy from CUDA matrix.  Implemented in ../cudamatrix/cu-matrix.h
+    // template<typename OtherReal>
+    // void CopyFromMat(const CuMatrixBase<OtherReal> &M,
+    // MatrixTransposeType trans = kNoTrans);
+
+    /// This function has two modes of operation.  If v.Dim() == NumRows() *
+    /// NumCols(), then treats the vector as a row-by-row concatenation of a
+    /// matrix and copies to *this.
+    /// if v.Dim() == NumCols(), it sets each row of *this to a copy of v.
+    void CopyRowsFromVec(const VectorBase<Real> &v);
+
+    /// This version of CopyRowsFromVec is implemented in
+    /// ../cudamatrix/cu-vector.cc
+    // void CopyRowsFromVec(const CuVectorBase<Real> &v);
+
+    template <typename OtherReal>
+    void CopyRowsFromVec(const VectorBase<OtherReal> &v);
+
+    /// Copies vector into matrix, column-by-column.
+    /// Note that rv.Dim() must either equal NumRows()*NumCols() or NumRows();
+    /// this has two modes of operation.
+    void CopyColsFromVec(const VectorBase<Real> &v);
+
+    /// Copy vector into specific column of matrix.
+    void CopyColFromVec(const VectorBase<Real> &v, const MatrixIndexT col);
+    /// Copy vector into specific row of matrix.
+    void CopyRowFromVec(const VectorBase<Real> &v, const MatrixIndexT row);
+    /// Copy vector into diagonal of matrix.
+    void CopyDiagFromVec(const VectorBase<Real> &v);
+
+    /* Accessing of sub-parts of the matrix. */
+
+    /// Return specific row of matrix [const].
+    inline const SubVector<Real> Row(MatrixIndexT i) const {
+        KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                     static_cast<UnsignedMatrixIndexT>(num_rows_));
+        return SubVector<Real>(data_ + (i * stride_), NumCols());
+    }
+
+    /// Return specific row of matrix.
+    inline SubVector<Real> Row(MatrixIndexT i) {
+        KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                     static_cast<UnsignedMatrixIndexT>(num_rows_));
+        return SubVector<Real>(data_ + (i * stride_), NumCols());
+    }
+
+    /// Return a sub-part of matrix.
+    inline SubMatrix<Real> Range(const MatrixIndexT row_offset,
+                                 const MatrixIndexT num_rows,
+                                 const MatrixIndexT col_offset,
+                                 const MatrixIndexT num_cols) const {
+        return SubMatrix<Real>(
+            *this, row_offset, num_rows, col_offset, num_cols);
+    }
+    inline SubMatrix<Real> RowRange(const MatrixIndexT row_offset,
+                                    const MatrixIndexT num_rows) const {
+        return SubMatrix<Real>(*this, row_offset, num_rows, 0, num_cols_);
+    }
+    inline SubMatrix<Real> ColRange(const MatrixIndexT col_offset,
+                                    const MatrixIndexT num_cols) const {
+        return SubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
+    }
+
+    /*
+      /// Returns sum of all elements in matrix.
+      Real Sum() const;
+      /// Returns trace of matrix.
+      Real Trace(bool check_square = true) const;
+      // If check_square = true, will crash if matrix is not square.
+
+      /// Returns maximum element of matrix.
+      Real Max() const;
+      /// Returns minimum element of matrix.
+      Real Min() const;
+
+      /// Element by element multiplication with a given matrix.
+      void MulElements(const MatrixBase<Real> &A);
+
+      /// Divide each element by the corresponding element of a given matrix.
+      void DivElements(const MatrixBase<Real> &A);
+
+      /// Multiply each element with a scalar value.
+      void Scale(Real alpha);
+
+      /// Set, element-by-element, *this = max(*this, A)
+      void Max(const MatrixBase<Real> &A);
+      /// Set, element-by-element, *this = min(*this, A)
+      void Min(const MatrixBase<Real> &A);
+
+      /// Equivalent to (*this) = (*this) * diag(scale).  Scaling
+      /// each column by a scalar taken from that dimension of the vector.
+      void MulColsVec(const VectorBase<Real> &scale);
+
+      /// Equivalent to (*this) = diag(scale) * (*this).  Scaling
+      /// each row by a scalar taken from that dimension of the vector.
+      void MulRowsVec(const VectorBase<Real> &scale);
+
+      /// Divide each row into src.NumCols() equal groups, and then scale i'th
+      row's
+      /// j'th group of elements by src(i, j).  Requires src.NumRows() ==
+      /// this->NumRows() and this->NumCols() % src.NumCols() == 0.
+      void MulRowsGroupMat(const MatrixBase<Real> &src);
+
+      /// Returns logdet of matrix.
+      Real LogDet(Real *det_sign = NULL) const;
+
+      /// matrix inverse.
+      /// if inverse_needed = false, will fill matrix with garbage.
+      /// (only useful if logdet wanted).
+      void Invert(Real *log_det = NULL, Real *det_sign = NULL,
+                  bool inverse_needed = true);
+      /// matrix inverse [double].
+      /// if inverse_needed = false, will fill matrix with garbage
+      /// (only useful if logdet wanted).
+      /// Does inversion in double precision even if matrix was not double.
+      void InvertDouble(Real *LogDet = NULL, Real *det_sign = NULL,
+                          bool inverse_needed = true);
+    */
+    /// Inverts all the elements of the matrix
+    void InvertElements();
+    /*
+      /// Transpose the matrix.  This one is only
+      /// applicable to square matrices (the one in the
+      /// Matrix child class works also for non-square.
+      void Transpose();
+
+    */
+    /// Copies column r from column indices[r] of src.
+    /// As a special case, if indexes[i] == -1, sets column i to zero.
+    /// all elements of "indices" must be in [-1, src.NumCols()-1],
+    /// and src.NumRows() must equal this.NumRows()
+    void CopyCols(const MatrixBase<Real> &src, const MatrixIndexT *indices);
+
+    /// Copies row r from row indices[r] of src (does nothing
+    /// As a special case, if indexes[i] == -1, sets row i to zero.
+    /// all elements of "indices" must be in [-1, src.NumRows()-1],
+    /// and src.NumCols() must equal this.NumCols()
+    void CopyRows(const MatrixBase<Real> &src, const MatrixIndexT *indices);
+
+    /// Add column indices[r] of src to column r.
+    /// As a special case, if indexes[i] == -1, skip column i
+    /// indices.size() must equal this->NumCols(),
+    /// all elements of "reorder" must be in [-1, src.NumCols()-1],
+    /// and src.NumRows() must equal this.NumRows()
+    // void AddCols(const MatrixBase<Real> &src,
+    //            const MatrixIndexT *indices);
+
+    /// Copies row r of this matrix from an array of floats at the location
+    /// given
+    /// by src[r]. If any src[r] is NULL then this.Row(r) will be set to zero.
+    /// Note: we are using "pointer to const pointer to const object" for "src",
+    ///       because we may create "src" by calling Data() of const CuArray
+    void CopyRows(const Real *const *src);
+
+    /// Copies row r of this matrix to the array of floats at the location given
+    /// by dst[r]. If dst[r] is NULL, does not copy anywhere.  Requires that
+    /// none
+    /// of the memory regions pointed to by the pointers in "dst" overlap (e.g.
+    /// none of the pointers should be the same).
+    void CopyToRows(Real *const *dst) const;
+
+    /// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]).
+    /// If indexes[r] < 0, does not add anything. all elements of "indexes" must
+    /// be in [-1, src.NumRows()-1], and src.NumCols() must equal
+    /// this.NumCols().
+    // void AddRows(Real alpha,
+    //             const MatrixBase<Real> &src,
+    //            const MatrixIndexT *indexes);
+
+    /// Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as
+    /// the
+    /// beginning of a region of memory representing a vector of floats, of the
+    /// same length as this.NumCols(). If src[r] is NULL, does not add anything.
+    // void AddRows(Real alpha, const Real *const *src);
+
+    /// For each row r of this matrix, adds it (times alpha) to the array of
+    /// floats at the location given by dst[r]. If dst[r] is NULL, does not do
+    /// anything for that row. Requires that none of the memory regions pointed
+    /// to by the pointers in "dst" overlap (e.g. none of the pointers should be
+    /// the same).
+    // void AddToRows(Real alpha, Real *const *dst) const;
+
+    /// For each row i of *this, adds this->Row(i) to
+    /// dst->Row(indexes(i)) if indexes(i) >= 0, else do nothing.
+    /// Requires that all the indexes[i] that are >= 0
+    /// be distinct, otherwise the behavior is undefined.
+    // void AddToRows(Real alpha,
+    //              const MatrixIndexT *indexes,
     //             MatrixBase<Real> *dst) const;
-/*
-  inline void ApplyPow(Real power) {
-    this -> Pow(*this, power);
-  }
-
-
-  inline void ApplyPowAbs(Real power, bool include_sign=false) {
-    this -> PowAbs(*this, power, include_sign);
-  }
-
-  inline void ApplyHeaviside() {
-    this -> Heaviside(*this);
-  }
-
-  inline void ApplyFloor(Real floor_val) {
-    this -> Floor(*this, floor_val);
-  }
-
-  inline void ApplyCeiling(Real ceiling_val) {
-    this -> Ceiling(*this, ceiling_val);
-  }
-
-  inline void ApplyExp() {
-    this -> Exp(*this);
-  }
-
-  inline void ApplyExpSpecial() {
-    this -> ExpSpecial(*this);
-  }
-
-  inline void ApplyExpLimited(Real lower_limit, Real upper_limit) {
-    this -> ExpLimited(*this, lower_limit, upper_limit);
-  }
-
-  inline void ApplyLog() {
-    this -> Log(*this);
-  }
-*/
-  /// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
-  /// P^{-1}.  Be careful: the relationship of D to the eigenvalues we output is
-  /// slightly complicated, due to the need for P to be real.  In the symmetric
-  /// case D is diagonal and real, but in
-  /// the non-symmetric case there may be complex-conjugate pairs of eigenvalues.
-  /// In this case, for the equation (*this) = P D P^{-1} to hold, D must actually
-  /// be block diagonal, with 2x2 blocks corresponding to any such pairs.  If a
-  /// pair is lambda +- i*mu, D will have a corresponding 2x2 block
-  /// [lambda, mu; -mu, lambda].
-  /// Note that if the input matrix (*this) is non-invertible, P may not be invertible
-  /// so in this case instead of the equation (*this) = P D P^{-1} holding, we have
-  /// instead (*this) P = P D.
-  ///
-  /// The non-member function CreateEigenvalueMatrix creates D from eigs_real and eigs_imag.
-  //void Eig(MatrixBase<Real> *P,
-   //        VectorBase<Real> *eigs_real,
+    /*
+      inline void ApplyPow(Real power) {
+        this -> Pow(*this, power);
+      }
+
+
+      inline void ApplyPowAbs(Real power, bool include_sign=false) {
+        this -> PowAbs(*this, power, include_sign);
+      }
+
+      inline void ApplyHeaviside() {
+        this -> Heaviside(*this);
+      }
+
+      inline void ApplyFloor(Real floor_val) {
+        this -> Floor(*this, floor_val);
+      }
+
+      inline void ApplyCeiling(Real ceiling_val) {
+        this -> Ceiling(*this, ceiling_val);
+      }
+
+      inline void ApplyExp() {
+        this -> Exp(*this);
+      }
+
+      inline void ApplyExpSpecial() {
+        this -> ExpSpecial(*this);
+      }
+
+      inline void ApplyExpLimited(Real lower_limit, Real upper_limit) {
+        this -> ExpLimited(*this, lower_limit, upper_limit);
+      }
+
+      inline void ApplyLog() {
+        this -> Log(*this);
+      }
+    */
+    /// Eigenvalue Decomposition of a square NxN matrix into the form (*this) =
+    /// P D
+    /// P^{-1}.  Be careful: the relationship of D to the eigenvalues we output
+    /// is
+    /// slightly complicated, due to the need for P to be real.  In the
+    /// symmetric
+    /// case D is diagonal and real, but in
+    /// the non-symmetric case there may be complex-conjugate pairs of
+    /// eigenvalues.
+    /// In this case, for the equation (*this) = P D P^{-1} to hold, D must
+    /// actually
+    /// be block diagonal, with 2x2 blocks corresponding to any such pairs.  If
+    /// a
+    /// pair is lambda +- i*mu, D will have a corresponding 2x2 block
+    /// [lambda, mu; -mu, lambda].
+    /// Note that if the input matrix (*this) is non-invertible, P may not be
+    /// invertible
+    /// so in this case instead of the equation (*this) = P D P^{-1} holding, we
+    /// have
+    /// instead (*this) P = P D.
+    ///
+    /// The non-member function CreateEigenvalueMatrix creates D from eigs_real
+    /// and eigs_imag.
+    // void Eig(MatrixBase<Real> *P,
+    //        VectorBase<Real> *eigs_real,
     //       VectorBase<Real> *eigs_imag) const;
 
-  /// The Power method attempts to take the matrix to a power using a method that
-  /// works in general for fractional and negative powers.  The input matrix must
-  /// be invertible and have reasonable condition (or we don't guarantee the
-  /// results.  The method is based on the eigenvalue decomposition.  It will
-  /// return false and leave the matrix unchanged, if at entry the matrix had
-  /// real negative eigenvalues (or if it had zero eigenvalues and the power was
-  /// negative).
-//  bool Power(Real pow);
-
-  /** Singular value decomposition
-     Major limitations:
-     For nonsquare matrices, we assume m>=n (NumRows >= NumCols), and we return
-     the "skinny" Svd, i.e. the matrix in the middle is diagonal, and the
-     one on the left is rectangular.
-
-     In Svd, *this = U*diag(S)*Vt.
-     Null pointers for U and/or Vt at input mean we do not want that output.  We
-     expect that S.Dim() == m, U is either NULL or m by n,
-     and v is either NULL or n by n.
-     The singular values are not sorted (use SortSvd for that).  */
-  //void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
-   //                   MatrixBase<Real> *Vt);  // Destroys calling matrix.
-
-  /// Compute SVD (*this) = U diag(s) Vt.   Note that the V in the call is already
-  /// transposed; the normal formulation is U diag(s) V^T.
-  /// Null pointers for U or V mean we don't want that output (this saves
-  /// compute).  The singular values are not sorted (use SortSvd for that).
-  //void Svd(VectorBase<Real> *s, MatrixBase<Real> *U,
-   //        MatrixBase<Real> *Vt) const;
-  /// Compute SVD but only retain the singular values.
-  //void Svd(VectorBase<Real> *s) const { Svd(s, NULL, NULL); }
-
-
-  /// Returns smallest singular value.
-  //Real MinSingularValue() const {
-   // Vector<Real> tmp(std::min(NumRows(), NumCols()));
-    //Svd(&tmp);
-    //return tmp.Min();
-  //}
-
-  //void TestUninitialized() const; // This function is designed so that if any element
-  // if the matrix is uninitialized memory, valgrind will complain.
-
-  /// Returns condition number by computing Svd.  Works even if cols > rows.
-  /// Returns infinity if all singular values are zero.
-  /*
-  Real Cond() const;
-
-  /// Returns true if matrix is Symmetric.
-  bool IsSymmetric(Real cutoff = 1.0e-05) const;  // replace magic number
-
-  /// Returns true if matrix is Diagonal.
-  bool IsDiagonal(Real cutoff = 1.0e-05) const;  // replace magic number
-
-  /// Returns true if the matrix is all zeros, except for ones on diagonal.  (it
-  /// does not have to be square).  More specifically, this function returns
-  /// false if for any i, j, (*this)(i, j) differs by more than cutoff from the
-  /// expression (i == j ? 1 : 0).
-  bool IsUnit(Real cutoff = 1.0e-05) const;     // replace magic number
-
-  /// Returns true if matrix is all zeros.
-  bool IsZero(Real cutoff = 1.0e-05) const;     // replace magic number
-
-  /// Frobenius norm, which is the sqrt of sum of square elements.  Same as Schatten 2-norm,
-  /// or just "2-norm".
-  Real FrobeniusNorm() const;
-
-  /// Returns true if ((*this)-other).FrobeniusNorm()
-  /// <= tol * (*this).FrobeniusNorm().
-  bool ApproxEqual(const MatrixBase<Real> &other, float tol = 0.01) const;
-
-  /// Tests for exact equality.  It's usually preferable to use ApproxEqual.
-  bool Equal(const MatrixBase<Real> &other) const;
-
-  /// largest absolute value.
-  Real LargestAbsElem() const;  // largest absolute value.
-
-  /// Returns log(sum(exp())) without exp overflow
-  /// If prune > 0.0, it uses a pruning beam, discarding
-  /// terms less than (max - prune).  Note: in future
-  /// we may change this so that if prune = 0.0, it takes
-  /// the max, so use -1 if you don't want to prune.
-  Real LogSumExp(Real prune = -1.0) const;
-
-  /// Apply soft-max to the collection of all elements of the
-  /// matrix and return normalizer (log sum of exponentials).
-  Real ApplySoftMax();
-
-  /// Set each element to the sigmoid of the corresponding element of "src".
-  void Sigmoid(const MatrixBase<Real> &src);
-
-  /// Sets each element to the Heaviside step function (x > 0 ? 1 : 0) of the
-  /// corresponding element in "src".  Note: in general you can make different
-  /// choices for x = 0, but for now please leave it as it (i.e. returning zero)
-  /// because it affects the RectifiedLinearComponent in the neural net code.
-  void Heaviside(const MatrixBase<Real> &src);
-
-  void Exp(const MatrixBase<Real> &src);
-
-  void Pow(const MatrixBase<Real> &src, Real power);
-
-  void Log(const MatrixBase<Real> &src);
-
-  /// Apply power to the absolute value of each element.
-  /// If include_sign is true, the result will be multiplied with
-  /// the sign of the input value.
-  /// If the power is negative and the input to the power is zero,
-  /// The output will be set zero. If include_sign is true, it will
-  /// multiply the result by the sign of the input.
-  void PowAbs(const MatrixBase<Real> &src, Real power, bool include_sign=false);
-
-  void Floor(const MatrixBase<Real> &src, Real floor_val);
-
-  void Ceiling(const MatrixBase<Real> &src, Real ceiling_val);
-
-  /// For each element x of the matrix, set it to
-  /// (x < 0 ? exp(x) : x + 1).  This function is used
-  /// in our RNNLM training.
-  void ExpSpecial(const MatrixBase<Real> &src);
-
-  /// This is equivalent to running:
-  /// Floor(src, lower_limit);
-  /// Ceiling(src, upper_limit);
-  /// Exp(src)
-  void ExpLimited(const MatrixBase<Real> &src, Real lower_limit, Real upper_limit);
-
-  /// Set each element to y = log(1 + exp(x))
-  void SoftHinge(const MatrixBase<Real> &src);
-
-  /// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 / p).
-  /// Requires src.NumRows() == this->NumRows() and  src.NumCols() % this->NumCols() == 0.
-  void GroupPnorm(const MatrixBase<Real> &src, Real power);
-
-  /// Calculate derivatives for the GroupPnorm function above...
-  /// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable),
-  /// and "output" is the result of the computation (i.e. the "this" of that function
-  /// call), and *this has the same dimension as "input", then it sets each element
-  /// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where
-  /// "output-elem" is whichever element of output depends on that input element.
-  void GroupPnormDeriv(const MatrixBase<Real> &input, const MatrixBase<Real> &output,
-                       Real power);
-
-  /// Apply the function y(i) = (max_{j = i*G}^{(i+1)*G-1} x_j
-  /// Requires src.NumRows() == this->NumRows() and  src.NumCols() % this->NumCols() == 0.
-  void GroupMax(const MatrixBase<Real> &src);
-
-  /// Calculate derivatives for the GroupMax function above, where
-  /// "input" is the input to the GroupMax function above (i.e. the "src" variable),
-  /// and "output" is the result of the computation (i.e. the "this" of that function
-  /// call), and *this must have the same dimension as "input". Each element
-  /// of *this will be set to 1 if the corresponding input equals the output of
-  /// the group, and 0 otherwise. The equals the function derivative where it is
-  /// defined (it's not defined where multiple inputs in the group are equal to the output).
-  void GroupMaxDeriv(const MatrixBase<Real> &input, const MatrixBase<Real> &output);
-
-  /// Set each element to the tanh of the corresponding element of "src".
-  void Tanh(const MatrixBase<Real> &src);
-
-  // Function used in backpropagating derivatives of the sigmoid function:
-  // element-by-element, set *this = diff * value * (1.0 - value).
-  void DiffSigmoid(const MatrixBase<Real> &value,
-                   const MatrixBase<Real> &diff);
-
-  // Function used in backpropagating derivatives of the tanh function:
-  // element-by-element, set *this = diff * (1.0 - value^2).
-  void DiffTanh(const MatrixBase<Real> &value,
-                const MatrixBase<Real> &diff);
-*/
-  /** Uses Svd to compute the eigenvalue decomposition of a symmetric positive
-   * semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an
-   * orthogonal matrix so rP^{-1} = rP^T.   Throws exception if input was not
-   * positive semi-definite (check_thresh controls how stringent the check is;
-   * set it to 2 to ensure it won't ever complain, but it will zero out negative
-   * dimensions in your matrix.
-   *
-   * Caution: if you want the eigenvalues, it may make more sense to convert to
-   * SpMatrix and use Eig() function there, which uses eigenvalue decomposition
-   * directly rather than SVD.
-  */
+    /// The Power method attempts to take the matrix to a power using a method
+    /// that
+    /// works in general for fractional and negative powers.  The input matrix
+    /// must
+    /// be invertible and have reasonable condition (or we don't guarantee the
+    /// results.  The method is based on the eigenvalue decomposition.  It will
+    /// return false and leave the matrix unchanged, if at entry the matrix had
+    /// real negative eigenvalues (or if it had zero eigenvalues and the power
+    /// was
+    /// negative).
+    //  bool Power(Real pow);
+
+    /** Singular value decomposition
+       Major limitations:
+       For nonsquare matrices, we assume m>=n (NumRows >= NumCols), and we
+       return
+       the "skinny" Svd, i.e. the matrix in the middle is diagonal, and the
+       one on the left is rectangular.
+
+       In Svd, *this = U*diag(S)*Vt.
+       Null pointers for U and/or Vt at input mean we do not want that output.
+       We
+       expect that S.Dim() == m, U is either NULL or m by n,
+       and v is either NULL or n by n.
+       The singular values are not sorted (use SortSvd for that).  */
+    // void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
+    //                   MatrixBase<Real> *Vt);  // Destroys calling matrix.
+
+    /// Compute SVD (*this) = U diag(s) Vt.   Note that the V in the call is
+    /// already
+    /// transposed; the normal formulation is U diag(s) V^T.
+    /// Null pointers for U or V mean we don't want that output (this saves
+    /// compute).  The singular values are not sorted (use SortSvd for that).
+    // void Svd(VectorBase<Real> *s, MatrixBase<Real> *U,
+    //        MatrixBase<Real> *Vt) const;
+    /// Compute SVD but only retain the singular values.
+    // void Svd(VectorBase<Real> *s) const { Svd(s, NULL, NULL); }
+
+
+    /// Returns smallest singular value.
+    // Real MinSingularValue() const {
+    // Vector<Real> tmp(std::min(NumRows(), NumCols()));
+    // Svd(&tmp);
+    // return tmp.Min();
+    //}
 
-  /// stream read.
-  /// Use instead of stream<<*this, if you want to add to existing contents.
-  // Will throw exception on failure.
-  void Read(std::istream & in, bool binary);
-  /// write to stream.
-  void Write(std::ostream & out, bool binary) const;
-
-  // Below is internal methods for Svd, user does not have to know about this.
- protected:
-
-  ///  Initializer, callable only from child.
-  explicit MatrixBase(Real *data, MatrixIndexT cols, MatrixIndexT rows, MatrixIndexT stride) :
-    data_(data), num_cols_(cols), num_rows_(rows), stride_(stride) {
-    KALDI_ASSERT_IS_FLOATING_TYPE(Real);
-  }
-
-  ///  Initializer, callable only from child.
-  /// Empty initializer, for un-initialized matrix.
-  explicit MatrixBase(): data_(NULL) {
-    KALDI_ASSERT_IS_FLOATING_TYPE(Real);
-  }
-
-  // Make sure pointers to MatrixBase cannot be deleted.
-  ~MatrixBase() { }
-
-  /// A workaround that allows SubMatrix to get a pointer to non-const data
-  /// for const Matrix. Unfortunately C++ does not allow us to declare a
-  /// "public const" inheritance or anything like that, so it would require
-  /// a lot of work to make the SubMatrix class totally const-correct--
-  /// we would have to override many of the Matrix functions.
-  inline Real*  Data_workaround() const {
-    return data_;
-  }
-
-  /// data memory area
-  Real*   data_;
-
-  /// these attributes store the real matrix size as it is stored in memory
-  /// including memalignment
-  MatrixIndexT    num_cols_;   /// < Number of columns
-  MatrixIndexT    num_rows_;   /// < Number of rows
-  /** True number of columns for the internal matrix. This number may differ
-   * from num_cols_ as memory alignment might be used. */
-  MatrixIndexT    stride_;
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase);
+    // void TestUninitialized() const; // This function is designed so that if
+    // any element
+    // if the matrix is uninitialized memory, valgrind will complain.
+
+    /// Returns condition number by computing Svd.  Works even if cols > rows.
+    /// Returns infinity if all singular values are zero.
+    /*
+    Real Cond() const;
+
+    /// Returns true if matrix is Symmetric.
+    bool IsSymmetric(Real cutoff = 1.0e-05) const;  // replace magic number
+
+    /// Returns true if matrix is Diagonal.
+    bool IsDiagonal(Real cutoff = 1.0e-05) const;  // replace magic number
+
+    /// Returns true if the matrix is all zeros, except for ones on diagonal.
+    (it
+    /// does not have to be square).  More specifically, this function returns
+    /// false if for any i, j, (*this)(i, j) differs by more than cutoff from
+    the
+    /// expression (i == j ? 1 : 0).
+    bool IsUnit(Real cutoff = 1.0e-05) const;     // replace magic number
+
+    /// Returns true if matrix is all zeros.
+    bool IsZero(Real cutoff = 1.0e-05) const;     // replace magic number
+
+    /// Frobenius norm, which is the sqrt of sum of square elements.  Same as
+    Schatten 2-norm,
+    /// or just "2-norm".
+    Real FrobeniusNorm() const;
+
+    /// Returns true if ((*this)-other).FrobeniusNorm()
+    /// <= tol * (*this).FrobeniusNorm().
+    bool ApproxEqual(const MatrixBase<Real> &other, float tol = 0.01) const;
+
+    /// Tests for exact equality.  It's usually preferable to use ApproxEqual.
+    bool Equal(const MatrixBase<Real> &other) const;
+
+    /// largest absolute value.
+    Real LargestAbsElem() const;  // largest absolute value.
+
+    /// Returns log(sum(exp())) without exp overflow
+    /// If prune > 0.0, it uses a pruning beam, discarding
+    /// terms less than (max - prune).  Note: in future
+    /// we may change this so that if prune = 0.0, it takes
+    /// the max, so use -1 if you don't want to prune.
+    Real LogSumExp(Real prune = -1.0) const;
+
+    /// Apply soft-max to the collection of all elements of the
+    /// matrix and return normalizer (log sum of exponentials).
+    Real ApplySoftMax();
+
+    /// Set each element to the sigmoid of the corresponding element of "src".
+    void Sigmoid(const MatrixBase<Real> &src);
+
+    /// Sets each element to the Heaviside step function (x > 0 ? 1 : 0) of the
+    /// corresponding element in "src".  Note: in general you can make different
+    /// choices for x = 0, but for now please leave it as it (i.e. returning
+    zero)
+    /// because it affects the RectifiedLinearComponent in the neural net code.
+    void Heaviside(const MatrixBase<Real> &src);
+
+    void Exp(const MatrixBase<Real> &src);
+
+    void Pow(const MatrixBase<Real> &src, Real power);
+
+    void Log(const MatrixBase<Real> &src);
+
+    /// Apply power to the absolute value of each element.
+    /// If include_sign is true, the result will be multiplied with
+    /// the sign of the input value.
+    /// If the power is negative and the input to the power is zero,
+    /// The output will be set zero. If include_sign is true, it will
+    /// multiply the result by the sign of the input.
+    void PowAbs(const MatrixBase<Real> &src, Real power, bool
+    include_sign=false);
+
+    void Floor(const MatrixBase<Real> &src, Real floor_val);
+
+    void Ceiling(const MatrixBase<Real> &src, Real ceiling_val);
+
+    /// For each element x of the matrix, set it to
+    /// (x < 0 ? exp(x) : x + 1).  This function is used
+    /// in our RNNLM training.
+    void ExpSpecial(const MatrixBase<Real> &src);
+
+    /// This is equivalent to running:
+    /// Floor(src, lower_limit);
+    /// Ceiling(src, upper_limit);
+    /// Exp(src)
+    void ExpLimited(const MatrixBase<Real> &src, Real lower_limit, Real
+    upper_limit);
+
+    /// Set each element to y = log(1 + exp(x))
+    void SoftHinge(const MatrixBase<Real> &src);
+
+    /// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 /
+    p).
+    /// Requires src.NumRows() == this->NumRows() and  src.NumCols() %
+    this->NumCols() == 0.
+    void GroupPnorm(const MatrixBase<Real> &src, Real power);
+
+    /// Calculate derivatives for the GroupPnorm function above...
+    /// if "input" is the input to the GroupPnorm function above (i.e. the "src"
+    variable),
+    /// and "output" is the result of the computation (i.e. the "this" of that
+    function
+    /// call), and *this has the same dimension as "input", then it sets each
+    element
+    /// of *this to the derivative d(output-elem)/d(input-elem) for each element
+    of "input", where
+    /// "output-elem" is whichever element of output depends on that input
+    element.
+    void GroupPnormDeriv(const MatrixBase<Real> &input, const MatrixBase<Real>
+    &output,
+                         Real power);
+
+    /// Apply the function y(i) = (max_{j = i*G}^{(i+1)*G-1} x_j
+    /// Requires src.NumRows() == this->NumRows() and  src.NumCols() %
+    this->NumCols() == 0.
+    void GroupMax(const MatrixBase<Real> &src);
+
+    /// Calculate derivatives for the GroupMax function above, where
+    /// "input" is the input to the GroupMax function above (i.e. the "src"
+    variable),
+    /// and "output" is the result of the computation (i.e. the "this" of that
+    function
+    /// call), and *this must have the same dimension as "input". Each element
+    /// of *this will be set to 1 if the corresponding input equals the output
+    of
+    /// the group, and 0 otherwise. The equals the function derivative where it
+    is
+    /// defined (it's not defined where multiple inputs in the group are equal
+    to the output).
+    void GroupMaxDeriv(const MatrixBase<Real> &input, const MatrixBase<Real>
+    &output);
+
+    /// Set each element to the tanh of the corresponding element of "src".
+    void Tanh(const MatrixBase<Real> &src);
+
+    // Function used in backpropagating derivatives of the sigmoid function:
+    // element-by-element, set *this = diff * value * (1.0 - value).
+    void DiffSigmoid(const MatrixBase<Real> &value,
+                     const MatrixBase<Real> &diff);
+
+    // Function used in backpropagating derivatives of the tanh function:
+    // element-by-element, set *this = diff * (1.0 - value^2).
+    void DiffTanh(const MatrixBase<Real> &value,
+                  const MatrixBase<Real> &diff);
+  */
+    /** Uses Svd to compute the eigenvalue decomposition of a symmetric positive
+     * semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an
+     * orthogonal matrix so rP^{-1} = rP^T.   Throws exception if input was not
+     * positive semi-definite (check_thresh controls how stringent the check is;
+     * set it to 2 to ensure it won't ever complain, but it will zero out
+     * negative
+     * dimensions in your matrix.
+     *
+     * Caution: if you want the eigenvalues, it may make more sense to convert
+     * to
+     * SpMatrix and use Eig() function there, which uses eigenvalue
+     * decomposition
+     * directly rather than SVD.
+    */
+
+    /// stream read.
+    /// Use instead of stream<<*this, if you want to add to existing contents.
+    // Will throw exception on failure.
+    void Read(std::istream &in, bool binary);
+    /// write to stream.
+    void Write(std::ostream &out, bool binary) const;
+
+    // Below is internal methods for Svd, user does not have to know about this.
+  protected:
+    ///  Initializer, callable only from child.
+    explicit MatrixBase(Real *data,
+                        MatrixIndexT cols,
+                        MatrixIndexT rows,
+                        MatrixIndexT stride)
+        : data_(data), num_cols_(cols), num_rows_(rows), stride_(stride) {
+        KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+    }
+
+    ///  Initializer, callable only from child.
+    /// Empty initializer, for un-initialized matrix.
+    explicit MatrixBase() : data_(NULL) { KALDI_ASSERT_IS_FLOATING_TYPE(Real); }
+
+    // Make sure pointers to MatrixBase cannot be deleted.
+    ~MatrixBase() {}
+
+    /// A workaround that allows SubMatrix to get a pointer to non-const data
+    /// for const Matrix. Unfortunately C++ does not allow us to declare a
+    /// "public const" inheritance or anything like that, so it would require
+    /// a lot of work to make the SubMatrix class totally const-correct--
+    /// we would have to override many of the Matrix functions.
+    inline Real *Data_workaround() const { return data_; }
+
+    /// data memory area
+    Real *data_;
+
+    /// these attributes store the real matrix size as it is stored in memory
+    /// including memalignment
+    MatrixIndexT num_cols_;  /// < Number of columns
+    MatrixIndexT num_rows_;  /// < Number of rows
+    /** True number of columns for the internal matrix. This number may differ
+     * from num_cols_ as memory alignment might be used. */
+    MatrixIndexT stride_;
+
+  private:
+    KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase);
 };
 
 /// A class for storing matrices.
-template<typename Real>
+template <typename Real>
 class Matrix : public MatrixBase<Real> {
- public:
-
-  /// Empty constructor.
-  Matrix();
-
-  /// Basic constructor.
-  Matrix(const MatrixIndexT r, const MatrixIndexT c,
-         MatrixResizeType resize_type = kSetZero,
-         MatrixStrideType stride_type = kDefaultStride):
-      MatrixBase<Real>() { Resize(r, c, resize_type, stride_type); }
-
-  /// Swaps the contents of *this and *other.  Shallow swap.
-  void Swap(Matrix<Real> *other);
-
-  /// Constructor from any MatrixBase. Can also copy with transpose.
-  /// Allocates new memory.
-  explicit Matrix(const MatrixBase<Real> & M,
-                  MatrixTransposeType trans = kNoTrans);
+  public:
+    /// Empty constructor.
+    Matrix();
+
+    /// Basic constructor.
+    Matrix(const MatrixIndexT r,
+           const MatrixIndexT c,
+           MatrixResizeType resize_type = kSetZero,
+           MatrixStrideType stride_type = kDefaultStride)
+        : MatrixBase<Real>() {
+        Resize(r, c, resize_type, stride_type);
+    }
+
+    /// Swaps the contents of *this and *other.  Shallow swap.
+    void Swap(Matrix<Real> *other);
+
+    /// Constructor from any MatrixBase. Can also copy with transpose.
+    /// Allocates new memory.
+    explicit Matrix(const MatrixBase<Real> &M,
+                    MatrixTransposeType trans = kNoTrans);
 
-  /// Same as above, but need to avoid default copy constructor.
-  Matrix(const Matrix<Real> & M);  //  (cannot make explicit)
+    /// Same as above, but need to avoid default copy constructor.
+    Matrix(const Matrix<Real> &M);  //  (cannot make explicit)
 
-  /// Copy constructor: as above, but from another type.
-  template<typename OtherReal>
-  explicit Matrix(const MatrixBase<OtherReal> & M,
+    /// Copy constructor: as above, but from another type.
+    template <typename OtherReal>
+    explicit Matrix(const MatrixBase<OtherReal> &M,
                     MatrixTransposeType trans = kNoTrans);
 
-  /// Copy constructor taking TpMatrix...
-  //template <typename OtherReal>
-  //explicit Matrix(const TpMatrix<OtherReal> & M,
-                  //MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() {
-    //if (trans == kNoTrans) {
-      //Resize(M.NumRows(), M.NumCols(), kUndefined);
-      //this->CopyFromTp(M);
+    /// Copy constructor taking TpMatrix...
+    // template <typename OtherReal>
+    // explicit Matrix(const TpMatrix<OtherReal> & M,
+    // MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() {
+    // if (trans == kNoTrans) {
+    // Resize(M.NumRows(), M.NumCols(), kUndefined);
+    // this->CopyFromTp(M);
     //} else {
-      //Resize(M.NumCols(), M.NumRows(), kUndefined);
-      //this->CopyFromTp(M, kTrans);
+    // Resize(M.NumCols(), M.NumRows(), kUndefined);
+    // this->CopyFromTp(M, kTrans);
+    //}
     //}
-  //}
-
-  /// read from stream.
-  // Unlike one in base, allows resizing.
-  void Read(std::istream & in, bool binary);
-
-  /// Remove a specified row.
-  void RemoveRow(MatrixIndexT i);
-
-  /// Transpose the matrix.  Works for non-square
-  /// matrices as well as square ones.
-  //void Transpose();
-
-  /// Distructor to free matrices.
-  ~Matrix() { Destroy(); }
-
-  /// Sets matrix to a specified size (zero is OK as long as both r and c are
-  /// zero).  The value of the new data depends on resize_type:
-  ///   -if kSetZero, the new data will be zero
-  ///   -if kUndefined, the new data will be undefined
-  ///   -if kCopyData, the new data will be the same as the old data in any
-  ///      shared positions, and zero elsewhere.
-  ///
-  /// You can set stride_type to kStrideEqualNumCols to force the stride
-  /// to equal the number of columns; by default it is set so that the stride
-  /// in bytes is a multiple of 16.
-  ///
-  /// This function takes time proportional to the number of data elements.
-  void Resize(const MatrixIndexT r,
-              const MatrixIndexT c,
-              MatrixResizeType resize_type = kSetZero,
-              MatrixStrideType stride_type = kDefaultStride);
-
-  /// Assignment operator that takes MatrixBase.
-  Matrix<Real> &operator = (const MatrixBase<Real> &other) {
-    if (MatrixBase<Real>::NumRows() != other.NumRows() ||
-        MatrixBase<Real>::NumCols() != other.NumCols())
-      Resize(other.NumRows(), other.NumCols(), kUndefined);
-    MatrixBase<Real>::CopyFromMat(other);
-    return *this;
-  }
-
-  /// Assignment operator. Needed for inclusion in std::vector.
-  Matrix<Real> &operator = (const Matrix<Real> &other) {
-    if (MatrixBase<Real>::NumRows() != other.NumRows() ||
-        MatrixBase<Real>::NumCols() != other.NumCols())
-      Resize(other.NumRows(), other.NumCols(), kUndefined);
-    MatrixBase<Real>::CopyFromMat(other);
-    return *this;
-  }
-
-
- private:
-  /// Deallocates memory and sets to empty matrix (dimension 0, 0).
-  void Destroy();
-
-  /// Init assumes the current class contents are invalid (i.e. junk or have
-  /// already been freed), and it sets the matrix to newly allocated memory with
-  /// the specified number of rows and columns.  r == c == 0 is acceptable.  The data
-  /// memory contents will be undefined.
-  void Init(const MatrixIndexT r,
-            const MatrixIndexT c,
-            const MatrixStrideType stride_type);
 
+    /// read from stream.
+    // Unlike one in base, allows resizing.
+    void Read(std::istream &in, bool binary);
+
+    /// Remove a specified row.
+    void RemoveRow(MatrixIndexT i);
+
+    /// Transpose the matrix.  Works for non-square
+    /// matrices as well as square ones.
+    // void Transpose();
+
+    /// Distructor to free matrices.
+    ~Matrix() { Destroy(); }
+
+    /// Sets matrix to a specified size (zero is OK as long as both r and c are
+    /// zero).  The value of the new data depends on resize_type:
+    ///   -if kSetZero, the new data will be zero
+    ///   -if kUndefined, the new data will be undefined
+    ///   -if kCopyData, the new data will be the same as the old data in any
+    ///      shared positions, and zero elsewhere.
+    ///
+    /// You can set stride_type to kStrideEqualNumCols to force the stride
+    /// to equal the number of columns; by default it is set so that the stride
+    /// in bytes is a multiple of 16.
+    ///
+    /// This function takes time proportional to the number of data elements.
+    void Resize(const MatrixIndexT r,
+                const MatrixIndexT c,
+                MatrixResizeType resize_type = kSetZero,
+                MatrixStrideType stride_type = kDefaultStride);
+
+    /// Assignment operator that takes MatrixBase.
+    Matrix<Real> &operator=(const MatrixBase<Real> &other) {
+        if (MatrixBase<Real>::NumRows() != other.NumRows() ||
+            MatrixBase<Real>::NumCols() != other.NumCols())
+            Resize(other.NumRows(), other.NumCols(), kUndefined);
+        MatrixBase<Real>::CopyFromMat(other);
+        return *this;
+    }
+
+    /// Assignment operator. Needed for inclusion in std::vector.
+    Matrix<Real> &operator=(const Matrix<Real> &other) {
+        if (MatrixBase<Real>::NumRows() != other.NumRows() ||
+            MatrixBase<Real>::NumCols() != other.NumCols())
+            Resize(other.NumRows(), other.NumCols(), kUndefined);
+        MatrixBase<Real>::CopyFromMat(other);
+        return *this;
+    }
+
+
+  private:
+    /// Deallocates memory and sets to empty matrix (dimension 0, 0).
+    void Destroy();
+
+    /// Init assumes the current class contents are invalid (i.e. junk or have
+    /// already been freed), and it sets the matrix to newly allocated memory
+    /// with
+    /// the specified number of rows and columns.  r == c == 0 is acceptable.
+    /// The data
+    /// memory contents will be undefined.
+    void Init(const MatrixIndexT r,
+              const MatrixIndexT c,
+              const MatrixStrideType stride_type);
 };
 /// @} end "addtogroup matrix_group"
 
@@ -710,38 +756,38 @@ class Matrix : public MatrixBase<Real> {
 /// A structure containing the HTK header.
 /// [TODO: change the style of the variables to Kaldi-compliant]
 
-template<typename Real>
+template <typename Real>
 class SubMatrix : public MatrixBase<Real> {
- public:
-  // Initialize a SubMatrix from part of a matrix; this is
-  // a bit like A(b:c, d:e) in Matlab.
-  // This initializer is against the proper semantics of "const", since
-  // SubMatrix can change its contents.  It would be hard to implement
-  // a "const-safe" version of this class.
-  SubMatrix(const MatrixBase<Real>& T,
-            const MatrixIndexT ro,  // row offset, 0 < ro < NumRows()
-            const MatrixIndexT r,   // number of rows, r > 0
-            const MatrixIndexT co,  // column offset, 0 < co < NumCols()
-            const MatrixIndexT c);   // number of columns, c > 0
-
-  // This initializer is mostly intended for use in CuMatrix and related
-  // classes.  Be careful!
-  SubMatrix(Real *data,
-            MatrixIndexT num_rows,
-            MatrixIndexT num_cols,
-            MatrixIndexT stride);
-
-  ~SubMatrix<Real>() {}
-
-  /// This type of constructor is needed for Range() to work [in Matrix base
-  /// class]. Cannot make it explicit.
-  SubMatrix<Real> (const SubMatrix &other):
-  MatrixBase<Real> (other.data_, other.num_cols_, other.num_rows_,
-                    other.stride_) {}
-
- private:
-  /// Disallow assignment.
-  SubMatrix<Real> &operator = (const SubMatrix<Real> &other);
+  public:
+    // Initialize a SubMatrix from part of a matrix; this is
+    // a bit like A(b:c, d:e) in Matlab.
+    // This initializer is against the proper semantics of "const", since
+    // SubMatrix can change its contents.  It would be hard to implement
+    // a "const-safe" version of this class.
+    SubMatrix(const MatrixBase<Real> &T,
+              const MatrixIndexT ro,  // row offset, 0 < ro < NumRows()
+              const MatrixIndexT r,   // number of rows, r > 0
+              const MatrixIndexT co,  // column offset, 0 < co < NumCols()
+              const MatrixIndexT c);  // number of columns, c > 0
+
+    // This initializer is mostly intended for use in CuMatrix and related
+    // classes.  Be careful!
+    SubMatrix(Real *data,
+              MatrixIndexT num_rows,
+              MatrixIndexT num_cols,
+              MatrixIndexT stride);
+
+    ~SubMatrix<Real>() {}
+
+    /// This type of constructor is needed for Range() to work [in Matrix base
+    /// class]. Cannot make it explicit.
+    SubMatrix<Real>(const SubMatrix &other)
+        : MatrixBase<Real>(
+              other.data_, other.num_cols_, other.num_rows_, other.stride_) {}
+
+  private:
+    /// Disallow assignment.
+    SubMatrix<Real> &operator=(const SubMatrix<Real> &other);
 };
 
 /// @} End of "addtogroup matrix_funcs_io".
@@ -794,25 +840,33 @@ Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
 /// the same as U->NumCols(), and we sort s from greatest to least absolute
 /// value (if sort_on_absolute_value == true) or greatest to least value
 /// otherwise, moving the columns of U, if it exists, and the rows of Vt, if it
-/// exists, around in the same way.  Note: the "absolute value" part won't matter
+/// exists, around in the same way.  Note: the "absolute value" part won't
+matter
 /// if this is an actual SVD, since singular values are non-negative.
 template<typename Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
                                      MatrixBase<Real>* Vt = NULL,
                                      bool sort_on_absolute_value = true);
 
-/// Creates the eigenvalue matrix D that is part of the decomposition used Matrix::Eig.
+/// Creates the eigenvalue matrix D that is part of the decomposition used
+Matrix::Eig.
 /// D will be block-diagonal with blocks of size 1 (for real eigenvalues) or 2x2
-/// for complex pairs.  If a complex pair is lambda +- i*mu, D will have a corresponding
+/// for complex pairs.  If a complex pair is lambda +- i*mu, D will have a
+corresponding
 /// 2x2 block [lambda, mu; -mu, lambda].
-/// This function will throw if any complex eigenvalues are not in complex conjugate
+/// This function will throw if any complex eigenvalues are not in complex
+conjugate
 /// pairs (or the members of such pairs are not consecutively numbered).
 template<typename Real>
-void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real> &imag,
+void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real>
+&imag,
                             MatrixBase<Real> *D);
 
-/// The following function is used in Matrix::Power, and separately tested, so we
-/// declare it here mainly for the testing code to see.  It takes a complex value to
-/// a power using a method that will work for noninteger powers (but will fail if the
+/// The following function is used in Matrix::Power, and separately tested, so
+we
+/// declare it here mainly for the testing code to see.  It takes a complex
+value to
+/// a power using a method that will work for noninteger powers (but will fail
+if the
 /// complex value is real and negative).
 template<typename Real>
 bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);
@@ -823,19 +877,19 @@ bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);
 
 /// \addtogroup matrix_funcs_io
 /// @{
-template<typename Real>
-std::ostream & operator << (std::ostream & Out, const MatrixBase<Real> & M);
+template <typename Real>
+std::ostream &operator<<(std::ostream &Out, const MatrixBase<Real> &M);
 
-template<typename Real>
-std::istream & operator >> (std::istream & In, MatrixBase<Real> & M);
+template <typename Real>
+std::istream &operator>>(std::istream &In, MatrixBase<Real> &M);
 
 // The Matrix read allows resizing, so we override the MatrixBase one.
-template<typename Real>
-std::istream & operator >> (std::istream & In, Matrix<Real> & M);
+template <typename Real>
+std::istream &operator>>(std::istream &In, Matrix<Real> &M);
 
-template<typename Real>
+template <typename Real>
 bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
-  return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
+    return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
 }
 
 /// @} end of \addtogroup matrix_funcs_io
@@ -844,7 +898,6 @@ bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
 }  // namespace kaldi
 
 
-
 // we need to include the implementation and some
 // template specializations.
 #include "matrix/kaldi-matrix-inl.h"
diff --git a/runtime/engine/common/matrix/kaldi-vector-inl.h b/runtime/engine/common/matrix/kaldi-vector-inl.h
index 826202764..b3075e590 100644
--- a/runtime/engine/common/matrix/kaldi-vector-inl.h
+++ b/runtime/engine/common/matrix/kaldi-vector-inl.h
@@ -26,32 +26,33 @@
 
 namespace kaldi {
 
-template<typename Real>
-std::ostream & operator << (std::ostream &os, const VectorBase<Real> &rv) {
-  rv.Write(os, false);
-  return os;
+template <typename Real>
+std::ostream &operator<<(std::ostream &os, const VectorBase<Real> &rv) {
+    rv.Write(os, false);
+    return os;
 }
 
-template<typename Real>
-std::istream &operator >> (std::istream &is, VectorBase<Real> &rv) {
-  rv.Read(is, false);
-  return is;
+template <typename Real>
+std::istream &operator>>(std::istream &is, VectorBase<Real> &rv) {
+    rv.Read(is, false);
+    return is;
 }
 
-template<typename Real>
-std::istream &operator >> (std::istream &is, Vector<Real> &rv) {
-  rv.Read(is, false);
-  return is;
+template <typename Real>
+std::istream &operator>>(std::istream &is, Vector<Real> &rv) {
+    rv.Read(is, false);
+    return is;
 }
 
-//template<>
-//template<>
-//void VectorBase<float>::AddVec(const float alpha, const VectorBase<float> &rv);
+// template<>
+// template<>
+// void VectorBase<float>::AddVec(const float alpha, const VectorBase<float>
+// &rv);
 
-//template<>
-//template<>
-//void VectorBase<double>::AddVec<double>(const double alpha,
-                                        //const VectorBase<double> &rv);
+// template<>
+// template<>
+// void VectorBase<double>::AddVec<double>(const double alpha,
+// const VectorBase<double> &rv);
 
 }  // namespace kaldi
 
diff --git a/runtime/engine/common/matrix/kaldi-vector.cc b/runtime/engine/common/matrix/kaldi-vector.cc
index 9f2bd08e6..90817f0d4 100644
--- a/runtime/engine/common/matrix/kaldi-vector.cc
+++ b/runtime/engine/common/matrix/kaldi-vector.cc
@@ -23,80 +23,85 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
+#include "matrix/kaldi-vector.h"
 #include <algorithm>
 #include <string>
-#include "matrix/kaldi-vector.h"
 #include "matrix/kaldi-matrix.h"
 
 namespace kaldi {
 
-template<typename Real>
+template <typename Real>
 inline void Vector<Real>::Init(const MatrixIndexT dim) {
-  KALDI_ASSERT(dim >= 0);
-  if (dim == 0) {
-    this->dim_ = 0;
-    this->data_ = NULL;
-    return;
-  }
-  MatrixIndexT size;
-  void *data;
-  void *free_data;
+    KALDI_ASSERT(dim >= 0);
+    if (dim == 0) {
+        this->dim_ = 0;
+        this->data_ = NULL;
+        return;
+    }
+    MatrixIndexT size;
+    void *data;
+    void *free_data;
 
-  size = dim * sizeof(Real);
+    size = dim * sizeof(Real);
 
-  if ((data = KALDI_MEMALIGN(16, size, &free_data)) != NULL) {
-    this->data_ = static_cast<Real*> (data);
-    this->dim_ = dim;
-  } else {
-    throw std::bad_alloc();
-  }
+    if ((data = KALDI_MEMALIGN(16, size, &free_data)) != NULL) {
+        this->data_ = static_cast<Real *>(data);
+        this->dim_ = dim;
+    } else {
+        throw std::bad_alloc();
+    }
 }
 
 
-template<typename Real>
-void Vector<Real>::Resize(const MatrixIndexT dim, MatrixResizeType resize_type) {
-
-  // the next block uses recursion to handle what we have to do if
-  // resize_type == kCopyData.
-  if (resize_type == kCopyData) {
-    if (this->data_ == NULL || dim == 0) resize_type = kSetZero;  // nothing to copy.
-    else if (this->dim_ == dim) { return; } // nothing to do.
-    else {
-      // set tmp to a vector of the desired size.
-      Vector<Real> tmp(dim, kUndefined);
-      if (dim > this->dim_) {
-        memcpy(tmp.data_, this->data_, sizeof(Real)*this->dim_);
-        memset(tmp.data_+this->dim_, 0, sizeof(Real)*(dim-this->dim_));
-      } else {
-        memcpy(tmp.data_, this->data_, sizeof(Real)*dim);
-      }
-      tmp.Swap(this);
-      // and now let tmp go out of scope, deleting what was in *this.
-      return;
+template <typename Real>
+void Vector<Real>::Resize(const MatrixIndexT dim,
+                          MatrixResizeType resize_type) {
+    // the next block uses recursion to handle what we have to do if
+    // resize_type == kCopyData.
+    if (resize_type == kCopyData) {
+        if (this->data_ == NULL || dim == 0)
+            resize_type = kSetZero;  // nothing to copy.
+        else if (this->dim_ == dim) {
+            return;
+        }  // nothing to do.
+        else {
+            // set tmp to a vector of the desired size.
+            Vector<Real> tmp(dim, kUndefined);
+            if (dim > this->dim_) {
+                memcpy(tmp.data_, this->data_, sizeof(Real) * this->dim_);
+                memset(tmp.data_ + this->dim_,
+                       0,
+                       sizeof(Real) * (dim - this->dim_));
+            } else {
+                memcpy(tmp.data_, this->data_, sizeof(Real) * dim);
+            }
+            tmp.Swap(this);
+            // and now let tmp go out of scope, deleting what was in *this.
+            return;
+        }
     }
-  }
-  // At this point, resize_type == kSetZero or kUndefined.
+    // At this point, resize_type == kSetZero or kUndefined.
 
-  if (this->data_ != NULL) {
-    if (this->dim_ == dim) {
-      if (resize_type == kSetZero) this->SetZero();
-      return;
-    } else {
-      Destroy();
+    if (this->data_ != NULL) {
+        if (this->dim_ == dim) {
+            if (resize_type == kSetZero) this->SetZero();
+            return;
+        } else {
+            Destroy();
+        }
     }
-  }
-  Init(dim);
-  if (resize_type == kSetZero) this->SetZero();
+    Init(dim);
+    if (resize_type == kSetZero) this->SetZero();
 }
 
 
 /// Copy data from another vector
-template<typename Real>
+template <typename Real>
 void VectorBase<Real>::CopyFromVec(const VectorBase<Real> &v) {
-  KALDI_ASSERT(Dim() == v.Dim());
-  if (data_ != v.data_) {
-    std::memcpy(this->data_, v.data_, dim_ * sizeof(Real));
-  }
+    KALDI_ASSERT(Dim() == v.Dim());
+    if (data_ != v.data_) {
+        std::memcpy(this->data_, v.data_, dim_ * sizeof(Real));
+    }
 }
 
 /*
@@ -107,10 +112,14 @@ void VectorBase<Real>::CopyFromPacked(const PackedMatrix<OtherReal>& M) {
   this->CopyFromVec(v);
 }
 // instantiate the template.
-template void VectorBase<float>::CopyFromPacked(const PackedMatrix<double> &other);
-template void VectorBase<float>::CopyFromPacked(const PackedMatrix<float> &other);
-template void VectorBase<double>::CopyFromPacked(const PackedMatrix<double> &other);
-template void VectorBase<double>::CopyFromPacked(const PackedMatrix<float> &other);
+template void VectorBase<float>::CopyFromPacked(const PackedMatrix<double>
+&other);
+template void VectorBase<float>::CopyFromPacked(const PackedMatrix<float>
+&other);
+template void VectorBase<double>::CopyFromPacked(const PackedMatrix<double>
+&other);
+template void VectorBase<double>::CopyFromPacked(const PackedMatrix<float>
+&other);
 
 /// Load data into the vector
 template<typename Real>
@@ -119,50 +128,48 @@ void VectorBase<Real>::CopyFromPtr(const Real *data, MatrixIndexT sz) {
   std::memcpy(this->data_, data, Dim() * sizeof(Real));
 }*/
 
-template<typename Real>
-template<typename OtherReal>
+template <typename Real>
+template <typename OtherReal>
 void VectorBase<Real>::CopyFromVec(const VectorBase<OtherReal> &other) {
-  KALDI_ASSERT(dim_ == other.Dim());
-  Real * __restrict__  ptr = data_;
-  const OtherReal * __restrict__ other_ptr = other.Data();
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    ptr[i] = other_ptr[i];
+    KALDI_ASSERT(dim_ == other.Dim());
+    Real *__restrict__ ptr = data_;
+    const OtherReal *__restrict__ other_ptr = other.Data();
+    for (MatrixIndexT i = 0; i < dim_; i++) ptr[i] = other_ptr[i];
 }
 
 template void VectorBase<float>::CopyFromVec(const VectorBase<double> &other);
 template void VectorBase<double>::CopyFromVec(const VectorBase<float> &other);
 
 // Remove element from the vector. The vector is not reallocated
-template<typename Real>
+template <typename Real>
 void Vector<Real>::RemoveElement(MatrixIndexT i) {
-  KALDI_ASSERT(i <  this->dim_ && "Access out of vector");
-  for (MatrixIndexT j = i + 1; j <  this->dim_; j++)
-    this->data_[j-1] =  this->data_[j];
-  this->dim_--;
+    KALDI_ASSERT(i < this->dim_ && "Access out of vector");
+    for (MatrixIndexT j = i + 1; j < this->dim_; j++)
+        this->data_[j - 1] = this->data_[j];
+    this->dim_--;
 }
 
 
 /// Deallocates memory and sets object to empty vector.
-template<typename Real>
+template <typename Real>
 void Vector<Real>::Destroy() {
-  /// we need to free the data block if it was defined
-  if (this->data_ != NULL)
-    KALDI_MEMALIGN_FREE(this->data_);
-  this->data_ = NULL;
-  this->dim_ = 0;
+    /// we need to free the data block if it was defined
+    if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_);
+    this->data_ = NULL;
+    this->dim_ = 0;
 }
 
-template<typename Real>
+template <typename Real>
 void VectorBase<Real>::SetZero() {
-  std::memset(data_, 0, dim_ * sizeof(Real));
+    std::memset(data_, 0, dim_ * sizeof(Real));
 }
 
-template<typename Real>
+template <typename Real>
 bool VectorBase<Real>::IsZero(Real cutoff) const {
-  Real abs_max = 0.0;
-  for (MatrixIndexT i = 0; i < Dim(); i++)
-    abs_max = std::max(std::abs(data_[i]), abs_max);
-  return (abs_max <= cutoff);
+    Real abs_max = 0.0;
+    for (MatrixIndexT i = 0; i < Dim(); i++)
+        abs_max = std::max(std::abs(data_[i]), abs_max);
+    return (abs_max <= cutoff);
 }
 
 /*
@@ -201,104 +208,107 @@ MatrixIndexT VectorBase<Real>::RandCategorical() const {
                    // returns exactly 1, or due to roundoff.
 }*/
 
-template<typename Real>
+template <typename Real>
 void VectorBase<Real>::Set(Real f) {
-  // Why not use memset here?
-  // The basic unit of memset is a byte.
-  // If f != 0 and sizeof(Real) > 1, then we cannot use memset.
-  if (f == 0) {
-    this->SetZero(); // calls std::memset
-  } else {
-    for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = f; }
-  }
+    // Why not use memset here?
+    // The basic unit of memset is a byte.
+    // If f != 0 and sizeof(Real) > 1, then we cannot use memset.
+    if (f == 0) {
+        this->SetZero();  // calls std::memset
+    } else {
+        for (MatrixIndexT i = 0; i < dim_; i++) {
+            data_[i] = f;
+        }
+    }
 }
 
-template<typename Real>
+template <typename Real>
 void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
-  KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
+    KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
 
-  Real *inc_data = data_;
-  const MatrixIndexT cols = mat.NumCols(), rows = mat.NumRows();
+    Real *inc_data = data_;
+    const MatrixIndexT cols = mat.NumCols(), rows = mat.NumRows();
 
-  if (mat.Stride() == mat.NumCols()) {
-    memcpy(inc_data, mat.Data(), cols*rows*sizeof(Real));
-  } else {
-    for (MatrixIndexT i = 0; i < rows; i++) {
-      // copy the data to the propper position
-      memcpy(inc_data, mat.RowData(i), cols * sizeof(Real));
-      // set new copy position
-      inc_data += cols;
+    if (mat.Stride() == mat.NumCols()) {
+        memcpy(inc_data, mat.Data(), cols * rows * sizeof(Real));
+    } else {
+        for (MatrixIndexT i = 0; i < rows; i++) {
+            // copy the data to the propper position
+            memcpy(inc_data, mat.RowData(i), cols * sizeof(Real));
+            // set new copy position
+            inc_data += cols;
+        }
     }
-  }
 }
 
-template<typename Real>
-template<typename OtherReal>
+template <typename Real>
+template <typename OtherReal>
 void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<OtherReal> &mat) {
-  KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
-  Real *vec_data = data_;
-  const MatrixIndexT cols = mat.NumCols(),
-      rows = mat.NumRows();
-
-  for (MatrixIndexT i = 0; i < rows; i++) {
-    const OtherReal *mat_row = mat.RowData(i);
-    for (MatrixIndexT j = 0; j < cols; j++) {
-      vec_data[j] = static_cast<Real>(mat_row[j]);
+    KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
+    Real *vec_data = data_;
+    const MatrixIndexT cols = mat.NumCols(), rows = mat.NumRows();
+
+    for (MatrixIndexT i = 0; i < rows; i++) {
+        const OtherReal *mat_row = mat.RowData(i);
+        for (MatrixIndexT j = 0; j < cols; j++) {
+            vec_data[j] = static_cast<Real>(mat_row[j]);
+        }
+        vec_data += cols;
     }
-    vec_data += cols;
-  }
 }
 
-template
-void VectorBase<float>::CopyRowsFromMat(const MatrixBase<double> &mat);
-template
-void VectorBase<double>::CopyRowsFromMat(const MatrixBase<float> &mat);
+template void VectorBase<float>::CopyRowsFromMat(const MatrixBase<double> &mat);
+template void VectorBase<double>::CopyRowsFromMat(const MatrixBase<float> &mat);
 
 
-template<typename Real>
+template <typename Real>
 void VectorBase<Real>::CopyColsFromMat(const MatrixBase<Real> &mat) {
-  KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
+    KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
 
-  Real*       inc_data = data_;
-  const MatrixIndexT  cols     = mat.NumCols(), rows = mat.NumRows(), stride = mat.Stride();
-  const Real *mat_inc_data = mat.Data();
+    Real *inc_data = data_;
+    const MatrixIndexT cols = mat.NumCols(), rows = mat.NumRows(),
+                       stride = mat.Stride();
+    const Real *mat_inc_data = mat.Data();
 
-  for (MatrixIndexT i = 0; i < cols; i++) {
-    for (MatrixIndexT j = 0; j < rows; j++) {
-      inc_data[j] = mat_inc_data[j*stride];
+    for (MatrixIndexT i = 0; i < cols; i++) {
+        for (MatrixIndexT j = 0; j < rows; j++) {
+            inc_data[j] = mat_inc_data[j * stride];
+        }
+        mat_inc_data++;
+        inc_data += rows;
     }
-    mat_inc_data++;
-    inc_data += rows;
-  }
 }
 
-template<typename Real>
-void VectorBase<Real>::CopyRowFromMat(const MatrixBase<Real> &mat, MatrixIndexT row) {
-  KALDI_ASSERT(row < mat.NumRows());
-  KALDI_ASSERT(dim_ == mat.NumCols());
-  const Real *mat_row = mat.RowData(row);
-  memcpy(data_, mat_row, sizeof(Real)*dim_);
+template <typename Real>
+void VectorBase<Real>::CopyRowFromMat(const MatrixBase<Real> &mat,
+                                      MatrixIndexT row) {
+    KALDI_ASSERT(row < mat.NumRows());
+    KALDI_ASSERT(dim_ == mat.NumCols());
+    const Real *mat_row = mat.RowData(row);
+    memcpy(data_, mat_row, sizeof(Real) * dim_);
 }
 
-template<typename Real>
-template<typename OtherReal>
-void VectorBase<Real>::CopyRowFromMat(const MatrixBase<OtherReal> &mat, MatrixIndexT row) {
-  KALDI_ASSERT(row < mat.NumRows());
-  KALDI_ASSERT(dim_ == mat.NumCols());
-  const OtherReal *mat_row = mat.RowData(row);
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    data_[i] = static_cast<Real>(mat_row[i]);
+template <typename Real>
+template <typename OtherReal>
+void VectorBase<Real>::CopyRowFromMat(const MatrixBase<OtherReal> &mat,
+                                      MatrixIndexT row) {
+    KALDI_ASSERT(row < mat.NumRows());
+    KALDI_ASSERT(dim_ == mat.NumCols());
+    const OtherReal *mat_row = mat.RowData(row);
+    for (MatrixIndexT i = 0; i < dim_; i++)
+        data_[i] = static_cast<Real>(mat_row[i]);
 }
 
-template
-void VectorBase<float>::CopyRowFromMat(const MatrixBase<double> &mat, MatrixIndexT row);
-template
-void VectorBase<double>::CopyRowFromMat(const MatrixBase<float> &mat, MatrixIndexT row);
+template void VectorBase<float>::CopyRowFromMat(const MatrixBase<double> &mat,
+                                                MatrixIndexT row);
+template void VectorBase<double>::CopyRowFromMat(const MatrixBase<float> &mat,
+                                                 MatrixIndexT row);
 
 /*
 template<typename Real>
 template<typename OtherReal>
-void VectorBase<Real>::CopyRowFromSp(const SpMatrix<OtherReal> &sp, MatrixIndexT row) {
+void VectorBase<Real>::CopyRowFromSp(const SpMatrix<OtherReal> &sp, MatrixIndexT
+row) {
   KALDI_ASSERT(row < sp.NumRows());
   KALDI_ASSERT(dim_ == sp.NumCols());
 
@@ -313,13 +323,17 @@ void VectorBase<Real>::CopyRowFromSp(const SpMatrix<OtherReal> &sp, MatrixIndexT
 }
 
 template
-void VectorBase<float>::CopyRowFromSp(const SpMatrix<double> &mat, MatrixIndexT row);
+void VectorBase<float>::CopyRowFromSp(const SpMatrix<double> &mat, MatrixIndexT
+row);
 template
-void VectorBase<double>::CopyRowFromSp(const SpMatrix<float> &mat, MatrixIndexT row);
+void VectorBase<double>::CopyRowFromSp(const SpMatrix<float> &mat, MatrixIndexT
+row);
 template
-void VectorBase<float>::CopyRowFromSp(const SpMatrix<float> &mat, MatrixIndexT row);
+void VectorBase<float>::CopyRowFromSp(const SpMatrix<float> &mat, MatrixIndexT
+row);
 template
-void VectorBase<double>::CopyRowFromSp(const SpMatrix<double> &mat, MatrixIndexT row);
+void VectorBase<double>::CopyRowFromSp(const SpMatrix<double> &mat, MatrixIndexT
+row);
 
 // takes absolute value of the elements to a power.
 // Throws exception if could not (but only for power != 1 and power != 2).
@@ -333,7 +347,8 @@ void VectorBase<Real>::ApplyPowAbs(Real power, bool include_sign) {
       data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * data_[i] * data_[i];
   } else if (power == 0.5) {
     for (MatrixIndexT i = 0; i < dim_; i++) {
-      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * std::sqrt(std::abs(data_[i]));
+      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) *
+std::sqrt(std::abs(data_[i]));
     }
   } else if (power < 0.0) {
     for (MatrixIndexT i = 0; i < dim_; i++) {
@@ -346,7 +361,8 @@ void VectorBase<Real>::ApplyPowAbs(Real power, bool include_sign) {
     }
   } else {
     for (MatrixIndexT i = 0; i < dim_; i++) {
-      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * pow(std::abs(data_[i]), power);
+      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) *
+pow(std::abs(data_[i]), power);
       if (data_[i] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
         KALDI_ERR << "Could not raise element "  << i << "to power "
                   << power << ": returned value = " << data_[i];
@@ -401,7 +417,8 @@ Real VectorBase<Real>::Norm(Real p) const {
 }
 
 template<typename Real>
-bool VectorBase<Real>::ApproxEqual(const VectorBase<Real> &other, float tol) const {
+bool VectorBase<Real>::ApproxEqual(const VectorBase<Real> &other, float tol)
+const {
   if (dim_ != other.dim_) KALDI_ERR << "ApproxEqual: size mismatch "
                                     << dim_ << " vs. " << other.dim_;
   KALDI_ASSERT(tol >= 0.0);
@@ -499,677 +516,716 @@ Real VectorBase<Real>::Min(MatrixIndexT *index_out) const {
 }*/
 
 
-template<typename Real>
-template<typename OtherReal>
-void VectorBase<Real>::CopyColFromMat(const MatrixBase<OtherReal> &mat, MatrixIndexT col) {
-  KALDI_ASSERT(col < mat.NumCols());
-  KALDI_ASSERT(dim_ == mat.NumRows());
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    data_[i] = mat(i, col);
-  // can't do this very efficiently so don't really bother. could improve this though.
+template <typename Real>
+template <typename OtherReal>
+void VectorBase<Real>::CopyColFromMat(const MatrixBase<OtherReal> &mat,
+                                      MatrixIndexT col) {
+    KALDI_ASSERT(col < mat.NumCols());
+    KALDI_ASSERT(dim_ == mat.NumRows());
+    for (MatrixIndexT i = 0; i < dim_; i++) data_[i] = mat(i, col);
+    // can't do this very efficiently so don't really bother. could improve this
+    // though.
 }
 // instantiate the template above.
-template
-void VectorBase<float>::CopyColFromMat(const MatrixBase<float> &mat, MatrixIndexT col);
-template
-void VectorBase<float>::CopyColFromMat(const MatrixBase<double> &mat, MatrixIndexT col);
-template
-void VectorBase<double>::CopyColFromMat(const MatrixBase<float> &mat, MatrixIndexT col);
-template
-void VectorBase<double>::CopyColFromMat(const MatrixBase<double> &mat, MatrixIndexT col);
-
-//template<typename Real>
-//void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
-  //KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
-  //cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1);
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::CopyDiagFromPacked(const PackedMatrix<Real> &M) {
-  //KALDI_ASSERT(dim_ == M.NumCols());
-  //for (MatrixIndexT i = 0; i < dim_; i++)
-    //data_[i] = M(i, i);
-  //// could make this more efficient.
-//}
-
-//template<typename Real>
-//Real VectorBase<Real>::Sum() const {
-  //// Do a dot-product with a size-1 array with a stride of 0 to
-  //// implement sum. This allows us to access SIMD operations in a
-  //// cross-platform way via your BLAS library.
-  //Real one(1);
-  //return cblas_Xdot(dim_, data_, 1, &one, 0);
-//}
-
-//template<typename Real>
-//Real VectorBase<Real>::SumLog() const {
-  //double sum_log = 0.0;
-  //double prod = 1.0;
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //prod *= data_[i];
-    //// Possible future work (arnab): change these magic values to pre-defined
-    //// constants
-    //if (prod < 1.0e-10 || prod > 1.0e+10) {
-      //sum_log += Log(prod);
-      //prod = 1.0;
-    //}
-  //}
-  //if (prod != 1.0) sum_log += Log(prod);
-  //return sum_log;
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
-  //KALDI_ASSERT(dim_ == M.NumCols());
-  //MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_;
-  //Real *data = data_;
-
-  //// implement the function according to a dimension cutoff for computation efficiency
-  //if (num_rows <= 64) {
-    //cblas_Xscal(dim, beta, data, 1);
-    //const Real *m_data = M.Data();
-    //for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride)
-      //cblas_Xaxpy(dim, alpha, m_data, 1, data, 1);
-
-  //} else {
-    //Vector<Real> ones(M.NumRows());
-    //ones.Set(1.0);
-    //this->AddMatVec(alpha, M, kTrans, ones, beta);
-  //}
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
-  //KALDI_ASSERT(dim_ == M.NumRows());
-  //MatrixIndexT num_cols = M.NumCols();
-
-  //// implement the function according to a dimension cutoff for computation efficiency
-  //if (num_cols <= 64) {
-    //for (MatrixIndexT i = 0; i < dim_; i++) {
-      //double sum = 0.0;
-      //const Real *src = M.RowData(i);
-      //for (MatrixIndexT j = 0; j < num_cols; j++)
-        //sum += src[j];
-      //data_[i] = alpha * sum + beta * data_[i];
-    //}
-  //} else {
-    //Vector<Real> ones(M.NumCols());
-    //ones.Set(1.0);
-    //this->AddMatVec(alpha, M, kNoTrans, ones, beta);
-  //}
-//}
-
-//template<typename Real>
-//Real VectorBase<Real>::LogSumExp(Real prune) const {
-  //Real sum;
-  //if (sizeof(sum) == 8) sum = kLogZeroDouble;
-  //else sum = kLogZeroFloat;
-  //Real max_elem = Max(), cutoff;
-  //if (sizeof(Real) == 4) cutoff = max_elem + kMinLogDiffFloat;
-  //else cutoff = max_elem + kMinLogDiffDouble;
-  //if (prune > 0.0 && max_elem - prune > cutoff) // explicit pruning...
-    //cutoff = max_elem - prune;
-
-  //double sum_relto_max_elem = 0.0;
-
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //BaseFloat f = data_[i];
-    //if (f >= cutoff)
-      //sum_relto_max_elem += Exp(f - max_elem);
-  //}
-  //return max_elem + Log(sum_relto_max_elem);
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::InvertElements() {
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //data_[i] = static_cast<Real>(1 / data_[i]);
-  //}
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::ApplyLog() {
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //if (data_[i] < 0.0)
-      //KALDI_ERR << "Trying to take log of a negative number.";
-    //data_[i] = Log(data_[i]);
-  //}
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::ApplyLogAndCopy(const VectorBase<Real> &v) {
-  //KALDI_ASSERT(dim_ == v.Dim());
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //data_[i] = Log(v(i));
-  //}
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::ApplyExp() {
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //data_[i] = Exp(data_[i]);
-  //}
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::ApplyAbs() {
-  //for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = std::abs(data_[i]); }
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::Floor(const VectorBase<Real> &v, Real floor_val, MatrixIndexT *floored_count) {
-  //KALDI_ASSERT(dim_ == v.dim_);
-  //if (floored_count == nullptr) {
-    //for (MatrixIndexT i = 0; i < dim_; i++) {
-      //data_[i] = std::max(v.data_[i], floor_val);
-    //}
-  //} else {
-    //MatrixIndexT num_floored = 0;
-    //for (MatrixIndexT i = 0; i < dim_; i++) {
-      //if (v.data_[i] < floor_val) {
-        //data_[i] = floor_val;
-        //num_floored++;
-      //} else {
-        //data_[i] = v.data_[i];
-      //}
-    //}
-    //*floored_count = num_floored;
-  //}
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::Ceiling(const VectorBase<Real> &v, Real ceil_val, MatrixIndexT *ceiled_count) {
-  //KALDI_ASSERT(dim_ == v.dim_);
-  //if (ceiled_count == nullptr) {
-    //for (MatrixIndexT i = 0; i < dim_; i++) {
-      //data_[i] = std::min(v.data_[i], ceil_val);
-    //}
-  //} else {
-    //MatrixIndexT num_changed = 0;
-    //for (MatrixIndexT i = 0; i < dim_; i++) {
-      //if (v.data_[i] > ceil_val) {
-        //data_[i] = ceil_val;
-        //num_changed++;
-      //} else {
-        //data_[i] = v.data_[i];
-      //}
-    //}
-    //*ceiled_count = num_changed;
-  //}
-//}
-
-//template<typename Real>
-//MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) {
-  //KALDI_ASSERT(floor_vec.Dim() == dim_);
-  //MatrixIndexT num_floored = 0;
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //if (data_[i] < floor_vec(i)) {
-      //data_[i] = floor_vec(i);
-      //num_floored++;
-    //}
-  //}
-  //return num_floored;
-//}
-
-//template<typename Real>
-//Real VectorBase<Real>::ApplySoftMax() {
-  //Real max = this->Max(), sum = 0.0;
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //sum += (data_[i] = Exp(data_[i] - max));
-  //}
-  //this->Scale(1.0 / sum);
-  //return max + Log(sum);
-//}
-
-//template<typename Real>
-//Real VectorBase<Real>::ApplyLogSoftMax() {
-  //Real max = this->Max(), sum = 0.0;
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //sum += Exp((data_[i] -= max));
-  //}
-  //sum = Log(sum);
-  //this->Add(-1.0 * sum);
-  //return max + sum;
+template void VectorBase<float>::CopyColFromMat(const MatrixBase<float> &mat,
+                                                MatrixIndexT col);
+template void VectorBase<float>::CopyColFromMat(const MatrixBase<double> &mat,
+                                                MatrixIndexT col);
+template void VectorBase<double>::CopyColFromMat(const MatrixBase<float> &mat,
+                                                 MatrixIndexT col);
+template void VectorBase<double>::CopyColFromMat(const MatrixBase<double> &mat,
+                                                 MatrixIndexT col);
+
+// template<typename Real>
+// void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
+// KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
+// cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1);
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::CopyDiagFromPacked(const PackedMatrix<Real> &M) {
+// KALDI_ASSERT(dim_ == M.NumCols());
+// for (MatrixIndexT i = 0; i < dim_; i++)
+// data_[i] = M(i, i);
+//// could make this more efficient.
+//}
+
+// template<typename Real>
+// Real VectorBase<Real>::Sum() const {
+//// Do a dot-product with a size-1 array with a stride of 0 to
+//// implement sum. This allows us to access SIMD operations in a
+//// cross-platform way via your BLAS library.
+// Real one(1);
+// return cblas_Xdot(dim_, data_, 1, &one, 0);
+//}
+
+// template<typename Real>
+// Real VectorBase<Real>::SumLog() const {
+// double sum_log = 0.0;
+// double prod = 1.0;
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// prod *= data_[i];
+//// Possible future work (arnab): change these magic values to pre-defined
+//// constants
+// if (prod < 1.0e-10 || prod > 1.0e+10) {
+// sum_log += Log(prod);
+// prod = 1.0;
+//}
+//}
+// if (prod != 1.0) sum_log += Log(prod);
+// return sum_log;
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M,
+// Real beta) {
+// KALDI_ASSERT(dim_ == M.NumCols());
+// MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_;
+// Real *data = data_;
+
+//// implement the function according to a dimension cutoff for computation
+///efficiency
+// if (num_rows <= 64) {
+// cblas_Xscal(dim, beta, data, 1);
+// const Real *m_data = M.Data();
+// for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride)
+// cblas_Xaxpy(dim, alpha, m_data, 1, data, 1);
+
+//} else {
+// Vector<Real> ones(M.NumRows());
+// ones.Set(1.0);
+// this->AddMatVec(alpha, M, kTrans, ones, beta);
+//}
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::AddColSumMat(Real alpha, const MatrixBase<Real> &M,
+// Real beta) {
+// KALDI_ASSERT(dim_ == M.NumRows());
+// MatrixIndexT num_cols = M.NumCols();
+
+//// implement the function according to a dimension cutoff for computation
+///efficiency
+// if (num_cols <= 64) {
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// double sum = 0.0;
+// const Real *src = M.RowData(i);
+// for (MatrixIndexT j = 0; j < num_cols; j++)
+// sum += src[j];
+// data_[i] = alpha * sum + beta * data_[i];
+//}
+//} else {
+// Vector<Real> ones(M.NumCols());
+// ones.Set(1.0);
+// this->AddMatVec(alpha, M, kNoTrans, ones, beta);
+//}
+//}
+
+// template<typename Real>
+// Real VectorBase<Real>::LogSumExp(Real prune) const {
+// Real sum;
+// if (sizeof(sum) == 8) sum = kLogZeroDouble;
+// else sum = kLogZeroFloat;
+// Real max_elem = Max(), cutoff;
+// if (sizeof(Real) == 4) cutoff = max_elem + kMinLogDiffFloat;
+// else cutoff = max_elem + kMinLogDiffDouble;
+// if (prune > 0.0 && max_elem - prune > cutoff) // explicit pruning...
+// cutoff = max_elem - prune;
+
+// double sum_relto_max_elem = 0.0;
+
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// BaseFloat f = data_[i];
+// if (f >= cutoff)
+// sum_relto_max_elem += Exp(f - max_elem);
+//}
+// return max_elem + Log(sum_relto_max_elem);
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::InvertElements() {
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] = static_cast<Real>(1 / data_[i]);
+//}
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::ApplyLog() {
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// if (data_[i] < 0.0)
+// KALDI_ERR << "Trying to take log of a negative number.";
+// data_[i] = Log(data_[i]);
+//}
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::ApplyLogAndCopy(const VectorBase<Real> &v) {
+// KALDI_ASSERT(dim_ == v.Dim());
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] = Log(v(i));
+//}
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::ApplyExp() {
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] = Exp(data_[i]);
+//}
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::ApplyAbs() {
+// for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = std::abs(data_[i]); }
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::Floor(const VectorBase<Real> &v, Real floor_val,
+// MatrixIndexT *floored_count) {
+// KALDI_ASSERT(dim_ == v.dim_);
+// if (floored_count == nullptr) {
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] = std::max(v.data_[i], floor_val);
+//}
+//} else {
+// MatrixIndexT num_floored = 0;
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// if (v.data_[i] < floor_val) {
+// data_[i] = floor_val;
+// num_floored++;
+//} else {
+// data_[i] = v.data_[i];
+//}
+//}
+//*floored_count = num_floored;
+//}
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::Ceiling(const VectorBase<Real> &v, Real ceil_val,
+// MatrixIndexT *ceiled_count) {
+// KALDI_ASSERT(dim_ == v.dim_);
+// if (ceiled_count == nullptr) {
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] = std::min(v.data_[i], ceil_val);
+//}
+//} else {
+// MatrixIndexT num_changed = 0;
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// if (v.data_[i] > ceil_val) {
+// data_[i] = ceil_val;
+// num_changed++;
+//} else {
+// data_[i] = v.data_[i];
+//}
+//}
+//*ceiled_count = num_changed;
+//}
+//}
+
+// template<typename Real>
+// MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec)
+// {
+// KALDI_ASSERT(floor_vec.Dim() == dim_);
+// MatrixIndexT num_floored = 0;
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// if (data_[i] < floor_vec(i)) {
+// data_[i] = floor_vec(i);
+// num_floored++;
+//}
+//}
+// return num_floored;
+//}
+
+// template<typename Real>
+// Real VectorBase<Real>::ApplySoftMax() {
+// Real max = this->Max(), sum = 0.0;
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// sum += (data_[i] = Exp(data_[i] - max));
+//}
+// this->Scale(1.0 / sum);
+// return max + Log(sum);
+//}
+
+// template<typename Real>
+// Real VectorBase<Real>::ApplyLogSoftMax() {
+// Real max = this->Max(), sum = 0.0;
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// sum += Exp((data_[i] -= max));
+//}
+// sum = Log(sum);
+// this->Add(-1.0 * sum);
+// return max + sum;
 //}
 
 //#ifdef HAVE_MKL
-//template<>
-//void VectorBase<float>::Tanh(const VectorBase<float> &src) {
-  //KALDI_ASSERT(dim_ == src.dim_);
-  //vsTanh(dim_, src.data_, data_);
+// template<>
+// void VectorBase<float>::Tanh(const VectorBase<float> &src) {
+// KALDI_ASSERT(dim_ == src.dim_);
+// vsTanh(dim_, src.data_, data_);
 //}
-//template<>
-//void VectorBase<double>::Tanh(const VectorBase<double> &src) {
-  //KALDI_ASSERT(dim_ == src.dim_);
-  //vdTanh(dim_, src.data_, data_);
+// template<>
+// void VectorBase<double>::Tanh(const VectorBase<double> &src) {
+// KALDI_ASSERT(dim_ == src.dim_);
+// vdTanh(dim_, src.data_, data_);
 //}
 //#else
-//template<typename Real>
-//void VectorBase<Real>::Tanh(const VectorBase<Real> &src) {
-  //KALDI_ASSERT(dim_ == src.dim_);
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //Real x = src.data_[i];
-    //if (x > 0.0) {
-      //Real inv_expx = Exp(-x);
-      //x = -1.0 + 2.0 / (1.0 + inv_expx * inv_expx);
-    //} else {
-      //Real expx = Exp(x);
-      //x = 1.0 - 2.0 / (1.0 + expx * expx);
-    //}
-    //data_[i] = x;
-  //}
+// template<typename Real>
+// void VectorBase<Real>::Tanh(const VectorBase<Real> &src) {
+// KALDI_ASSERT(dim_ == src.dim_);
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// Real x = src.data_[i];
+// if (x > 0.0) {
+// Real inv_expx = Exp(-x);
+// x = -1.0 + 2.0 / (1.0 + inv_expx * inv_expx);
+//} else {
+// Real expx = Exp(x);
+// x = 1.0 - 2.0 / (1.0 + expx * expx);
+//}
+// data_[i] = x;
+//}
 //}
 //#endif
 
 //#ifdef HAVE_MKL
 //// Implementing sigmoid based on tanh.
-//template<>
-//void VectorBase<float>::Sigmoid(const VectorBase<float> &src) {
-  //KALDI_ASSERT(dim_ == src.dim_);
-  //this->CopyFromVec(src);
-  //this->Scale(0.5);
-  //vsTanh(dim_, data_, data_);
-  //this->Add(1.0);
-  //this->Scale(0.5);
-//}
-//template<>
-//void VectorBase<double>::Sigmoid(const VectorBase<double> &src) {
-  //KALDI_ASSERT(dim_ == src.dim_);
-  //this->CopyFromVec(src);
-  //this->Scale(0.5);
-  //vdTanh(dim_, data_, data_);
-  //this->Add(1.0);
-  //this->Scale(0.5);
+// template<>
+// void VectorBase<float>::Sigmoid(const VectorBase<float> &src) {
+// KALDI_ASSERT(dim_ == src.dim_);
+// this->CopyFromVec(src);
+// this->Scale(0.5);
+// vsTanh(dim_, data_, data_);
+// this->Add(1.0);
+// this->Scale(0.5);
+//}
+// template<>
+// void VectorBase<double>::Sigmoid(const VectorBase<double> &src) {
+// KALDI_ASSERT(dim_ == src.dim_);
+// this->CopyFromVec(src);
+// this->Scale(0.5);
+// vdTanh(dim_, data_, data_);
+// this->Add(1.0);
+// this->Scale(0.5);
 //}
 //#else
-//template<typename Real>
-//void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) {
-  //KALDI_ASSERT(dim_ == src.dim_);
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //Real x = src.data_[i];
-    //// We aim to avoid floating-point overflow here.
-    //if (x > 0.0) {
-      //x = 1.0 / (1.0 + Exp(-x));
-    //} else {
-      //Real ex = Exp(x);
-      //x = ex / (ex + 1.0);
-    //}
-    //data_[i] = x;
-  //}
+// template<typename Real>
+// void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) {
+// KALDI_ASSERT(dim_ == src.dim_);
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// Real x = src.data_[i];
+//// We aim to avoid floating-point overflow here.
+// if (x > 0.0) {
+// x = 1.0 / (1.0 + Exp(-x));
+//} else {
+// Real ex = Exp(x);
+// x = ex / (ex + 1.0);
+//}
+// data_[i] = x;
+//}
 //}
 //#endif
 
 
-//template<typename Real>
-//void VectorBase<Real>::Add(Real c) {
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //data_[i] += c;
-  //}
+// template<typename Real>
+// void VectorBase<Real>::Add(Real c) {
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] += c;
+//}
 //}
 
-//template<typename Real>
-//void VectorBase<Real>::Scale(Real alpha) {
-  //cblas_Xscal(dim_, alpha, data_, 1);
+// template<typename Real>
+// void VectorBase<Real>::Scale(Real alpha) {
+// cblas_Xscal(dim_, alpha, data_, 1);
 //}
 
-//template<typename Real>
-//void VectorBase<Real>::MulElements(const VectorBase<Real> &v) {
-  //KALDI_ASSERT(dim_ == v.dim_);
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //data_[i] *= v.data_[i];
-  //}
+// template<typename Real>
+// void VectorBase<Real>::MulElements(const VectorBase<Real> &v) {
+// KALDI_ASSERT(dim_ == v.dim_);
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] *= v.data_[i];
+//}
 //}
 
-//template<typename Real>  // Set each element to y = (x == orig ? changed : x).
-//void VectorBase<Real>::ReplaceValue(Real orig, Real changed) {
-  //Real *data = data_;
-  //for (MatrixIndexT i = 0; i < dim_; i++)
-    //if (data[i] == orig) data[i] = changed;
+// template<typename Real>  // Set each element to y = (x == orig ? changed :
+// x).
+// void VectorBase<Real>::ReplaceValue(Real orig, Real changed) {
+// Real *data = data_;
+// for (MatrixIndexT i = 0; i < dim_; i++)
+// if (data[i] == orig) data[i] = changed;
 //}
 
 
-//template<typename Real>
-//template<typename OtherReal>
-//void VectorBase<Real>::MulElements(const VectorBase<OtherReal> &v) {
-  //KALDI_ASSERT(dim_ == v.Dim());
-  //const OtherReal *other_ptr = v.Data();
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //data_[i] *= other_ptr[i];
-  //}
+// template<typename Real>
+// template<typename OtherReal>
+// void VectorBase<Real>::MulElements(const VectorBase<OtherReal> &v) {
+// KALDI_ASSERT(dim_ == v.Dim());
+// const OtherReal *other_ptr = v.Data();
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] *= other_ptr[i];
+//}
 //}
 //// instantiate template.
-//template
-//void VectorBase<float>::MulElements(const VectorBase<double> &v);
-//template
-//void VectorBase<double>::MulElements(const VectorBase<float> &v);
-
-
-//template<typename Real>
-//void VectorBase<Real>::AddVecVec(Real alpha, const VectorBase<Real> &v,
-                                 //const VectorBase<Real> &r, Real beta) {
-  //KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_);
-  //// We pretend that v is a band-diagonal matrix.
-  //KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_);
-  //cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1,
-              //r.data_, 1, beta, this->data_, 1);
+// template
+// void VectorBase<float>::MulElements(const VectorBase<double> &v);
+// template
+// void VectorBase<double>::MulElements(const VectorBase<float> &v);
+
+
+// template<typename Real>
+// void VectorBase<Real>::AddVecVec(Real alpha, const VectorBase<Real> &v,
+// const VectorBase<Real> &r, Real beta) {
+// KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_);
+//// We pretend that v is a band-diagonal matrix.
+// KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_);
+// cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1,
+// r.data_, 1, beta, this->data_, 1);
 //}
 
 
-//template<typename Real>
-//void VectorBase<Real>::DivElements(const VectorBase<Real> &v) {
-  //KALDI_ASSERT(dim_ == v.dim_);
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //data_[i] /= v.data_[i];
-  //}
+// template<typename Real>
+// void VectorBase<Real>::DivElements(const VectorBase<Real> &v) {
+// KALDI_ASSERT(dim_ == v.dim_);
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] /= v.data_[i];
+//}
 //}
 
-//template<typename Real>
-//template<typename OtherReal>
-//void VectorBase<Real>::DivElements(const VectorBase<OtherReal> &v) {
-  //KALDI_ASSERT(dim_ == v.Dim());
-  //const OtherReal *other_ptr = v.Data();
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //data_[i] /= other_ptr[i];
-  //}
+// template<typename Real>
+// template<typename OtherReal>
+// void VectorBase<Real>::DivElements(const VectorBase<OtherReal> &v) {
+// KALDI_ASSERT(dim_ == v.Dim());
+// const OtherReal *other_ptr = v.Data();
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] /= other_ptr[i];
+//}
 //}
 //// instantiate template.
-//template
-//void VectorBase<float>::DivElements(const VectorBase<double> &v);
-//template
-//void VectorBase<double>::DivElements(const VectorBase<float> &v);
-
-//template<typename Real>
-//void VectorBase<Real>::AddVecDivVec(Real alpha, const VectorBase<Real> &v,
-                                    //const VectorBase<Real> &rr, Real beta) {
-  //KALDI_ASSERT((dim_ == v.dim_ && dim_ == rr.dim_));
-  //for (MatrixIndexT i = 0; i < dim_; i++) {
-    //data_[i] = alpha * v.data_[i]/rr.data_[i] + beta * data_[i] ;
-  //}
-//}
-
-//template<typename Real>
-//template<typename OtherReal>
-//void VectorBase<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v) {
-  //KALDI_ASSERT(dim_ == v.dim_);
-  //// remove __restrict__ if it causes compilation problems.
-  //Real *__restrict__ data = data_;
-  //OtherReal *__restrict__ other_data = v.data_;
-  //MatrixIndexT dim = dim_;
-  //if (alpha != 1.0)
-    //for (MatrixIndexT i = 0; i < dim; i++)
-      //data[i] += alpha * other_data[i];
-  //else
-    //for (MatrixIndexT i = 0; i < dim; i++)
-      //data[i] += other_data[i];
-//}
-
-//template
-//void VectorBase<float>::AddVec(const float alpha, const VectorBase<double> &v);
-//template
-//void VectorBase<double>::AddVec(const double alpha, const VectorBase<float> &v);
-
-//template<typename Real>
-//template<typename OtherReal>
-//void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal> &v) {
-  //KALDI_ASSERT(dim_ == v.dim_);
-  //// remove __restrict__ if it causes compilation problems.
-  //Real *__restrict__ data = data_;
-  //OtherReal *__restrict__ other_data = v.data_;
-  //MatrixIndexT dim = dim_;
-  //if (alpha != 1.0)
-    //for (MatrixIndexT i = 0; i < dim; i++)
-      //data[i] += alpha * other_data[i] * other_data[i];
-  //else
-    //for (MatrixIndexT i = 0; i < dim; i++)
-      //data[i] += other_data[i] * other_data[i];
-//}
-
-//template
-//void VectorBase<float>::AddVec2(const float alpha, const VectorBase<double> &v);
-//template
-//void VectorBase<double>::AddVec2(const double alpha, const VectorBase<float> &v);
+// template
+// void VectorBase<float>::DivElements(const VectorBase<double> &v);
+// template
+// void VectorBase<double>::DivElements(const VectorBase<float> &v);
+
+// template<typename Real>
+// void VectorBase<Real>::AddVecDivVec(Real alpha, const VectorBase<Real> &v,
+// const VectorBase<Real> &rr, Real beta) {
+// KALDI_ASSERT((dim_ == v.dim_ && dim_ == rr.dim_));
+// for (MatrixIndexT i = 0; i < dim_; i++) {
+// data_[i] = alpha * v.data_[i]/rr.data_[i] + beta * data_[i] ;
+//}
+//}
 
+// template<typename Real>
+// template<typename OtherReal>
+// void VectorBase<Real>::AddVec(const Real alpha, const VectorBase<OtherReal>
+// &v) {
+// KALDI_ASSERT(dim_ == v.dim_);
+//// remove __restrict__ if it causes compilation problems.
+// Real *__restrict__ data = data_;
+// OtherReal *__restrict__ other_data = v.data_;
+// MatrixIndexT dim = dim_;
+// if (alpha != 1.0)
+// for (MatrixIndexT i = 0; i < dim; i++)
+// data[i] += alpha * other_data[i];
+// else
+// for (MatrixIndexT i = 0; i < dim; i++)
+// data[i] += other_data[i];
+//}
 
-template<typename Real>
-void VectorBase<Real>::Read(std::istream &is,  bool binary) {
-  //  In order to avoid rewriting this, we just declare a Vector and
-  // use it to read the data, then copy.
-  Vector<Real> tmp;
-  tmp.Read(is, binary);
-  if (tmp.Dim() != Dim())
-    KALDI_ERR << "VectorBase<Real>::Read, size mismatch "
-              << Dim() << " vs. " << tmp.Dim();
-  CopyFromVec(tmp);
+// template
+// void VectorBase<float>::AddVec(const float alpha, const VectorBase<double>
+// &v);
+// template
+// void VectorBase<double>::AddVec(const double alpha, const VectorBase<float>
+// &v);
+
+// template<typename Real>
+// template<typename OtherReal>
+// void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal>
+// &v) {
+// KALDI_ASSERT(dim_ == v.dim_);
+//// remove __restrict__ if it causes compilation problems.
+// Real *__restrict__ data = data_;
+// OtherReal *__restrict__ other_data = v.data_;
+// MatrixIndexT dim = dim_;
+// if (alpha != 1.0)
+// for (MatrixIndexT i = 0; i < dim; i++)
+// data[i] += alpha * other_data[i] * other_data[i];
+// else
+// for (MatrixIndexT i = 0; i < dim; i++)
+// data[i] += other_data[i] * other_data[i];
+//}
+
+// template
+// void VectorBase<float>::AddVec2(const float alpha, const VectorBase<double>
+// &v);
+// template
+// void VectorBase<double>::AddVec2(const double alpha, const VectorBase<float>
+// &v);
+
+
+template <typename Real>
+void VectorBase<Real>::Read(std::istream &is, bool binary) {
+    //  In order to avoid rewriting this, we just declare a Vector and
+    // use it to read the data, then copy.
+    Vector<Real> tmp;
+    tmp.Read(is, binary);
+    if (tmp.Dim() != Dim())
+        KALDI_ERR << "VectorBase<Real>::Read, size mismatch " << Dim()
+                  << " vs. " << tmp.Dim();
+    CopyFromVec(tmp);
 }
 
 
-template<typename Real>
-void Vector<Real>::Read(std::istream &is,  bool binary) {
-  std::ostringstream specific_error;
-  MatrixIndexT pos_at_start = is.tellg();
-
-  if (binary) {
-    int peekval = Peek(is, binary);
-    const char *my_token =  (sizeof(Real) == 4 ? "FV" : "DV");
-    char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F');
-    if (peekval == other_token_start) {  // need to instantiate the other type to read it.
-      typedef typename OtherReal<Real>::Real OtherType;  // if Real == float, OtherType == double, and vice versa.
-      Vector<OtherType> other(this->Dim());
-      other.Read(is, binary);  // add is false at this point.
-      if (this->Dim() != other.Dim()) this->Resize(other.Dim());
-      this->CopyFromVec(other);
-      return;
-    }
-    std::string token;
-    ReadToken(is, binary, &token);
-    if (token != my_token) {
-      if (token.length() > 20) token = token.substr(0, 17) + "...";
-      specific_error << ": Expected token " << my_token << ", got " << token;
-      goto bad;
-    }
-    int32 size;
-    ReadBasicType(is, binary, &size);  // throws on error.
-    if ((MatrixIndexT)size != this->Dim())  this->Resize(size);
-    if (size > 0)
-      is.read(reinterpret_cast<char*>(this->data_), sizeof(Real)*size);
-    if (is.fail()) {
-      specific_error << "Error reading vector data (binary mode); truncated "
-          "stream? (size = " << size << ")";
-      goto bad;
-    }
-    return;
-  } else {  // Text mode reading; format is " [ 1.1 2.0 3.4 ]\n"
-    std::string s;
-    is >> s;
-    // if ((s.compare("DV") == 0) || (s.compare("FV") == 0)) {  // Back compatibility.
-    //  is >> s;  // get dimension
-    //  is >> s;  // get "["
-    // }
-    if (is.fail()) { specific_error << "EOF while trying to read vector."; goto bad; }
-    if (s.compare("[]") == 0) { Resize(0); return; } // tolerate this variant.
-    if (s.compare("[")) {
-      if (s.length() > 20) s = s.substr(0, 17) + "...";
-      specific_error << "Expected \"[\" but got " << s;
-      goto bad;
-    }
-    std::vector<Real> data;
-    while (1) {
-      int i = is.peek();
-      if (i == '-' || (i >= '0' && i <= '9')) {  // common cases first.
-        Real r;
-        is >> r;
-        if (is.fail()) { specific_error << "Failed to read number."; goto bad; }
-        if (! std::isspace(is.peek()) && is.peek() != ']') {
-          specific_error << "Expected whitespace after number."; goto bad;
+template <typename Real>
+void Vector<Real>::Read(std::istream &is, bool binary) {
+    std::ostringstream specific_error;
+    MatrixIndexT pos_at_start = is.tellg();
+
+    if (binary) {
+        int peekval = Peek(is, binary);
+        const char *my_token = (sizeof(Real) == 4 ? "FV" : "DV");
+        char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F');
+        if (peekval == other_token_start) {  // need to instantiate the other
+                                             // type to read it.
+            typedef typename OtherReal<Real>::Real OtherType;  // if Real ==
+                                                               // float,
+                                                               // OtherType ==
+                                                               // double, and
+                                                               // vice versa.
+            Vector<OtherType> other(this->Dim());
+            other.Read(is, binary);  // add is false at this point.
+            if (this->Dim() != other.Dim()) this->Resize(other.Dim());
+            this->CopyFromVec(other);
+            return;
+        }
+        std::string token;
+        ReadToken(is, binary, &token);
+        if (token != my_token) {
+            if (token.length() > 20) token = token.substr(0, 17) + "...";
+            specific_error << ": Expected token " << my_token << ", got "
+                           << token;
+            goto bad;
         }
-        data.push_back(r);
-        // But don't eat whitespace... we want to check that it's not newlines
-        // which would be valid only for a matrix.
-      } else if (i == ' ' || i == '\t') {
-        is.get();
-      } else if (i == ']') {
-        is.get();  // eat the ']'
-        this->Resize(data.size());
-        for (size_t j = 0; j < data.size(); j++)
-          this->data_[j] = data[j];
-        i = is.peek();
-        if (static_cast<char>(i) == '\r') {
-          is.get();
-          is.get();  // get \r\n (must eat what we wrote)
-        } else if (static_cast<char>(i) == '\n') { is.get(); } // get \n (must eat what we wrote)
+        int32 size;
+        ReadBasicType(is, binary, &size);  // throws on error.
+        if ((MatrixIndexT)size != this->Dim()) this->Resize(size);
+        if (size > 0)
+            is.read(reinterpret_cast<char *>(this->data_), sizeof(Real) * size);
         if (is.fail()) {
-          KALDI_WARN << "After end of vector data, read error.";
-          // we got the data we needed, so just warn for this error.
+            specific_error
+                << "Error reading vector data (binary mode); truncated "
+                   "stream? (size = "
+                << size << ")";
+            goto bad;
         }
-        return;  // success.
-      } else if (i == -1) {
-        specific_error << "EOF while reading vector data.";
-        goto bad;
-      } else if (i == '\n' || i == '\r') {
-        specific_error << "Newline found while reading vector (maybe it's a matrix?)";
-        goto bad;
-      } else {
-        is >> s;  // read string.
-        if (!KALDI_STRCASECMP(s.c_str(), "inf") ||
-            !KALDI_STRCASECMP(s.c_str(), "infinity")) {
-          data.push_back(std::numeric_limits<Real>::infinity());
-          KALDI_WARN << "Reading infinite value into vector.";
-        } else if (!KALDI_STRCASECMP(s.c_str(), "nan")) {
-          data.push_back(std::numeric_limits<Real>::quiet_NaN());
-          KALDI_WARN << "Reading NaN value into vector.";
-        } else {
-          if (s.length() > 20) s = s.substr(0, 17) + "...";
-          specific_error << "Expecting numeric vector data, got " << s;
-          goto  bad;
+        return;
+    } else {  // Text mode reading; format is " [ 1.1 2.0 3.4 ]\n"
+        std::string s;
+        is >> s;
+        // if ((s.compare("DV") == 0) || (s.compare("FV") == 0)) {  // Back
+        // compatibility.
+        //  is >> s;  // get dimension
+        //  is >> s;  // get "["
+        // }
+        if (is.fail()) {
+            specific_error << "EOF while trying to read vector.";
+            goto bad;
+        }
+        if (s.compare("[]") == 0) {
+            Resize(0);
+            return;
+        }  // tolerate this variant.
+        if (s.compare("[")) {
+            if (s.length() > 20) s = s.substr(0, 17) + "...";
+            specific_error << "Expected \"[\" but got " << s;
+            goto bad;
+        }
+        std::vector<Real> data;
+        while (1) {
+            int i = is.peek();
+            if (i == '-' || (i >= '0' && i <= '9')) {  // common cases first.
+                Real r;
+                is >> r;
+                if (is.fail()) {
+                    specific_error << "Failed to read number.";
+                    goto bad;
+                }
+                if (!std::isspace(is.peek()) && is.peek() != ']') {
+                    specific_error << "Expected whitespace after number.";
+                    goto bad;
+                }
+                data.push_back(r);
+                // But don't eat whitespace... we want to check that it's not
+                // newlines
+                // which would be valid only for a matrix.
+            } else if (i == ' ' || i == '\t') {
+                is.get();
+            } else if (i == ']') {
+                is.get();  // eat the ']'
+                this->Resize(data.size());
+                for (size_t j = 0; j < data.size(); j++)
+                    this->data_[j] = data[j];
+                i = is.peek();
+                if (static_cast<char>(i) == '\r') {
+                    is.get();
+                    is.get();  // get \r\n (must eat what we wrote)
+                } else if (static_cast<char>(i) == '\n') {
+                    is.get();
+                }  // get \n (must eat what we wrote)
+                if (is.fail()) {
+                    KALDI_WARN << "After end of vector data, read error.";
+                    // we got the data we needed, so just warn for this error.
+                }
+                return;  // success.
+            } else if (i == -1) {
+                specific_error << "EOF while reading vector data.";
+                goto bad;
+            } else if (i == '\n' || i == '\r') {
+                specific_error << "Newline found while reading vector (maybe "
+                                  "it's a matrix?)";
+                goto bad;
+            } else {
+                is >> s;  // read string.
+                if (!KALDI_STRCASECMP(s.c_str(), "inf") ||
+                    !KALDI_STRCASECMP(s.c_str(), "infinity")) {
+                    data.push_back(std::numeric_limits<Real>::infinity());
+                    KALDI_WARN << "Reading infinite value into vector.";
+                } else if (!KALDI_STRCASECMP(s.c_str(), "nan")) {
+                    data.push_back(std::numeric_limits<Real>::quiet_NaN());
+                    KALDI_WARN << "Reading NaN value into vector.";
+                } else {
+                    if (s.length() > 20) s = s.substr(0, 17) + "...";
+                    specific_error << "Expecting numeric vector data, got "
+                                   << s;
+                    goto bad;
+                }
+            }
         }
-      }
     }
-  }
-  // we never reach this line (the while loop returns directly).
+// we never reach this line (the while loop returns directly).
 bad:
-  KALDI_ERR << "Failed to read vector from stream.  " << specific_error.str()
-            << " File position at start is "
-            << pos_at_start<<", currently "<<is.tellg();
+    KALDI_ERR << "Failed to read vector from stream.  " << specific_error.str()
+              << " File position at start is " << pos_at_start << ", currently "
+              << is.tellg();
 }
 
 
-template<typename Real>
-void VectorBase<Real>::Write(std::ostream & os, bool binary) const {
-  if (!os.good()) {
-    KALDI_ERR << "Failed to write vector to stream: stream not good";
-  }
-  if (binary) {
-    std::string my_token = (sizeof(Real) == 4 ? "FV" : "DV");
-    WriteToken(os, binary, my_token);
-
-    int32 size = Dim();  // make the size 32-bit on disk.
-    KALDI_ASSERT(Dim() == (MatrixIndexT) size);
-    WriteBasicType(os, binary, size);
-    os.write(reinterpret_cast<const char*>(Data()), sizeof(Real) * size);
-  } else {
-    os << " [ ";
-    for (MatrixIndexT i = 0; i < Dim(); i++)
-      os << (*this)(i) << " ";
-    os << "]\n";
-  }
-  if (!os.good())
-    KALDI_ERR << "Failed to write vector to stream";
+template <typename Real>
+void VectorBase<Real>::Write(std::ostream &os, bool binary) const {
+    if (!os.good()) {
+        KALDI_ERR << "Failed to write vector to stream: stream not good";
+    }
+    if (binary) {
+        std::string my_token = (sizeof(Real) == 4 ? "FV" : "DV");
+        WriteToken(os, binary, my_token);
+
+        int32 size = Dim();  // make the size 32-bit on disk.
+        KALDI_ASSERT(Dim() == (MatrixIndexT)size);
+        WriteBasicType(os, binary, size);
+        os.write(reinterpret_cast<const char *>(Data()), sizeof(Real) * size);
+    } else {
+        os << " [ ";
+        for (MatrixIndexT i = 0; i < Dim(); i++) os << (*this)(i) << " ";
+        os << "]\n";
+    }
+    if (!os.good()) KALDI_ERR << "Failed to write vector to stream";
 }
 
 
-//template<typename Real>
-//void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
-  //KALDI_ASSERT(dim_ == v.dim_);
-  //for (MatrixIndexT i = 0; i < dim_; i++)
-    //data_[i] += alpha * v.data_[i] * v.data_[i];
+// template<typename Real>
+// void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
+// KALDI_ASSERT(dim_ == v.dim_);
+// for (MatrixIndexT i = 0; i < dim_; i++)
+// data_[i] += alpha * v.data_[i] * v.data_[i];
 //}
 
 //// this <-- beta*this + alpha*M*v.
-//template<typename Real>
-//void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M,
-                                //const MatrixTransposeType trans,
-                                //const VectorBase<Real> &v,
-                                //const Real beta) {
-  //KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows());
-  //if (beta == 0.0) {
-    //if (&v != this) CopyFromVec(v);
-    //MulTp(M, trans);
-    //if (alpha != 1.0) Scale(alpha);
-  //} else {
-    //Vector<Real> tmp(v);
-    //tmp.MulTp(M, trans);
-    //if (beta != 1.0) Scale(beta);  // *this <-- beta * *this
-    //AddVec(alpha, tmp);          // *this += alpha * M * v
-  //}
-//}
-
-//template<typename Real>
-//Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
-               //const VectorBase<Real> &v2) {
-  //KALDI_ASSERT(v1.Dim() == M.NumRows() && v2.Dim() == M.NumCols());
-  //Vector<Real> vtmp(M.NumRows());
-  //vtmp.AddMatVec(1.0, M, kNoTrans, v2, 0.0);
-  //return VecVec(v1, vtmp);
-//}
-
-//template
-//float VecMatVec(const VectorBase<float> &v1, const MatrixBase<float> &M,
-                //const VectorBase<float> &v2);
-//template
-//double VecMatVec(const VectorBase<double> &v1, const MatrixBase<double> &M,
-                 //const VectorBase<double> &v2);
+// template<typename Real>
+// void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M,
+// const MatrixTransposeType trans,
+// const VectorBase<Real> &v,
+// const Real beta) {
+// KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows());
+// if (beta == 0.0) {
+// if (&v != this) CopyFromVec(v);
+// MulTp(M, trans);
+// if (alpha != 1.0) Scale(alpha);
+//} else {
+// Vector<Real> tmp(v);
+// tmp.MulTp(M, trans);
+// if (beta != 1.0) Scale(beta);  // *this <-- beta * *this
+// AddVec(alpha, tmp);          // *this += alpha * M * v
+//}
+//}
 
-template<typename Real>
+// template<typename Real>
+// Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
+// const VectorBase<Real> &v2) {
+// KALDI_ASSERT(v1.Dim() == M.NumRows() && v2.Dim() == M.NumCols());
+// Vector<Real> vtmp(M.NumRows());
+// vtmp.AddMatVec(1.0, M, kNoTrans, v2, 0.0);
+// return VecVec(v1, vtmp);
+//}
+
+// template
+// float VecMatVec(const VectorBase<float> &v1, const MatrixBase<float> &M,
+// const VectorBase<float> &v2);
+// template
+// double VecMatVec(const VectorBase<double> &v1, const MatrixBase<double> &M,
+// const VectorBase<double> &v2);
+
+template <typename Real>
 void Vector<Real>::Swap(Vector<Real> *other) {
-  std::swap(this->data_, other->data_);
-  std::swap(this->dim_, other->dim_);
+    std::swap(this->data_, other->data_);
+    std::swap(this->dim_, other->dim_);
 }
 
 
-//template<typename Real>
-//void VectorBase<Real>::AddDiagMat2(
-    //Real alpha, const MatrixBase<Real> &M,
-    //MatrixTransposeType trans, Real beta) {
-  //if (trans == kNoTrans) {
-    //KALDI_ASSERT(this->dim_ == M.NumRows());
-    //MatrixIndexT rows = this->dim_, cols = M.NumCols(),
-           //mat_stride = M.Stride();
-    //Real *data = this->data_;
-    //const Real *mat_data = M.Data();
-    //for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++)
-      //*data = beta * *data + alpha * cblas_Xdot(cols,mat_data,1,mat_data,1);
-  //} else {
-    //KALDI_ASSERT(this->dim_ == M.NumCols());
-    //MatrixIndexT rows = M.NumRows(), cols = this->dim_,
-           //mat_stride = M.Stride();
-    //Real *data = this->data_;
-    //const Real *mat_data = M.Data();
-    //for (MatrixIndexT i = 0; i < cols; i++, mat_data++, data++)
-      //*data = beta * *data + alpha * cblas_Xdot(rows, mat_data, mat_stride,
-                                                 //mat_data, mat_stride);
-  //}
-//}
-
-//template<typename Real>
-//void VectorBase<Real>::AddDiagMatMat(
-    //Real alpha,
-    //const MatrixBase<Real> &M, MatrixTransposeType transM,
-    //const MatrixBase<Real> &N, MatrixTransposeType transN,
-    //Real beta) {
-  //MatrixIndexT dim = this->dim_,
-      //M_col_dim = (transM == kTrans ? M.NumRows() : M.NumCols()),
-      //N_row_dim = (transN == kTrans ? N.NumCols() : N.NumRows());
-  //KALDI_ASSERT(M_col_dim == N_row_dim); // this is the dimension we sum over
-  //MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
-  //if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
-  //MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1;
-  //if (transN == kTrans) std::swap(N_row_stride, N_col_stride);
-
-  //Real *data = this->data_;
-  //const Real *Mdata = M.Data(), *Ndata = N.Data();
-  //for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride, data++) {
-    //*data = beta * *data + alpha * cblas_Xdot(M_col_dim, Mdata, M_col_stride, Ndata, N_row_stride);
-  //}
+// template<typename Real>
+// void VectorBase<Real>::AddDiagMat2(
+// Real alpha, const MatrixBase<Real> &M,
+// MatrixTransposeType trans, Real beta) {
+// if (trans == kNoTrans) {
+// KALDI_ASSERT(this->dim_ == M.NumRows());
+// MatrixIndexT rows = this->dim_, cols = M.NumCols(),
+// mat_stride = M.Stride();
+// Real *data = this->data_;
+// const Real *mat_data = M.Data();
+// for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++)
+//*data = beta * *data + alpha * cblas_Xdot(cols,mat_data,1,mat_data,1);
+//} else {
+// KALDI_ASSERT(this->dim_ == M.NumCols());
+// MatrixIndexT rows = M.NumRows(), cols = this->dim_,
+// mat_stride = M.Stride();
+// Real *data = this->data_;
+// const Real *mat_data = M.Data();
+// for (MatrixIndexT i = 0; i < cols; i++, mat_data++, data++)
+//*data = beta * *data + alpha * cblas_Xdot(rows, mat_data, mat_stride,
+// mat_data, mat_stride);
+//}
+//}
+
+// template<typename Real>
+// void VectorBase<Real>::AddDiagMatMat(
+// Real alpha,
+// const MatrixBase<Real> &M, MatrixTransposeType transM,
+// const MatrixBase<Real> &N, MatrixTransposeType transN,
+// Real beta) {
+// MatrixIndexT dim = this->dim_,
+// M_col_dim = (transM == kTrans ? M.NumRows() : M.NumCols()),
+// N_row_dim = (transN == kTrans ? N.NumCols() : N.NumRows());
+// KALDI_ASSERT(M_col_dim == N_row_dim); // this is the dimension we sum over
+// MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
+// if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
+// MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1;
+// if (transN == kTrans) std::swap(N_row_stride, N_col_stride);
+
+// Real *data = this->data_;
+// const Real *Mdata = M.Data(), *Ndata = N.Data();
+// for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata +=
+// N_col_stride, data++) {
+//*data = beta * *data + alpha * cblas_Xdot(M_col_dim, Mdata, M_col_stride,
+//Ndata, N_row_stride);
+//}
 //}
 
 
diff --git a/runtime/engine/common/matrix/kaldi-vector.h b/runtime/engine/common/matrix/kaldi-vector.h
index 5bcbeda90..461e026d3 100644
--- a/runtime/engine/common/matrix/kaldi-vector.h
+++ b/runtime/engine/common/matrix/kaldi-vector.h
@@ -37,265 +37,274 @@ namespace kaldi {
 ///  Provides a vector abstraction class.
 ///  This class provides a way to work with vectors in kaldi.
 ///  It encapsulates basic operations and memory optimizations.
-template<typename Real>
+template <typename Real>
 class VectorBase {
- public:
-  /// Set vector to all zeros.
-  void SetZero();
-
-  /// Returns true if matrix is all zeros.
-  bool IsZero(Real cutoff = 1.0e-06) const;     // replace magic number
-
-  /// Set all members of a vector to a specified value.
-  void Set(Real f);
-
-  /// Returns the  dimension of the vector.
-  inline MatrixIndexT Dim() const { return dim_; }
-
-  /// Returns the size in memory of the vector, in bytes.
-  inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }
-
-  /// Returns a pointer to the start of the vector's data.
-  inline Real* Data() { return data_; }
-
-  /// Returns a pointer to the start of the vector's data (const).
-  inline const Real* Data() const { return data_; }
-
-  /// Indexing  operator (const).
-  inline Real operator() (MatrixIndexT i) const {
-    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
-                 static_cast<UnsignedMatrixIndexT>(dim_));
-    return *(data_ + i);
-  }
-
-  /// Indexing operator (non-const).
-  inline Real & operator() (MatrixIndexT i) {
-    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
-                 static_cast<UnsignedMatrixIndexT>(dim_));
-    return *(data_ + i);
-  }
-
-  /** @brief Returns a sub-vector of a vector (a range of elements).
-   *  @param o [in] Origin, 0 < o < Dim()
-   *  @param l [in] Length 0 < l < Dim()-o
-   *  @return A SubVector object that aliases the data of the Vector object.
-   *  See @c SubVector class for details   */
-  SubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
-    return SubVector<Real>(*this, o, l);
-  }
-
-  /** @brief Returns a const sub-vector of a vector (a range of elements).
-   *  @param o [in] Origin, 0 < o < Dim()
-   *  @param l [in] Length 0 < l < Dim()-o
-   *  @return A SubVector object that aliases the data of the Vector object.
-   *  See @c SubVector class for details   */
-  const SubVector<Real> Range(const MatrixIndexT o,
-                              const MatrixIndexT l) const {
-    return SubVector<Real>(*this, o, l);
-  }
-
-  /// Copy data from another vector (must match own size).
-  void CopyFromVec(const VectorBase<Real> &v);
-
-  /// Copy data from another vector of different type (double vs. float)
-  template<typename OtherReal>
-  void CopyFromVec(const VectorBase<OtherReal> &v);
-
-  /// Performs a row stack of the matrix M
-  void CopyRowsFromMat(const MatrixBase<Real> &M);
-  template<typename OtherReal>
-  void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
-
-  /// Performs a column stack of the matrix M
-  void CopyColsFromMat(const MatrixBase<Real> &M);
-
-  /// Extracts a row of the matrix M.  Could also do this with
-  /// this->Copy(M[row]).
-  void CopyRowFromMat(const MatrixBase<Real> &M, MatrixIndexT row);
-  /// Extracts a row of the matrix M with type conversion.
-  template<typename OtherReal>
-  void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);
-
-  /// Extracts a column of the matrix M.
-  template<typename OtherReal>
-  void CopyColFromMat(const MatrixBase<OtherReal> &M , MatrixIndexT col);
-
-  /// Reads from C++ stream (option to add to existing contents).
-  /// Throws exception on failure
-  void Read(std::istream &in, bool binary);
-
-  /// Writes to C++ stream (option to write in binary).
-  void Write(std::ostream &Out, bool binary) const;
-
-  friend class VectorBase<double>;
-  friend class VectorBase<float>;
- protected:
-  /// Destructor;  does not deallocate memory, this is handled by child classes.
-  /// This destructor is protected so this object can only be
-  /// deleted via a child.
-  ~VectorBase() {}
-
-  /// Empty initializer, corresponds to vector of zero size.
-  explicit VectorBase(): data_(NULL), dim_(0) {
-    KALDI_ASSERT_IS_FLOATING_TYPE(Real);
-  }
-
-  /// data memory area
-  Real* data_;
-  /// dimension of vector
-  MatrixIndexT dim_;
-  KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
-}; // class VectorBase
+  public:
+    /// Set vector to all zeros.
+    void SetZero();
+
+    /// Returns true if matrix is all zeros.
+    bool IsZero(Real cutoff = 1.0e-06) const;  // replace magic number
+
+    /// Set all members of a vector to a specified value.
+    void Set(Real f);
+
+    /// Returns the  dimension of the vector.
+    inline MatrixIndexT Dim() const { return dim_; }
+
+    /// Returns the size in memory of the vector, in bytes.
+    inline MatrixIndexT SizeInBytes() const { return (dim_ * sizeof(Real)); }
+
+    /// Returns a pointer to the start of the vector's data.
+    inline Real *Data() { return data_; }
+
+    /// Returns a pointer to the start of the vector's data (const).
+    inline const Real *Data() const { return data_; }
+
+    /// Indexing  operator (const).
+    inline Real operator()(MatrixIndexT i) const {
+        KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                              static_cast<UnsignedMatrixIndexT>(dim_));
+        return *(data_ + i);
+    }
+
+    /// Indexing operator (non-const).
+    inline Real &operator()(MatrixIndexT i) {
+        KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                              static_cast<UnsignedMatrixIndexT>(dim_));
+        return *(data_ + i);
+    }
+
+    /** @brief Returns a sub-vector of a vector (a range of elements).
+     *  @param o [in] Origin, 0 < o < Dim()
+     *  @param l [in] Length 0 < l < Dim()-o
+     *  @return A SubVector object that aliases the data of the Vector object.
+     *  See @c SubVector class for details   */
+    SubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
+        return SubVector<Real>(*this, o, l);
+    }
+
+    /** @brief Returns a const sub-vector of a vector (a range of elements).
+     *  @param o [in] Origin, 0 < o < Dim()
+     *  @param l [in] Length 0 < l < Dim()-o
+     *  @return A SubVector object that aliases the data of the Vector object.
+     *  See @c SubVector class for details   */
+    const SubVector<Real> Range(const MatrixIndexT o,
+                                const MatrixIndexT l) const {
+        return SubVector<Real>(*this, o, l);
+    }
+
+    /// Copy data from another vector (must match own size).
+    void CopyFromVec(const VectorBase<Real> &v);
+
+    /// Copy data from another vector of different type (double vs. float)
+    template <typename OtherReal>
+    void CopyFromVec(const VectorBase<OtherReal> &v);
+
+    /// Performs a row stack of the matrix M
+    void CopyRowsFromMat(const MatrixBase<Real> &M);
+    template <typename OtherReal>
+    void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
+
+    /// Performs a column stack of the matrix M
+    void CopyColsFromMat(const MatrixBase<Real> &M);
+
+    /// Extracts a row of the matrix M.  Could also do this with
+    /// this->Copy(M[row]).
+    void CopyRowFromMat(const MatrixBase<Real> &M, MatrixIndexT row);
+    /// Extracts a row of the matrix M with type conversion.
+    template <typename OtherReal>
+    void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);
+
+    /// Extracts a column of the matrix M.
+    template <typename OtherReal>
+    void CopyColFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT col);
+
+    /// Reads from C++ stream (option to add to existing contents).
+    /// Throws exception on failure
+    void Read(std::istream &in, bool binary);
+
+    /// Writes to C++ stream (option to write in binary).
+    void Write(std::ostream &Out, bool binary) const;
+
+    friend class VectorBase<double>;
+    friend class VectorBase<float>;
+
+  protected:
+    /// Destructor;  does not deallocate memory, this is handled by child
+    /// classes.
+    /// This destructor is protected so this object can only be
+    /// deleted via a child.
+    ~VectorBase() {}
+
+    /// Empty initializer, corresponds to vector of zero size.
+    explicit VectorBase() : data_(NULL), dim_(0) {
+        KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+    }
+
+    /// data memory area
+    Real *data_;
+    /// dimension of vector
+    MatrixIndexT dim_;
+    KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
+};  // class VectorBase
 
 /** @brief A class representing a vector.
  *
  *  This class provides a way to work with vectors in kaldi.
  *  It encapsulates basic operations and memory optimizations.  */
-template<typename Real>
-class Vector: public VectorBase<Real> {
- public:
-  /// Constructor that takes no arguments.  Initializes to empty.
-  Vector(): VectorBase<Real>() {}
-
-  /// Constructor with specific size.  Sets to all-zero by default
-  /// if set_zero == false, memory contents are undefined.
-  explicit Vector(const MatrixIndexT s,
-                  MatrixResizeType resize_type = kSetZero)
-      : VectorBase<Real>() {  Resize(s, resize_type);  }
-
-  /// Copy constructor from CUDA vector
-  /// This is defined in ../cudamatrix/cu-vector.h
-  //template<typename OtherReal>
-  //explicit Vector(const CuVectorBase<OtherReal> &cu);
-
-  /// Copy constructor.  The need for this is controversial.
-  Vector(const Vector<Real> &v) : VectorBase<Real>()  { //  (cannot be explicit)
-    Resize(v.Dim(), kUndefined);
-    this->CopyFromVec(v);
-  }
-
-  /// Copy-constructor from base-class, needed to copy from SubVector.
-  explicit Vector(const VectorBase<Real> &v) : VectorBase<Real>() {
-    Resize(v.Dim(), kUndefined);
-    this->CopyFromVec(v);
-  }
-
-  /// Type conversion constructor.
-  template<typename OtherReal>
-  explicit Vector(const VectorBase<OtherReal> &v): VectorBase<Real>() {
-    Resize(v.Dim(), kUndefined);
-    this->CopyFromVec(v);
-  }
-
-// Took this out since it is unsafe : Arnab
-//  /// Constructor from a pointer and a size; copies the data to a location
-//  /// it owns.
-//  Vector(const Real* Data, const MatrixIndexT s): VectorBase<Real>() {
-//    Resize(s);
-  //    CopyFromPtr(Data, s);
-//  }
-
-
-  /// Swaps the contents of *this and *other.  Shallow swap.
-  void Swap(Vector<Real> *other);
-
-  /// Destructor.  Deallocates memory.
-  ~Vector() { Destroy(); }
-
-  /// Read function using C++ streams.  Can also add to existing contents
-  /// of matrix.
-  void Read(std::istream &in, bool binary);
-
-  /// Set vector to a specified size (can be zero).
-  /// The value of the new data depends on resize_type:
-  ///   -if kSetZero, the new data will be zero
-  ///   -if kUndefined, the new data will be undefined
-  ///   -if kCopyData, the new data will be the same as the old data in any
-  ///      shared positions, and zero elsewhere.
-  /// This function takes time proportional to the number of data elements.
-  void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero);
-
-  /// Remove one element and shifts later elements down.
-  void RemoveElement(MatrixIndexT i);
-
-  /// Assignment operator.
-  Vector<Real> &operator = (const Vector<Real> &other) {
-    Resize(other.Dim(), kUndefined);
-    this->CopyFromVec(other);
-    return *this;
-  }
-
-  /// Assignment operator that takes VectorBase.
-  Vector<Real> &operator = (const VectorBase<Real> &other) {
-    Resize(other.Dim(), kUndefined);
-    this->CopyFromVec(other);
-    return *this;
-  }
- private:
-  /// Init assumes the current contents of the class are invalid (i.e. junk or
-  /// has already been freed), and it sets the vector to newly allocated memory
-  /// with the specified dimension.  dim == 0 is acceptable.  The memory contents
-  /// pointed to by data_ will be undefined.
-  void Init(const MatrixIndexT dim);
-
-  /// Destroy function, called internally.
-  void Destroy();
-
+template <typename Real>
+class Vector : public VectorBase<Real> {
+  public:
+    /// Constructor that takes no arguments.  Initializes to empty.
+    Vector() : VectorBase<Real>() {}
+
+    /// Constructor with specific size.  Sets to all-zero by default
+    /// if set_zero == false, memory contents are undefined.
+    explicit Vector(const MatrixIndexT s,
+                    MatrixResizeType resize_type = kSetZero)
+        : VectorBase<Real>() {
+        Resize(s, resize_type);
+    }
+
+    /// Copy constructor from CUDA vector
+    /// This is defined in ../cudamatrix/cu-vector.h
+    // template<typename OtherReal>
+    // explicit Vector(const CuVectorBase<OtherReal> &cu);
+
+    /// Copy constructor.  The need for this is controversial.
+    Vector(const Vector<Real> &v)
+        : VectorBase<Real>() {  //  (cannot be explicit)
+        Resize(v.Dim(), kUndefined);
+        this->CopyFromVec(v);
+    }
+
+    /// Copy-constructor from base-class, needed to copy from SubVector.
+    explicit Vector(const VectorBase<Real> &v) : VectorBase<Real>() {
+        Resize(v.Dim(), kUndefined);
+        this->CopyFromVec(v);
+    }
+
+    /// Type conversion constructor.
+    template <typename OtherReal>
+    explicit Vector(const VectorBase<OtherReal> &v) : VectorBase<Real>() {
+        Resize(v.Dim(), kUndefined);
+        this->CopyFromVec(v);
+    }
+
+    // Took this out since it is unsafe : Arnab
+    //  /// Constructor from a pointer and a size; copies the data to a location
+    //  /// it owns.
+    //  Vector(const Real* Data, const MatrixIndexT s): VectorBase<Real>() {
+    //    Resize(s);
+    //    CopyFromPtr(Data, s);
+    //  }
+
+
+    /// Swaps the contents of *this and *other.  Shallow swap.
+    void Swap(Vector<Real> *other);
+
+    /// Destructor.  Deallocates memory.
+    ~Vector() { Destroy(); }
+
+    /// Read function using C++ streams.  Can also add to existing contents
+    /// of matrix.
+    void Read(std::istream &in, bool binary);
+
+    /// Set vector to a specified size (can be zero).
+    /// The value of the new data depends on resize_type:
+    ///   -if kSetZero, the new data will be zero
+    ///   -if kUndefined, the new data will be undefined
+    ///   -if kCopyData, the new data will be the same as the old data in any
+    ///      shared positions, and zero elsewhere.
+    /// This function takes time proportional to the number of data elements.
+    void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero);
+
+    /// Remove one element and shifts later elements down.
+    void RemoveElement(MatrixIndexT i);
+
+    /// Assignment operator.
+    Vector<Real> &operator=(const Vector<Real> &other) {
+        Resize(other.Dim(), kUndefined);
+        this->CopyFromVec(other);
+        return *this;
+    }
+
+    /// Assignment operator that takes VectorBase.
+    Vector<Real> &operator=(const VectorBase<Real> &other) {
+        Resize(other.Dim(), kUndefined);
+        this->CopyFromVec(other);
+        return *this;
+    }
+
+  private:
+    /// Init assumes the current contents of the class are invalid (i.e. junk or
+    /// has already been freed), and it sets the vector to newly allocated
+    /// memory
+    /// with the specified dimension.  dim == 0 is acceptable.  The memory
+    /// contents
+    /// pointed to by data_ will be undefined.
+    void Init(const MatrixIndexT dim);
+
+    /// Destroy function, called internally.
+    void Destroy();
 };
 
 
 /// Represents a non-allocating general vector which can be defined
 /// as a sub-vector of higher-level vector [or as the row of a matrix].
-template<typename Real>
+template <typename Real>
 class SubVector : public VectorBase<Real> {
- public:
-  /// Constructor from a Vector or SubVector.
-  /// SubVectors are not const-safe and it's very hard to make them
-  /// so for now we just give up.  This function contains const_cast.
-  SubVector(const VectorBase<Real> &t, const MatrixIndexT origin,
-            const MatrixIndexT length) : VectorBase<Real>() {
-    // following assert equiv to origin>=0 && length>=0 &&
-    // origin+length <= rt.dim_
-    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
-                 static_cast<UnsignedMatrixIndexT>(length) <=
-                 static_cast<UnsignedMatrixIndexT>(t.Dim()));
-    VectorBase<Real>::data_ = const_cast<Real*> (t.Data()+origin);
-    VectorBase<Real>::dim_   = length;
-  }
-
-  /// This constructor initializes the vector to point at the contents
-  /// of this packed matrix (SpMatrix or TpMatrix).
- // SubVector(const PackedMatrix<Real> &M) {
-    //VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
-    //VectorBase<Real>::dim_   = (M.NumRows()*(M.NumRows()+1))/2;
-  //}
-
-  /// Copy constructor
-  SubVector(const SubVector &other) : VectorBase<Real> () {
-    // this copy constructor needed for Range() to work in base class.
-    VectorBase<Real>::data_ = other.data_;
-    VectorBase<Real>::dim_ = other.dim_;
-  }
-
-  /// Constructor from a pointer to memory and a length.  Keeps a pointer
-  /// to the data but does not take ownership (will never delete).
-  /// Caution: this constructor enables you to evade const constraints.
-  SubVector(const Real *data, MatrixIndexT length) : VectorBase<Real> () {
-    VectorBase<Real>::data_ = const_cast<Real*>(data);
-    VectorBase<Real>::dim_   = length;
-  }
-
-  /// This operation does not preserve const-ness, so be careful.
-  SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row) {
-    VectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
-    VectorBase<Real>::dim_   = matrix.NumCols();
-  }
-
-  ~SubVector() {}  ///< Destructor (does nothing; no pointers are owned here).
-
- private:
-  /// Disallow assignment operator.
-  SubVector & operator = (const SubVector &other) {}
+  public:
+    /// Constructor from a Vector or SubVector.
+    /// SubVectors are not const-safe and it's very hard to make them
+    /// so for now we just give up.  This function contains const_cast.
+    SubVector(const VectorBase<Real> &t,
+              const MatrixIndexT origin,
+              const MatrixIndexT length)
+        : VectorBase<Real>() {
+        // following assert equiv to origin>=0 && length>=0 &&
+        // origin+length <= rt.dim_
+        KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin) +
+                         static_cast<UnsignedMatrixIndexT>(length) <=
+                     static_cast<UnsignedMatrixIndexT>(t.Dim()));
+        VectorBase<Real>::data_ = const_cast<Real *>(t.Data() + origin);
+        VectorBase<Real>::dim_ = length;
+    }
+
+    /// This constructor initializes the vector to point at the contents
+    /// of this packed matrix (SpMatrix or TpMatrix).
+    // SubVector(const PackedMatrix<Real> &M) {
+    // VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
+    // VectorBase<Real>::dim_   = (M.NumRows()*(M.NumRows()+1))/2;
+    //}
+
+    /// Copy constructor
+    SubVector(const SubVector &other) : VectorBase<Real>() {
+        // this copy constructor needed for Range() to work in base class.
+        VectorBase<Real>::data_ = other.data_;
+        VectorBase<Real>::dim_ = other.dim_;
+    }
+
+    /// Constructor from a pointer to memory and a length.  Keeps a pointer
+    /// to the data but does not take ownership (will never delete).
+    /// Caution: this constructor enables you to evade const constraints.
+    SubVector(const Real *data, MatrixIndexT length) : VectorBase<Real>() {
+        VectorBase<Real>::data_ = const_cast<Real *>(data);
+        VectorBase<Real>::dim_ = length;
+    }
+
+    /// This operation does not preserve const-ness, so be careful.
+    SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row) {
+        VectorBase<Real>::data_ = const_cast<Real *>(matrix.RowData(row));
+        VectorBase<Real>::dim_ = matrix.NumCols();
+    }
+
+    ~SubVector() {}  ///< Destructor (does nothing; no pointers are owned here).
+
+  private:
+    /// Disallow assignment operator.
+    SubVector &operator=(const SubVector &other) {}
 };
 
 /// @} end of "addtogroup matrix_group"
@@ -303,43 +312,41 @@ class SubVector : public VectorBase<Real> {
 /// @{
 /// Output to a C++ stream.  Non-binary by default (use Write for
 /// binary output).
-template<typename Real>
-std::ostream & operator << (std::ostream & out, const VectorBase<Real> & v);
+template <typename Real>
+std::ostream &operator<<(std::ostream &out, const VectorBase<Real> &v);
 
 /// Input from a C++ stream.  Will automatically read text or
 /// binary data from the stream.
-template<typename Real>
-std::istream & operator >> (std::istream & in, VectorBase<Real> & v);
+template <typename Real>
+std::istream &operator>>(std::istream &in, VectorBase<Real> &v);
 
 /// Input from a C++ stream. Will automatically read text or
 /// binary data from the stream.
-template<typename Real>
-std::istream & operator >> (std::istream & in, Vector<Real> & v);
+template <typename Real>
+std::istream &operator>>(std::istream &in, Vector<Real> &v);
 /// @} end of \addtogroup matrix_funcs_io
 
 /// \addtogroup matrix_funcs_scalar
 /// @{
 
 
-//template<typename Real>
-//bool ApproxEqual(const VectorBase<Real> &a,
-                 //const VectorBase<Real> &b, Real tol = 0.01) {
-  //return a.ApproxEqual(b, tol);
+// template<typename Real>
+// bool ApproxEqual(const VectorBase<Real> &a,
+// const VectorBase<Real> &b, Real tol = 0.01) {
+// return a.ApproxEqual(b, tol);
 //}
 
-//template<typename Real>
-//inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
-                        //float tol = 0.01) {
-  //KALDI_ASSERT(a.ApproxEqual(b, tol));
+// template<typename Real>
+// inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
+// float tol = 0.01) {
+// KALDI_ASSERT(a.ApproxEqual(b, tol));
 //}
 
 
-
 }  // namespace kaldi
 
 // we need to include the implementation
 #include "matrix/kaldi-vector-inl.h"
 
 
-
 #endif  // KALDI_MATRIX_KALDI_VECTOR_H_
diff --git a/runtime/engine/common/matrix/matrix-common.h b/runtime/engine/common/matrix/matrix-common.h
index b7bdbbc8b..512beb204 100644
--- a/runtime/engine/common/matrix/matrix-common.h
+++ b/runtime/engine/common/matrix/matrix-common.h
@@ -27,52 +27,58 @@
 
 namespace kaldi {
 // this enums equal to CblasTrans and CblasNoTrans constants from CBLAS library
-// we are writing them as literals because we don't want to include here matrix/kaldi-blas.h,
-// which puts many symbols into global scope (like "real") via the header f2c.h 
+// we are writing them as literals because we don't want to include here
+// matrix/kaldi-blas.h,
+// which puts many symbols into global scope (like "real") via the header f2c.h
 typedef enum {
-  kTrans    = 112, // = CblasTrans
-  kNoTrans  = 111  // = CblasNoTrans
+    kTrans = 112,   // = CblasTrans
+    kNoTrans = 111  // = CblasNoTrans
 } MatrixTransposeType;
 
-typedef enum {
-  kSetZero,
-  kUndefined,
-  kCopyData
-} MatrixResizeType;
+typedef enum { kSetZero, kUndefined, kCopyData } MatrixResizeType;
 
 
 typedef enum {
-  kDefaultStride,
-  kStrideEqualNumCols,
+    kDefaultStride,
+    kStrideEqualNumCols,
 } MatrixStrideType;
 
 typedef enum {
-  kTakeLower,
-  kTakeUpper,
-  kTakeMean,
-  kTakeMeanAndCheck
+    kTakeLower,
+    kTakeUpper,
+    kTakeMean,
+    kTakeMeanAndCheck
 } SpCopyType;
 
-template<typename Real> class VectorBase;
-template<typename Real> class Vector;
-template<typename Real> class SubVector;
-template<typename Real> class MatrixBase;
-template<typename Real> class SubMatrix;
-template<typename Real> class Matrix;
+template <typename Real>
+class VectorBase;
+template <typename Real>
+class Vector;
+template <typename Real>
+class SubVector;
+template <typename Real>
+class MatrixBase;
+template <typename Real>
+class SubMatrix;
+template <typename Real>
+class Matrix;
 
 
 /// This class provides a way for switching between double and float types.
-template<typename T> class OtherReal { };  // useful in reading+writing routines
-                                           // to switch double and float.
+template <typename T>
+class OtherReal {};  // useful in reading+writing routines
+                     // to switch double and float.
 /// A specialized class for switching from float to double.
-template<> class OtherReal<float> {
- public:
-  typedef double Real;
+template <>
+class OtherReal<float> {
+  public:
+    typedef double Real;
 };
 /// A specialized class for switching from double to float.
-template<> class OtherReal<double> {
- public:
-  typedef float Real;
+template <>
+class OtherReal<double> {
+  public:
+    typedef float Real;
 };
 
 
@@ -81,12 +87,10 @@ typedef int32 SignedMatrixIndexT;
 typedef uint32 UnsignedMatrixIndexT;
 
 // If you want to use size_t for the index type, do as follows instead:
-//typedef size_t MatrixIndexT;
-//typedef ssize_t SignedMatrixIndexT;
-//typedef size_t UnsignedMatrixIndexT;
-
+// typedef size_t MatrixIndexT;
+// typedef ssize_t SignedMatrixIndexT;
+// typedef size_t UnsignedMatrixIndexT;
 }
 
 
-
 #endif  // KALDI_MATRIX_MATRIX_COMMON_H_
diff --git a/runtime/engine/kaldi/CMakeLists.txt b/runtime/engine/kaldi/CMakeLists.txt
index f9b42e065..e55cecbb2 100644
--- a/runtime/engine/kaldi/CMakeLists.txt
+++ b/runtime/engine/kaldi/CMakeLists.txt
@@ -1,14 +1,15 @@
-project(kaldi)
 include_directories(
 ${CMAKE_CURRENT_SOURCE_DIR}
 )
 
 add_subdirectory(base)
 add_subdirectory(util)
-add_subdirectory(lat)
-add_subdirectory(fstext)
-add_subdirectory(decoder)
-add_subdirectory(lm)
+if(WITH_ASR)
+    add_subdirectory(lat)
+    add_subdirectory(fstext)
+    add_subdirectory(decoder)
+    add_subdirectory(lm)
 
-add_subdirectory(fstbin)
-add_subdirectory(lmbin)
+    add_subdirectory(fstbin)
+    add_subdirectory(lmbin)
+endif()
diff --git a/runtime/engine/kaldi/base/kaldi-types.h b/runtime/engine/kaldi/base/kaldi-types.h
index c6a3e1aed..f371e3dae 100644
--- a/runtime/engine/kaldi/base/kaldi-types.h
+++ b/runtime/engine/kaldi/base/kaldi-types.h
@@ -44,7 +44,19 @@ typedef float   BaseFloat;
 
 #ifndef COMPILE_WITHOUT_OPENFST
 
+#ifdef WITH_ASR
 #include <fst/types.h>
+#else
+using int8 = int8_t;
+using int16 = int16_t;
+using int32 = int32_t;
+using int64 = int64_t;
+
+using uint8 = uint8_t;
+using uint16 = uint16_t;
+using uint32 = uint32_t;
+using uint64 = uint64_t;
+#endif
 
 namespace kaldi {
   using ::int16;
diff --git a/runtime/engine/vad/CMakeLists.txt b/runtime/engine/vad/CMakeLists.txt
new file mode 100644
index 000000000..d13cc407c
--- /dev/null
+++ b/runtime/engine/vad/CMakeLists.txt
@@ -0,0 +1,18 @@
+# set(CMAKE_CXX_STANDARD 11)
+
+# # 指定下载解压后的fastdeploy库路径
+# set(FASTDEPLOY_INSTALL_DIR "fdlib/fastdeploy-linux-x64-1.0.4" CACHE STRING force)
+
+# if(NOT EXISTS ${FASTDEPLOY_INSTALL_DIR})
+#     message(FATAL_ERROR "Please using cmake -B build -DFASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR}")
+# endif()
+
+# include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
+
+# # 添加FastDeploy依赖头文件
+# include_directories(${FASTDEPLOY_INCS})
+
+add_executable(infer_onnx_silero_vad ${CMAKE_CURRENT_SOURCE_DIR}/infer_onnx_silero_vad.cc wav.h vad.cc vad.h)
+
+# 添加FastDeploy库依赖
+target_link_libraries(infer_onnx_silero_vad ${FASTDEPLOY_LIBS})
diff --git a/runtime/engine/vad/README.md b/runtime/engine/vad/README.md
new file mode 100644
index 000000000..f032be862
--- /dev/null
+++ b/runtime/engine/vad/README.md
@@ -0,0 +1,121 @@
+English | [简体中文](README_CN.md)
+
+# Silero VAD Deployment Example
+
+This directory provides examples that `infer_onnx_silero_vad` fast finishes the deployment of VAD models on CPU/GPU.
+
+Before deployment, two steps require confirmation.
+
+- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).  
+- 2. Download the precompiled deployment library and samples code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).
+
+Taking VAD inference on Linux as an example, the compilation test can be completed by executing the following command in this directory.
+
+```bash
+mkdir build
+cd build
+# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above
+wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
+tar xvf fastdeploy-linux-x64-x.x.x.tgz
+cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
+make -j
+
+# Download the VAD model file and test audio. After decompression, place the model and test audio in the infer_onnx_silero_vad.cc peer directory
+wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz
+wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav
+
+# inference
+./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav
+```
+
+- The above command works for Linux or MacOS. Refer to:
+  - [How to use FastDeploy C++ SDK in Windows](../../../../docs/en/faq/use_sdk_on_windows.md)  for SDK use-pattern in Windows
+
+## VAD C++ Interface
+
+### Vad Class
+
+```c++
+Vad::Vad(const std::string& model_file,
+    const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption())
+```
+
+**Parameter**
+
+> * **model_file**(str): Model file path
+> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default. (use the default configuration)
+
+### setAudioCofig function
+
+**Must be called before the `init` function**
+
+```c++
+void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms);
+```
+
+**Parameter**
+
+> * **sr**(int): sampling rate
+> * **frame_ms**(int): The length of each detection frame, and it is used to calculate the detection window size
+> * **threshold**(float): Result probability judgment threshold
+> * **min_silence_duration_ms**(int): The threshold used to calculate whether it is silence
+> * **speech_pad_ms**(int): Used to calculate the end time of the speech
+
+### init function
+
+Used to initialize audio-related parameters.
+
+```c++
+void Vad::init();
+```
+
+### loadAudio function
+
+Load audio.
+
+```c++
+void Vad::loadAudio(const std::string& wavPath)
+```
+
+**Parameter**
+
+> * **wavPath**(str): Audio file path
+
+### Predict function
+
+Used to start model reasoning.
+
+```c++
+bool Vad::Predict();
+```
+
+### getResult function
+
+**Used to obtain reasoning results**
+
+```c++
+std::vector<std::map<std::string, float>> Vad::getResult(
+            float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0,
+            float mergeThreshold = 0.3);
+```
+
+**Parameter**
+
+> * **removeThreshold**(float): Discard result fragment threshold; If some recognition results are too short, they will be discarded according to this threshold
+> * **expandHeadThreshold**(float): Offset at the beginning of the segment; The recognized start time may be too close to the voice part, so move forward the start time accordingly
+> * **expandTailThreshold**(float): Offset at the end of the segment; The recognized end time may be too close to the voice part, so the end time is moved back accordingly
+> * **mergeThreshold**(float): Some result segments are very close and can be combined into one, and the vocal segments can be combined accordingly
+
+**The output result format is**`std::vector<std::map<std::string, float>>`
+
+> Output a list, each element is a speech fragment
+>
+> Each clip can use 'start' to get the start time and 'end' to get the end time
+
+### Tips
+
+1. `The setAudioCofig`function must be called before the `init` function
+2. The sampling rate of the input audio file must be consistent with that set in the code
+
+- [Model Description](../)
+- [How to switch the model inference backend engine](../../../../docs/en/faq/how_to_change_backend.md)
diff --git a/runtime/engine/vad/README_CN.md b/runtime/engine/vad/README_CN.md
new file mode 100644
index 000000000..c45d9896c
--- /dev/null
+++ b/runtime/engine/vad/README_CN.md
@@ -0,0 +1,119 @@
+[English](README.md) | 简体中文
+# Silero VAD 部署示例
+
+本目录下提供`infer_onnx_silero_vad`快速完成 Silero VAD 模型在CPU/GPU。
+
+在部署前，需确认以下两个步骤
+
+- 1. 软硬件环境满足要求，参考[FastDeploy环境要求](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
+- 2. 根据开发环境，下载预编译部署库和samples代码，参考[FastDeploy预编译库](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
+
+以Linux上 VAD 推理为例，在本目录执行如下命令即可完成编译测试。
+
+```bash
+mkdir build
+cd build
+# 下载FastDeploy预编译库，用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用
+wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
+tar xvf fastdeploy-linux-x64-x.x.x.tgz
+cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
+make -j
+
+# 下载 VAD 模型文件和测试音频，解压后将模型和测试音频放置在与 infer_onnx_silero_vad.cc 同级目录下
+wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz
+wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav
+
+# 推理
+./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav
+```
+
+以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考:
+- [如何在Windows中使用FastDeploy C++ SDK](../../../../docs/cn/faq/use_sdk_on_windows.md)
+
+## VAD C++ 接口
+### Vad 类
+
+```c++
+Vad::Vad(const std::string& model_file,
+    const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption())
+```
+
+**参数**
+
+> * **model_file**(str): 模型文件路径
+> * **runtime_option**(RuntimeOption): 后端推理配置，默认为None，即采用默认配置
+
+### setAudioCofig 函数
+
+**必须在`init`函数前调用**
+
+```c++
+void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms);
+```
+
+**参数**
+
+> * **sr**(int): 采样率
+> * **frame_ms**(int): 每次检测帧长，用于计算检测窗口大小
+> * **threshold**(float): 结果概率判断阈值
+> * **min_silence_duration_ms**(int): 用于计算判断是否是 silence 的阈值
+> * **speech_pad_ms**(int): 用于计算 speach 结束时刻
+
+### init 函数
+
+用于初始化音频相关参数
+
+```c++
+void Vad::init();
+```
+
+### loadAudio 函数
+
+加载音频
+
+```c++
+void Vad::loadAudio(const std::string& wavPath)
+```
+
+**参数**
+
+> * **wavPath**(str): 音频文件路径
+
+### Predict 函数
+
+用于开始模型推理
+
+```c++
+bool Vad::Predict();
+```
+
+### getResult 函数
+
+**用于获取推理结果**
+
+```c++
+std::vector<std::map<std::string, float>> Vad::getResult(
+            float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0,
+            float mergeThreshold = 0.3);
+```
+
+**参数**
+
+> * **removeThreshold**(float): 丢弃结果片段阈值；部分识别结果太短则根据此阈值丢弃
+> * **expandHeadThreshold**(float): 结果片段开始时刻偏移；识别到的开始时刻可能过于贴近发声部分，因此据此前移开始时刻
+> * **expandTailThreshold**(float): 结果片段结束时刻偏移；识别到的结束时刻可能过于贴近发声部分，因此据此后移结束时刻
+> * **mergeThreshold**(float): 有的结果片段十分靠近，可以合并成一个，据此合并发声片段
+
+**输出结果格式为**`std::vector<std::map<std::string, float>>`
+
+> 输出一个列表，每个元素是一个讲话片段
+>
+> 每个片段可以用 'start' 获取到开始时刻，用 'end' 获取到结束时刻
+
+### 提示
+
+1. `setAudioCofig`函数必须在`init`函数前调用
+2. 输入的音频文件的采样率必须与代码中设置的保持一致
+
+- [模型介绍](../)
+- [如何切换模型推理后端引擎](../../../../docs/cn/faq/how_to_change_backend.md)
diff --git a/runtime/engine/vad/infer_onnx_silero_vad.cc b/runtime/engine/vad/infer_onnx_silero_vad.cc
new file mode 100644
index 000000000..7fb524060
--- /dev/null
+++ b/runtime/engine/vad/infer_onnx_silero_vad.cc
@@ -0,0 +1,65 @@
+
+#include "vad.h"
+
+int main(int argc, char* argv[]) {
+    if (argc < 3) {
+        std::cout << "Usage: infer_onnx_silero_vad path/to/model path/to/audio "
+                     "run_option, "
+                     "e.g ./infer_onnx_silero_vad silero_vad.onnx sample.wav"
+                  << std::endl;
+        return -1;
+    }
+
+    std::string model_file = argv[1];
+    std::string audio_file = argv[2];
+
+    int sr = 16000;
+    Vad vad(model_file);
+    // custom config, but must be set before init
+    vad.SetConfig(sr, 32, 0.45f, 200, 0, 0);
+    vad.Init();
+
+    std::vector<float> inputWav;  // [0, 1]
+    wav::WavReader wav_reader = wav::WavReader(audio_file);
+    assert(wav_reader.sample_rate() == sr);
+
+
+    auto num_samples = wav_reader.num_samples();
+    inputWav.resize(num_samples);
+    for (int i = 0; i < num_samples; i++) {
+        inputWav[i] = wav_reader.data()[i] / 32768;
+    }
+
+    int window_size_samples = vad.WindowSizeSamples();
+    for (int64_t j = 0; j < num_samples; j += window_size_samples) {
+        auto start = j;
+        auto end = start + window_size_samples >= num_samples
+                       ? num_samples
+                       : start + window_size_samples;
+        auto current_chunk_size = end - start;
+
+        std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
+        assert(r.size() == current_chunk_size);
+
+        if (!vad.ForwardChunk(r)) {
+            std::cerr << "Failed to inference while using model:"
+                      << vad.ModelName() << "." << std::endl;
+            return false;
+        }
+
+        Vad::State s = vad.Postprocess();
+        std::cout << s << " ";
+    }
+    std::cout << std::endl;
+
+    std::vector<std::map<std::string, float>> result = vad.GetResult();
+    for (auto& res : result) {
+        std::cout << "speak start: " << res["start"]
+                  << " s, end: " << res["end"] << " s | ";
+    }
+    std::cout << "\b\b " << std::endl;
+
+    vad.Reset();
+
+    return 0;
+}
diff --git a/runtime/engine/vad/vad.cc b/runtime/engine/vad/vad.cc
new file mode 100644
index 000000000..7630b98df
--- /dev/null
+++ b/runtime/engine/vad/vad.cc
@@ -0,0 +1,306 @@
+// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "vad.h"
+#include <cstring>
+#include <iomanip>
+
+
+#ifdef NDEBUG
+#define LOG_DEBUG                                                              \
+    ::fastdeploy::FDLogger(true, "[DEBUG]") << __REL_FILE__ << "(" << __LINE__ \
+                                            << ")::" << __FUNCTION__ << "\t"
+#else
+#define LOG_DEBUG                            \
+    ::fastdeploy::FDLogger(false, "[DEBUG]") \
+        << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
+#endif
+
+Vad::Vad(const std::string& model_file,
+         const fastdeploy::RuntimeOption&
+             custom_option /* = fastdeploy::RuntimeOption() */) {
+    valid_cpu_backends = {fastdeploy::Backend::ORT,
+                          fastdeploy::Backend::OPENVINO};
+    valid_gpu_backends = {fastdeploy::Backend::ORT, fastdeploy::Backend::TRT};
+
+    runtime_option = custom_option;
+    // ORT backend
+    runtime_option.UseCpu();
+    runtime_option.UseOrtBackend();
+    runtime_option.model_format = fastdeploy::ModelFormat::ONNX;
+    // grap opt level
+    runtime_option.ort_option.graph_optimization_level = 99;
+    // one-thread
+    runtime_option.ort_option.intra_op_num_threads = 1;
+    runtime_option.ort_option.inter_op_num_threads = 1;
+    // model path
+    runtime_option.model_file = model_file;
+}
+
+void Vad::Init() {
+    std::call_once(init_, [&]() { initialized = Initialize(); });
+}
+
+std::string Vad::ModelName() const { return "VAD"; }
+
+void Vad::SetConfig(int sr,
+                    int frame_ms,
+                    float threshold,
+                    int min_silence_duration_ms,
+                    int speech_pad_left_ms,
+                    int speech_pad_right_ms) {
+    if (initialized) {
+        fastdeploy::FDERROR << "SetConfig must be called before init"
+                            << std::endl;
+        throw std::runtime_error("SetConfig must be called before init");
+    }
+    sample_rate_ = sr;
+    sr_per_ms_ = sr / 1000;
+    threshold_ = threshold;
+    frame_ms_ = frame_ms;
+    min_silence_samples_ = min_silence_duration_ms * sr_per_ms_;
+    speech_pad_left_samples_ = speech_pad_left_ms * sr_per_ms_;
+    speech_pad_right_samples_ = speech_pad_right_ms * sr_per_ms_;
+
+    // init chunk size
+    window_size_samples_ = frame_ms * sr_per_ms_;
+    current_chunk_size_ = window_size_samples_;
+
+    fastdeploy::FDINFO << "sr=" << sr << " threshold=" << threshold
+                       << " frame_ms=" << frame_ms
+                       << " min_silence_duration_ms=" << min_silence_duration_ms
+                       << " speech_pad_left_ms=" << speech_pad_left_ms
+                       << " speech_pad_right_ms=" << speech_pad_right_ms;
+}
+
+void Vad::Reset() {
+    std::memset(h_.data(), 0.0f, h_.size() * sizeof(float));
+    std::memset(c_.data(), 0.0f, c_.size() * sizeof(float));
+
+    triggerd_ = false;
+    temp_end_ = 0;
+    current_sample_ = 0;
+
+    speakStart_.clear();
+    speakEnd_.clear();
+
+    states_.clear();
+}
+
+bool Vad::Initialize() {
+    // input & output holder
+    inputTensors_.resize(4);
+    outputTensors_.resize(3);
+
+    // input shape
+    input_node_dims_.emplace_back(1);
+    input_node_dims_.emplace_back(window_size_samples_);
+    // sr buffer
+    sr_.resize(1);
+    sr_[0] = sample_rate_;
+    // hidden state buffer
+    h_.resize(size_hc_);
+    c_.resize(size_hc_);
+
+    Reset();
+
+    // InitRuntime
+    if (!InitRuntime()) {
+        fastdeploy::FDERROR << "Failed to initialize fastdeploy backend."
+                            << std::endl;
+        return false;
+    }
+    fastdeploy::FDINFO << "init done.";
+    return true;
+}
+
+bool Vad::ForwardChunk(std::vector<float>& chunk) {
+    // last chunk may not be window_size_samples_
+    input_node_dims_.back() = chunk.size();
+    assert(window_size_samples_ >= chunk.size());
+    current_chunk_size_ = chunk.size();
+
+    inputTensors_[0].name = "input";
+    inputTensors_[0].SetExternalData(
+        input_node_dims_, fastdeploy::FDDataType::FP32, chunk.data());
+    inputTensors_[1].name = "sr";
+    inputTensors_[1].SetExternalData(
+        sr_node_dims_, fastdeploy::FDDataType::INT64, sr_.data());
+    inputTensors_[2].name = "h";
+    inputTensors_[2].SetExternalData(
+        hc_node_dims_, fastdeploy::FDDataType::FP32, h_.data());
+    inputTensors_[3].name = "c";
+    inputTensors_[3].SetExternalData(
+        hc_node_dims_, fastdeploy::FDDataType::FP32, c_.data());
+
+    if (!Infer(inputTensors_, &outputTensors_)) {
+        return false;
+    }
+
+    // Push forward sample index
+    current_sample_ += current_chunk_size_;
+    return true;
+}
+
+const Vad::State& Vad::Postprocess() {
+    // update prob, h, c
+    outputProb_ = *(float*)outputTensors_[0].Data();
+    auto* hn = static_cast<float*>(outputTensors_[1].MutableData());
+    std::memcpy(h_.data(), hn, h_.size() * sizeof(float));
+    auto* cn = static_cast<float*>(outputTensors_[2].MutableData());
+    std::memcpy(c_.data(), cn, c_.size() * sizeof(float));
+
+    if (outputProb_ < threshold_ && !triggerd_) {
+        // 1. Silence
+        LOG_DEBUG << "{ silence: " << 1.0 * current_sample_ / sample_rate_
+                  << " s; prob: " << outputProb_ << " }";
+        states_.emplace_back(Vad::State::SIL);
+    } else if (outputProb_ >= threshold_ && !triggerd_) {
+        // 2. Start
+        triggerd_ = true;
+        speech_start_ =
+            current_sample_ - current_chunk_size_ - speech_pad_left_samples_;
+        float start_sec = 1.0 * speech_start_ / sample_rate_;
+        speakStart_.emplace_back(start_sec);
+        LOG_DEBUG << "{ speech start: " << start_sec
+                  << " s; prob: " << outputProb_ << " }";
+        states_.emplace_back(Vad::State::START);
+    } else if (outputProb_ >= threshold_ - 0.15 && triggerd_) {
+        // 3. Continue
+
+        if (temp_end_ != 0) {
+            // speech prob relaxation, speech continues again
+            LOG_DEBUG << "{ speech fake end(sil < min_silence_ms) to continue: "
+                      << 1.0 * current_sample_ / sample_rate_
+                      << " s; prob: " << outputProb_ << " }";
+            temp_end_ = 0;
+        } else {
+            // speech prob relaxation, keep tracking speech
+            LOG_DEBUG << "{ speech continue: "
+                      << 1.0 * current_sample_ / sample_rate_
+                      << " s; prob: " << outputProb_ << " }";
+        }
+
+        states_.emplace_back(Vad::State::SPEECH);
+    } else if (outputProb_ < threshold_ - 0.15 && triggerd_) {
+        // 4. End
+        if (temp_end_ == 0) {
+            temp_end_ = current_sample_;
+        }
+
+        // check possible speech end
+        if (current_sample_ - temp_end_ < min_silence_samples_) {
+            // a. silence < min_slience_samples, continue speaking
+            LOG_DEBUG << "{ speech fake end(sil < min_silence_ms): "
+                      << 1.0 * current_sample_ / sample_rate_
+                      << " s; prob: " << outputProb_ << " }";
+            states_.emplace_back(Vad::State::SIL);
+        } else {
+            // b. silence >= min_slience_samples, end speaking
+            speech_end_ = current_sample_ + speech_pad_right_samples_;
+            temp_end_ = 0;
+            triggerd_ = false;
+            auto end_sec = 1.0 * speech_end_ / sample_rate_;
+            speakEnd_.emplace_back(end_sec);
+            LOG_DEBUG << "{ speech end: " << end_sec
+                      << " s; prob: " << outputProb_ << " }";
+            states_.emplace_back(Vad::State::END);
+        }
+    }
+
+    return states_.back();
+}
+
+const std::vector<std::map<std::string, float>> Vad::GetResult(
+    float removeThreshold,
+    float expandHeadThreshold,
+    float expandTailThreshold,
+    float mergeThreshold) const {
+    float audioLength = 1.0 * current_sample_ / sample_rate_;
+    if (speakStart_.empty() && speakEnd_.empty()) {
+        return {};
+    }
+    if (speakEnd_.size() != speakStart_.size()) {
+        // set the audio length as the last end
+        speakEnd_.emplace_back(audioLength);
+    }
+    // Remove too short segments
+    //  auto startIter = speakStart_.begin();
+    //  auto endIter = speakEnd_.begin();
+    //  while (startIter != speakStart_.end()) {
+    //      if (removeThreshold < audioLength &&
+    //          *endIter - *startIter < removeThreshold) {
+    //          startIter = speakStart_.erase(startIter);
+    //          endIter = speakEnd_.erase(endIter);
+    //      } else {
+    //          startIter++;
+    //          endIter++;
+    //      }
+    //  }
+    //  // Expand to avoid to tight cut.
+    //  startIter = speakStart_.begin();
+    //  endIter = speakEnd_.begin();
+    //  *startIter = std::fmax(0.f, *startIter - expandHeadThreshold);
+    //  *endIter = std::fmin(*endIter + expandTailThreshold, *(startIter + 1));
+    //  endIter = speakEnd_.end() - 1;
+    //  startIter = speakStart_.end() - 1;
+    //  *startIter = fmax(*startIter - expandHeadThreshold, *(endIter - 1));
+    //  *endIter = std::fmin(*endIter + expandTailThreshold, audioLength);
+    //  for (int i = 1; i < speakStart_.size() - 1; ++i) {
+    //      speakStart_[i] = std::fmax(speakStart_[i] - expandHeadThreshold,
+    //      speakEnd_[i - 1]);
+    //      speakEnd_[i] = std::fmin(speakEnd_[i] + expandTailThreshold,
+    //      speakStart_[i + 1]);
+    //  }
+    //  // Merge very closed segments
+    //  startIter = speakStart_.begin() + 1;
+    //  endIter = speakEnd_.begin();
+    //  while (startIter != speakStart_.end()) {
+    //      if (*startIter - *endIter < mergeThreshold) {
+    //          startIter = speakStart_.erase(startIter);
+    //          endIter = speakEnd_.erase(endIter);
+    //      } else {
+    //          startIter++;
+    //          endIter++;
+    //      }
+    //  }
+
+    std::vector<std::map<std::string, float>> result;
+    for (int i = 0; i < speakStart_.size(); ++i) {
+        result.emplace_back(std::map<std::string, float>(
+            {{"start", speakStart_[i]}, {"end", speakEnd_[i]}}));
+    }
+    return result;
+}
+
+std::ostream& operator<<(std::ostream& os, const Vad::State& s) {
+    switch (s) {
+        case Vad::State::SIL:
+            os << "[SIL]";
+            break;
+        case Vad::State::START:
+            os << "[STA]";
+            break;
+        case Vad::State::SPEECH:
+            os << "[SPE]";
+            break;
+        case Vad::State::END:
+            os << "[END]";
+            break;
+        default:
+            // illegal state
+            os << "[ILL]";
+            break;
+    }
+    return os;
+}
\ No newline at end of file
diff --git a/runtime/engine/vad/vad.h b/runtime/engine/vad/vad.h
new file mode 100644
index 000000000..6eed7d1c3
--- /dev/null
+++ b/runtime/engine/vad/vad.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <iostream>
+#include <mutex>
+#include <vector>
+#include "./wav.h"
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/runtime.h"
+
+class Vad : public fastdeploy::FastDeployModel {
+  public:
+    enum class State { SIL = 0, START, SPEECH, END };
+    friend std::ostream& operator<<(std::ostream& os, const Vad::State& s);
+
+    Vad(const std::string& model_file,
+        const fastdeploy::RuntimeOption& custom_option =
+            fastdeploy::RuntimeOption());
+
+    void Init();
+
+    void Reset();
+
+    void SetConfig(int sr,
+                   int frame_ms,
+                   float threshold,
+                   int min_silence_duration_ms,
+                   int speech_pad_left_ms,
+                   int speech_pad_right_ms);
+
+    bool ForwardChunk(std::vector<float>& chunk);
+
+    const State& Postprocess();
+
+    const std::vector<std::map<std::string, float>> GetResult(
+        float removeThreshold = 0.0,
+        float expandHeadThreshold = 0.0,
+        float expandTailThreshold = 0,
+        float mergeThreshold = 0.0) const;
+
+    const std::vector<State> GetStates() const { return states_; }
+
+    int SampleRate() const { return sample_rate_; }
+
+    int FrameMs() const { return frame_ms_; }
+    int64_t WindowSizeSamples() const { return window_size_samples_; }
+
+    float Threshold() const { return threshold_; }
+
+    int MinSilenceDurationMs() const {
+        return min_silence_samples_ / sample_rate_;
+    }
+    int SpeechPadLeftMs() const {
+        return speech_pad_left_samples_ / sample_rate_;
+    }
+    int SpeechPadRightMs() const {
+        return speech_pad_right_samples_ / sample_rate_;
+    }
+
+    int MinSilenceSamples() const { return min_silence_samples_; }
+    int SpeechPadLeftSamples() const { return speech_pad_left_samples_; }
+    int SpeechPadRightSamples() const { return speech_pad_right_samples_; }
+
+    std::string ModelName() const override;
+
+  private:
+    bool Initialize();
+
+  private:
+    std::once_flag init_;
+    // input and output
+    std::vector<fastdeploy::FDTensor> inputTensors_;
+    std::vector<fastdeploy::FDTensor> outputTensors_;
+
+    // model states
+    bool triggerd_ = false;
+    unsigned int speech_start_ = 0;
+    unsigned int speech_end_ = 0;
+    unsigned int temp_end_ = 0;
+    unsigned int current_sample_ = 0;
+    unsigned int current_chunk_size_ = 0;
+    // MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes
+    float outputProb_;
+
+    std::vector<float> speakStart_;
+    mutable std::vector<float> speakEnd_;
+
+    std::vector<State> states_;
+
+    /* ========================================================================
+     */
+    int sample_rate_ = 16000;
+    int frame_ms_ = 32;  // 32, 64, 96 for 16k
+    float threshold_ = 0.5f;
+
+    int64_t window_size_samples_;  // support 256 512 768 for 8k; 512 1024 1536
+                                   // for 16k.
+    int sr_per_ms_;                // support 8 or 16
+    int min_silence_samples_;      // sr_per_ms_ * frame_ms_
+    int speech_pad_left_samples_{0};   // usually 250ms
+    int speech_pad_right_samples_{0};  // usually 0
+
+    /* ========================================================================
+     */
+    std::vector<int64_t> sr_;
+    const size_t size_hc_ = 2 * 1 * 64;  // It's FIXED.
+    std::vector<float> h_;
+    std::vector<float> c_;
+
+    std::vector<int64_t> input_node_dims_;
+    const std::vector<int64_t> sr_node_dims_ = {1};
+    const std::vector<int64_t> hc_node_dims_ = {2, 1, 64};
+};
diff --git a/runtime/engine/vad/wav.h b/runtime/engine/vad/wav.h
new file mode 100644
index 000000000..6d1a6f723
--- /dev/null
+++ b/runtime/engine/vad/wav.h
@@ -0,0 +1,197 @@
+// Copyright (c) 2016 Personal (Binbin Zhang)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+
+namespace wav {
+
+struct WavHeader {
+    char riff[4];  // "riff"
+    unsigned int size;
+    char wav[4];  // "WAVE"
+    char fmt[4];  // "fmt "
+    unsigned int fmt_size;
+    uint16_t format;
+    uint16_t channels;
+    unsigned int sample_rate;
+    unsigned int bytes_per_second;
+    uint16_t block_size;
+    uint16_t bit;
+    char data[4];  // "data"
+    unsigned int data_size;
+};
+
+class WavReader {
+  public:
+    WavReader() : data_(nullptr) {}
+    explicit WavReader(const std::string& filename) { Open(filename); }
+
+    bool Open(const std::string& filename) {
+        FILE* fp = fopen(filename.c_str(), "rb");
+        if (NULL == fp) {
+            std::cout << "Error in read " << filename;
+            return false;
+        }
+
+        WavHeader header;
+        fread(&header, 1, sizeof(header), fp);
+        if (header.fmt_size < 16) {
+            fprintf(stderr,
+                    "WaveData: expect PCM format data "
+                    "to have fmt chunk of at least size 16.\n");
+            return false;
+        } else if (header.fmt_size > 16) {
+            int offset = 44 - 8 + header.fmt_size - 16;
+            fseek(fp, offset, SEEK_SET);
+            fread(header.data, 8, sizeof(char), fp);
+        }
+        // check "riff" "WAVE" "fmt " "data"
+
+        // Skip any sub-chunks between "fmt" and "data".  Usually there will
+        // be a single "fact" sub chunk, but on Windows there can also be a
+        // "list" sub chunk.
+        while (0 != strncmp(header.data, "data", 4)) {
+            // We will just ignore the data in these chunks.
+            fseek(fp, header.data_size, SEEK_CUR);
+            // read next sub chunk
+            fread(header.data, 8, sizeof(char), fp);
+        }
+
+        num_channel_ = header.channels;
+        sample_rate_ = header.sample_rate;
+        bits_per_sample_ = header.bit;
+        int num_data = header.data_size / (bits_per_sample_ / 8);
+        data_ = new float[num_data];  // Create 1-dim array
+        num_samples_ = num_data / num_channel_;
+
+        for (int i = 0; i < num_data; ++i) {
+            switch (bits_per_sample_) {
+                case 8: {
+                    char sample;
+                    fread(&sample, 1, sizeof(char), fp);
+                    data_[i] = static_cast<float>(sample);
+                    break;
+                }
+                case 16: {
+                    int16_t sample;
+                    fread(&sample, 1, sizeof(int16_t), fp);
+                    // std::cout << sample;
+                    data_[i] = static_cast<float>(sample);
+                    // std::cout << data_[i];
+                    break;
+                }
+                case 32: {
+                    int sample;
+                    fread(&sample, 1, sizeof(int), fp);
+                    data_[i] = static_cast<float>(sample);
+                    break;
+                }
+                default:
+                    fprintf(stderr, "unsupported quantization bits");
+                    exit(1);
+            }
+        }
+        fclose(fp);
+        return true;
+    }
+
+    int num_channel() const { return num_channel_; }
+    int sample_rate() const { return sample_rate_; }
+    int bits_per_sample() const { return bits_per_sample_; }
+    int num_samples() const { return num_samples_; }
+    const float* data() const { return data_; }
+
+  private:
+    int num_channel_;
+    int sample_rate_;
+    int bits_per_sample_;
+    int num_samples_;  // sample points per channel
+    float* data_;
+};
+
+class WavWriter {
+  public:
+    WavWriter(const float* data,
+              int num_samples,
+              int num_channel,
+              int sample_rate,
+              int bits_per_sample)
+        : data_(data),
+          num_samples_(num_samples),
+          num_channel_(num_channel),
+          sample_rate_(sample_rate),
+          bits_per_sample_(bits_per_sample) {}
+
+    void Write(const std::string& filename) {
+        FILE* fp = fopen(filename.c_str(), "w");
+        // init char 'riff' 'WAVE' 'fmt ' 'data'
+        WavHeader header;
+        char wav_header[44] = {
+            0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57, 0x41, 0x56,
+            0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
+        memcpy(&header, wav_header, sizeof(header));
+        header.channels = num_channel_;
+        header.bit = bits_per_sample_;
+        header.sample_rate = sample_rate_;
+        header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
+        header.size = sizeof(header) - 8 + header.data_size;
+        header.bytes_per_second =
+            sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
+        header.block_size = num_channel_ * (bits_per_sample_ / 8);
+
+        fwrite(&header, 1, sizeof(header), fp);
+
+        for (int i = 0; i < num_samples_; ++i) {
+            for (int j = 0; j < num_channel_; ++j) {
+                switch (bits_per_sample_) {
+                    case 8: {
+                        char sample =
+                            static_cast<char>(data_[i * num_channel_ + j]);
+                        fwrite(&sample, 1, sizeof(sample), fp);
+                        break;
+                    }
+                    case 16: {
+                        int16_t sample =
+                            static_cast<int16_t>(data_[i * num_channel_ + j]);
+                        fwrite(&sample, 1, sizeof(sample), fp);
+                        break;
+                    }
+                    case 32: {
+                        int sample =
+                            static_cast<int>(data_[i * num_channel_ + j]);
+                        fwrite(&sample, 1, sizeof(sample), fp);
+                        break;
+                    }
+                }
+            }
+        }
+        fclose(fp);
+    }
+
+  private:
+    const float* data_;
+    int num_samples_;  // total float points in data_
+    int num_channel_;
+    int sample_rate_;
+    int bits_per_sample_;
+};
+
+}  // namespace wav