You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
160 lines
5.4 KiB
160 lines
5.4 KiB
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
|
|
#include "frontend/cmvn.h"
|
|
|
|
#include "utils/file_utils.h"
|
|
#include "utils/picojson.h"
|
|
|
|
namespace ppspeech {
|
|
|
|
using kaldi::BaseFloat;
|
|
using std::unique_ptr;
|
|
using std::vector;
|
|
|
|
|
|
// Builds a CMVN stage that wraps `base_extractor` and normalizes its output
// with the statistics stored in the JSON file `cmvn_file`.
//
// cmvn_file:      path of the JSON statistics file; must not be empty.
// base_extractor: upstream extractor; ownership is transferred to this object.
//
// Variance normalization is always switched on by this constructor.
CMVN::CMVN(std::string cmvn_file, unique_ptr<FrontendInterface> base_extractor)
    : var_norm_(true) {
    CHECK_NE(cmvn_file, "");

    base_extractor_ = std::move(base_extractor);

    ReadCMVNFromJson(cmvn_file);

    // ReadCMVNFromJson() stores the frame count as the last element of
    // mean_stats_, so the feature dimension is one less than its length.
    const auto stats_len = mean_stats_.size();
    dim_ = stats_len - 1;
}
|
|
|
|
void CMVN::ReadCMVNFromJson(std::string cmvn_file) {
|
|
std::string json_str = ppspeech::ReadFile2String(cmvn_file);
|
|
picojson::value value;
|
|
std::string err;
|
|
const char* json_end = picojson::parse(
|
|
value, json_str.c_str(), json_str.c_str() + json_str.size(), &err);
|
|
if (!value.is<picojson::object>()) {
|
|
LOG(ERROR) << "Input json file format error.";
|
|
}
|
|
const picojson::value::array& mean_stat =
|
|
value.get("mean_stat").get<picojson::array>();
|
|
for (auto it = mean_stat.begin(); it != mean_stat.end(); it++) {
|
|
mean_stats_.push_back((*it).get<double>());
|
|
}
|
|
|
|
const picojson::value::array& var_stat =
|
|
value.get("var_stat").get<picojson::array>();
|
|
for (auto it = var_stat.begin(); it != var_stat.end(); it++) {
|
|
var_stats_.push_back((*it).get<double>());
|
|
}
|
|
|
|
kaldi::int32 frame_num = value.get("frame_num").get<int64_t>();
|
|
LOG(INFO) << "nframe: " << frame_num;
|
|
mean_stats_.push_back(frame_num);
|
|
var_stats_.push_back(0);
|
|
}
|
|
|
|
// Hands the incoming waves/feats straight to the wrapped extractor. Nothing is
// normalized here; Read() applies CMVN to whatever the extractor produces.
void CMVN::Accept(const std::vector<kaldi::BaseFloat>& inputs) {
    base_extractor_->Accept(inputs);
}
|
|
|
|
bool CMVN::Read(std::vector<BaseFloat>* feats) {
|
|
// compute feature
|
|
if (base_extractor_->Read(feats) == false || feats->size() == 0) {
|
|
return false;
|
|
}
|
|
|
|
// appply cmvn
|
|
kaldi::Timer timer;
|
|
Compute(feats);
|
|
VLOG(1) << "CMVN::Read cost: " << timer.Elapsed() << " sec.";
|
|
return true;
|
|
}
|
|
|
|
// feats contain num_frames feature.
|
|
void CMVN::Compute(vector<BaseFloat>* feats) const {
|
|
KALDI_ASSERT(feats != NULL);
|
|
|
|
if (feats->size() % dim_ != 0) {
|
|
LOG(ERROR) << "Dim mismatch: cmvn " << mean_stats_.size() << ','
|
|
<< var_stats_.size() - 1 << ", feats " << feats->size()
|
|
<< 'x';
|
|
}
|
|
if (var_stats_.size() == 0 && var_norm_) {
|
|
LOG(ERROR)
|
|
<< "You requested variance normalization but no variance stats_ "
|
|
<< "are supplied.";
|
|
}
|
|
|
|
double count = mean_stats_[dim_];
|
|
// Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
|
|
// computing an offset and representing it as stats_, we use a count of one.
|
|
if (count < 1.0)
|
|
LOG(ERROR) << "Insufficient stats_ for cepstral mean and variance "
|
|
"normalization: "
|
|
<< "count = " << count;
|
|
|
|
if (!var_norm_) {
|
|
vector<BaseFloat> offset(feats->size());
|
|
vector<double> mean_stats(mean_stats_);
|
|
for (size_t i = 0; i < mean_stats.size(); ++i) {
|
|
mean_stats[i] /= count;
|
|
}
|
|
vector<double> mean_stats_apply(feats->size());
|
|
// fill the datat of mean_stats in mean_stats_appy whose dim_ is equal
|
|
// with the dim_ of feature.
|
|
// the dim_ of feats = dim_ * num_frames;
|
|
for (int32 idx = 0; idx < feats->size() / dim_; ++idx) {
|
|
std::memcpy(mean_stats_apply.data() + dim_ * idx,
|
|
mean_stats.data(),
|
|
dim_ * sizeof(double));
|
|
}
|
|
for (size_t idx = 0; idx < feats->size(); ++idx) {
|
|
feats->at(idx) += offset[idx];
|
|
}
|
|
return;
|
|
}
|
|
// norm(0, d) = mean offset;
|
|
// norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
|
|
vector<BaseFloat> norm0(feats->size());
|
|
vector<BaseFloat> norm1(feats->size());
|
|
for (int32 d = 0; d < dim_; d++) {
|
|
double mean, offset, scale;
|
|
mean = mean_stats_[d] / count;
|
|
double var = (var_stats_[d] / count) - mean * mean, floor = 1.0e-20;
|
|
if (var < floor) {
|
|
LOG(WARNING) << "Flooring cepstral variance from " << var << " to "
|
|
<< floor;
|
|
var = floor;
|
|
}
|
|
scale = 1.0 / sqrt(var);
|
|
if (scale != scale || 1 / scale == 0.0)
|
|
LOG(ERROR)
|
|
<< "NaN or infinity in cepstral mean/variance computation";
|
|
offset = -(mean * scale);
|
|
for (int32 d_skip = d; d_skip < feats->size();) {
|
|
norm0[d_skip] = offset;
|
|
norm1[d_skip] = scale;
|
|
d_skip = d_skip + dim_;
|
|
}
|
|
}
|
|
// Apply the normalization.
|
|
for (size_t idx = 0; idx < feats->size(); ++idx) {
|
|
feats->at(idx) *= norm1[idx];
|
|
}
|
|
|
|
for (size_t idx = 0; idx < feats->size(); ++idx) {
|
|
feats->at(idx) += norm0[idx];
|
|
}
|
|
}
|
|
|
|
} // namespace ppspeech
|