You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/runtime/engine/kaldi/util/text-utils.h

282 lines
11 KiB

// util/text-utils.h
// Copyright 2009-2011 Saarland University; Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_UTIL_TEXT_UTILS_H_
#define KALDI_UTIL_TEXT_UTILS_H_
#include <errno.h>
#include <string>
#include <algorithm>
#include <map>
#include <set>
#include <vector>
#include <limits>
#include "base/kaldi-common.h"
namespace kaldi {
/// Splits the string "full" at every occurrence of any one of the
/// single-character delimiters listed in "delim", and writes the resulting
/// pieces to *out.
///  - If omit_empty_strings == true, the output contains only the nonempty
///    strings produced by the split.
///  - If omit_empty_strings == false, the output contains n+1 strings when
///    there are n delimiter characters (characters from the set "delim")
///    within the input string.  In this case the empty string is split to a
///    single empty string.
void SplitStringToVector(const std::string &full, const char *delim,
bool omit_empty_strings,
std::vector<std::string> *out);
/// Joins the elements of the vector of strings "vec_in" into the single string
/// *str_out, using "delim" as the delimiter between elements.
/// If omit_empty_strings == true, any empty strings in the vector are skipped.
/// A vector of empty strings results in an empty string on the output.
void JoinVectorToString(const std::vector<std::string> &vec_in,
const char *delim, bool omit_empty_strings,
std::string *str_out);
/**
\brief Split a string (e.g. 1:2:3) into a vector of integers.
\param [in] delim String containing a list of characters, any of which
is allowed as a delimiter.
\param [in] omit_empty_strings If true, empty strings between delimiters are
allowed and will not produce an output integer; if false,
instances of characters in 'delim' that are consecutive or
at the start or end of the string would be an error.
You'll normally want this to be true if 'delim' consists
of spaces, and false otherwise.
\param [out] out The output list of integers.
*/
template<class I>
bool SplitStringToIntegers(const std::string &full,
const char *delim,
bool omit_empty_strings, // typically false [but
// should probably be true
// if "delim" is spaces].
std::vector<I> *out) {
KALDI_ASSERT(out != NULL);
KALDI_ASSERT_IS_INTEGER_TYPE(I);
if (*(full.c_str()) == '\0') {
out->clear();
return true;
}
std::vector<std::string> split;
SplitStringToVector(full, delim, omit_empty_strings, &split);
out->resize(split.size());
for (size_t i = 0; i < split.size(); i++) {
const char *this_str = split[i].c_str();
char *end = NULL;
int64 j = 0;
j = KALDI_STRTOLL(this_str, &end);
if (end == this_str || *end != '\0') {
out->clear();
return false;
} else {
I jI = static_cast<I>(j);
if (static_cast<int64>(jI) != j) {
// output type cannot fit this integer.
out->clear();
return false;
}
(*out)[i] = jI;
}
}
return true;
}
// Declaration only; the definition is not in this header.
// This is defined for F = float and double.
// NOTE(review): judging by the name and the parallel signature this is the
// floating-point counterpart of SplitStringToIntegers() -- confirm against
// the definition.
template<class F>
bool SplitStringToFloats(const std::string &full,
const char *delim,
bool omit_empty_strings, // typically false
std::vector<F> *out);
/// Converts a string into an integer via strtoll and returns false if there was
/// any kind of problem (i.e. the string was not an integer or contained extra
/// non-whitespace junk, or the integer was too large to fit into the type it is
/// being converted into).  Trailing whitespace after the number is tolerated.
/// Only sets *out if everything was OK and it returns true.
///
/// \param [in]  str  The text to convert.
/// \param [out] out  Receives the converted value on success; left untouched
///                   on failure.
/// \return  true if "str" was converted and the value fits in type Int.
template<class Int>
bool ConvertStringToInteger(const std::string &str,
                            Int *out) {
  KALDI_ASSERT_IS_INTEGER_TYPE(Int);
  const char *this_str = str.c_str();
  char *end = NULL;
  errno = 0;  // so that an error set by the conversion below is detectable.
  int64 i = KALDI_STRTOLL(this_str, &end);
  if (end != this_str) {
    // Skip trailing whitespace.  The cast to unsigned char matters: passing a
    // negative char value (e.g. a non-ASCII byte) to isspace() is undefined.
    while (isspace(static_cast<unsigned char>(*end))) end++;
  }
  // Reject: nothing parsed, leftover non-whitespace characters, or errno set.
  if (end == this_str || *end != '\0' || errno != 0)
    return false;
  Int iInt = static_cast<Int>(i);
  // The round-trip test catches values too wide for Int; the sign test catches
  // negative input for unsigned Int, which can round-trip unchanged.
  if (static_cast<int64>(iInt) != i ||
      (i < 0 && !std::numeric_limits<Int>::is_signed)) {
    return false;
  }
  *out = iInt;
  return true;
}
/// ConvertStringToReal converts the string "str" into either a float or a
/// double (according to T), storing the result in *out.
/// Returns false if there was any kind of problem (i.e. the string was not a
/// floating point number or contained extra non-whitespace junk).
/// Be careful: this function will successfully read inf's or nan's.
/// Declaration only; the definition is not in this header.
template <typename T>
bool ConvertStringToReal(const std::string &str,
T *out);
/// Removes leading and trailing whitespace from the string *str
/// (the string is modified through the pointer; nothing is returned).
void Trim(std::string *str);
/// Removes leading and trailing white space from "line", then splits it on the
/// first section of whitespace found (if present), putting the part before the
/// whitespace in *first and the rest in *rest.  If there is no such whitespace,
/// everything that remains after removing leading and trailing whitespace goes
/// in *first.
void SplitStringOnFirstSpace(const std::string &line,
std::string *first,
std::string *rest);
/// Returns true if "token" is nonempty, and all of its characters are
/// printable and whitespace-free; returns false otherwise.
bool IsToken(const std::string &token);
/// Returns true if "line" is free of \n characters and unprintable
/// characters, and does not contain leading or trailing whitespace;
/// returns false otherwise.
bool IsLine(const std::string &line);
/**
  This function returns true when two text strings are approximately equal, and
  false when they are not.  The definition of 'equal' is normal string
  equality, except that two substrings like "0.31134" and "0.311341" would be
  considered equal.  'decimal_places_check' controls how many digits after
  the '.' have to match up.
  E.g. StringsApproxEqual("hello 0.23 there", "hello 0.24 there", 2) would
  return false because there is a difference in the 2nd decimal, but with
  an argument of 1 it would return true.

  \param [in] a  First string to compare.
  \param [in] b  Second string to compare.
  \param [in] decimal_places_check  Number of digits after the '.' that
                 have to match (default: 2).
*/
bool StringsApproxEqual(const std::string &a,
const std::string &b,
int32 decimal_places_check = 2);
/**
  This class is responsible for parsing input like
  hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e"
  and giving you access to the fields, in this case
  FirstToken() == "hi-there", and key->value pairs:
  xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123",
  bing->"a b c", baz->"a b c d='a b' e"
  The first token is optional; if the line started with a key-value pair then
  FirstToken() will be empty.
  Note: it can parse value fields with space inside them only if they are free
  of the '=' character.  If values are going to contain the '=' character, you
  need to quote them with either single or double quotes.
  Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_.
*/
class ConfigLine {
 public:
  // Tries to parse the line as a config-file line.  Returns false
  // if it could not for some reason, e.g. parsing failure.  In most cases
  // prints no warnings; the user should do this.  Does not expect comments.
  bool ParseLine(const std::string &line);

  // The GetValue functions are overloaded for various types.  They return true
  // if the key exists with a value that can be converted to that type, and
  // false otherwise.  They also mark the key-value pair as having been read.
  // It is not an error to read values twice.
  bool GetValue(const std::string &key, std::string *value);
  bool GetValue(const std::string &key, BaseFloat *value);
  bool GetValue(const std::string &key, int32 *value);
  // Values may be separated by ":" or by ",".
  bool GetValue(const std::string &key, std::vector<int32> *value);
  bool GetValue(const std::string &key, bool *value);

  // NOTE(review): presumably returns true if at least one key-value pair has
  // not been consumed by a GetValue() call -- confirm against the definition.
  bool HasUnusedValues() const;

  /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one
  /// of the GetValue() functions.
  std::string UnusedValues() const;

  // Returns the first token of the line (see first_token_ below).
  const std::string &FirstToken() const { return first_token_; }

  // Returns a copy of whole_line_.  Const-qualified, like FirstToken(), so
  // that it can also be called on a const ConfigLine.
  const std::string WholeLine() const { return whole_line_; }

  // use default assignment operator and copy constructor.
 private:
  std::string whole_line_;
  // the first token of the line, e.g. if line is
  //  foo-bar baz=bing
  // then first_token_ would be "foo-bar".
  std::string first_token_;

  // data_ maps from key to (value, is-this-value-consumed?).
  std::map<std::string, std::pair<std::string, bool> > data_;
};
/// This function is like ExpectToken but for two tokens, and it will either
/// accept token1 and then token2, or just token2.  This is useful in Read
/// functions where the first token may already have been consumed.
///
/// \param [in,out] is  The stream the tokens are read from.
/// \param [in] binary  NOTE(review): presumably selects binary vs. text mode,
///                     as for ExpectToken -- confirm against the definition.
/// \param [in] token1  The optional leading token.
/// \param [in] token2  The token that must be present.
void ExpectOneOrTwoTokens(std::istream &is, bool binary,
const std::string &token1,
const std::string &token2);
/**
  This function reads in a config file from the stream 'is' and *appends* its
  contents to the vector 'lines' (existing elements are kept); it is
  responsible for removing comments (anything after '#') and stripping out any
  lines that contain only whitespace after comment removal.
*/
void ReadConfigLines(std::istream &is,
std::vector<std::string> *lines);
/**
  This function converts config-lines from a simple sequence of strings
  as output by ReadConfigLines(), into a sequence of first-tokens and
  name-value pairs (one ConfigLine per input line, written to 'config_lines').
  The general format is:
  "command-type bar=baz xx=yyy"
  etc., although there are subtleties as to what exactly is allowed; see the
  documentation for class ConfigLine for details.
  This function will die if there was a parsing failure.
*/
void ParseConfigLines(const std::vector<std::string> &lines,
std::vector<ConfigLine> *config_lines);
/// Returns true if 'name' would be a valid name for a component or node in an
/// nnet3 Nnet.  A valid name is a nonempty string beginning with A-Za-z_, and
/// containing only '-', '_', '.', A-Z, a-z, or 0-9.
bool IsValidName(const std::string &name);
} // namespace kaldi
#endif // KALDI_UTIL_TEXT_UTILS_H_