You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
472 lines
18 KiB
472 lines
18 KiB
3 years ago
|
// util/kaldi-table.h
|
||
|
|
||
|
// Copyright 2009-2011 Microsoft Corporation
|
||
|
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||
|
|
||
|
// See ../../COPYING for clarification regarding multiple authors
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||
|
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||
|
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||
|
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||
|
// See the Apache 2 License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
|
#ifndef KALDI_UTIL_KALDI_TABLE_H_
|
||
|
#define KALDI_UTIL_KALDI_TABLE_H_
|
||
|
|
||
|
#include <string>
|
||
|
#include <vector>
|
||
|
#include <utility>
|
||
|
|
||
|
#include "base/kaldi-common.h"
|
||
|
#include "util/kaldi-holder.h"
|
||
|
|
||
|
namespace kaldi {
|
||
|
|
||
|
// Forward declarations
|
||
|
template<class Holder> class RandomAccessTableReaderImplBase;
|
||
|
template<class Holder> class SequentialTableReaderImplBase;
|
||
|
template<class Holder> class TableWriterImplBase;
|
||
|
|
||
|
/// \addtogroup table_group
|
||
|
/// @{
|
||
|
|
||
|
// This header defines the Table classes (RandomAccessTableReader,
|
||
|
// SequentialTableReader and TableWriter) and explains what the Holder classes,
|
||
|
// which the Table class requires as a template argument, are like. It also
|
||
|
// explains the "rspecifier" and "wspecifier" concepts (these are strings that
|
||
|
// explain how to read/write objects via archives or scp files. A table is
|
||
|
// conceptually a collection of objects of a particular type T indexed by keys
|
||
|
// of type std::string (these Keys additionally have an order within
|
||
|
// each table).
|
||
|
// The Table classes are templated on a type (call it Holder) such that
|
||
|
// Holder::T is a typedef equal to T.
|
||
|
|
||
|
// see kaldi-holder.h for detail on the Holder classes.
|
||
|
|
||
|
typedef std::vector<std::string> KeyList;
|
||
|
|
||
|
// Documentation for "wspecifier"
|
||
|
// "wspecifier" describes how we write a set of objects indexed by keys.
|
||
|
// The basic, unadorned wspecifiers are as follows:
|
||
|
//
|
||
|
// ark:wxfilename
|
||
|
// scp:rxfilename
|
||
|
// ark,scp:filename,wxfilename
|
||
|
// ark,scp:filename,wxfilename
|
||
|
//
|
||
|
//
|
||
|
// We also allow the following modifiers:
|
||
|
// t means text mode.
|
||
|
// b means binary mode.
|
||
|
// f means flush the stream after writing each entry.
|
||
|
// (nf means don't flush, and the default is not to flush).
|
||
|
// p means permissive mode, when writing to an "scp" file only: will ignore
|
||
|
// missing scp entries, i.e. won't write anything for those files but will
|
||
|
// return success status).
|
||
|
//
|
||
|
// So the following are valid wspecifiers:
|
||
|
// ark,b,f:foo
|
||
|
// "ark,b,b:| gzip -c > foo"
|
||
|
// "ark,scp,t,nf:foo.ark,|gzip -c > foo.scp.gz"
|
||
|
// ark,b:-
|
||
|
//
|
||
|
// The meanings of rxfilename and wxfilename are as described in
|
||
|
// kaldi-io.h (they are filenames but include pipes, stdin/stdout
|
||
|
// and so on; filename is a regular filename.
|
||
|
//
|
||
|
|
||
|
// The ark:wxfilename type of wspecifier instructs the class to
|
||
|
// write directly to an archive. For small objects (e.g. lists of ints),
|
||
|
// the text archive format will generally be human readable with one line
|
||
|
// per entry in the archive.
|
||
|
//
|
||
|
// The type "scp:xfilename" refers to an scp file which should
|
||
|
// already exist on disk, and tells us where to write the data for
|
||
|
// each key (usually an actual file); each line of the scp file
|
||
|
// would be:
|
||
|
// key xfilename
|
||
|
//
|
||
|
// The type ark,scp:filename,wxfilename means
|
||
|
// we write both an archive and an scp file that specifies offsets into the
|
||
|
// archive, with lines like:
|
||
|
// key filename:12407
|
||
|
// where the number is the byte offset into the file.
|
||
|
// In this case we restrict the archive-filename to be an actual filename,
|
||
|
// as we can't see a situation where an extended filename would make sense
|
||
|
// for this (we can't fseek() in pipes).
|
||
|
|
||
|
enum WspecifierType {
|
||
|
kNoWspecifier,
|
||
|
kArchiveWspecifier,
|
||
|
kScriptWspecifier,
|
||
|
kBothWspecifier
|
||
|
};
|
||
|
|
||
|
struct WspecifierOptions {
|
||
|
bool binary;
|
||
|
bool flush;
|
||
|
bool permissive; // will ignore absent scp entries.
|
||
|
WspecifierOptions(): binary(true), flush(false), permissive(false) { }
|
||
|
};
|
||
|
|
||
|
// ClassifyWspecifier returns the type of the wspecifier string,
|
||
|
// and (if pointers are non-NULL) outputs the extra information
|
||
|
// about the options, and the script and archive
|
||
|
// filenames.
|
||
|
WspecifierType ClassifyWspecifier(const std::string &wspecifier,
|
||
|
std::string *archive_wxfilename,
|
||
|
std::string *script_wxfilename,
|
||
|
WspecifierOptions *opts);
|
||
|
|
||
|
// ReadScriptFile reads an .scp file in its entirety, and appends it
|
||
|
// (in order as it was in the scp file) in script_out_, which contains
|
||
|
// pairs of (key, xfilename). The .scp
|
||
|
// file format is: on each line, key xfilename
|
||
|
// where xfilename means rxfilename or wxfilename, and may contain internal
|
||
|
// spaces (we trim away any leading or trailing space). The key is space-free.
|
||
|
// ReadScriptFile returns true if the format was valid (empty files
|
||
|
// are valid).
|
||
|
// If 'print_warnings', it will print out warning messages that explain what
|
||
|
// kind of error there was.
|
||
|
bool ReadScriptFile(const std::string &rxfilename,
|
||
|
bool print_warnings,
|
||
|
std::vector<std::pair<std::string, std::string> >
|
||
|
*script_out);
|
||
|
|
||
|
// This version of ReadScriptFile works from an istream.
|
||
|
bool ReadScriptFile(std::istream &is,
|
||
|
bool print_warnings,
|
||
|
std::vector<std::pair<std::string, std::string> >
|
||
|
*script_out);
|
||
|
|
||
|
// Writes, for each entry in script, the first element, then ' ', then the
|
||
|
// second element then '\n'. Checks that the keys (first elements of pairs) are
|
||
|
// valid tokens (nonempty, no whitespace), and the values (second elements of
|
||
|
// pairs) are newline-free and contain no leading or trailing space. Returns
|
||
|
// true on success.
|
||
|
bool WriteScriptFile(const std::string &wxfilename,
|
||
|
const std::vector<std::pair<std::string, std::string> >
|
||
|
&script);
|
||
|
|
||
|
// This version writes to an ostream.
|
||
|
bool WriteScriptFile(std::ostream &os,
|
||
|
const std::vector<std::pair<std::string, std::string> >
|
||
|
&script);
|
||
|
|
||
|
// Documentation for "rspecifier"
|
||
|
// "rspecifier" describes how we read a set of objects indexed by keys.
|
||
|
// The possibilities are:
|
||
|
//
|
||
|
// ark:rxfilename
|
||
|
// scp:rxfilename
|
||
|
//
|
||
|
// We also allow various modifiers:
|
||
|
// o means the program will only ask for each key once, which enables
|
||
|
// the reader to discard already-asked-for values.
|
||
|
// s means the keys are sorted on input (means we don't have to read till
|
||
|
// eof if someone asked for a key that wasn't there).
|
||
|
// cs means that it is called in sorted order (we are generally asserting
|
||
|
// this based on knowledge of how the program works).
|
||
|
// p means "permissive", and causes it to skip over keys whose corresponding
|
||
|
// scp-file entries cannot be read. [and to ignore errors in archives and
|
||
|
// script files, and just consider the "good" entries].
|
||
|
// We allow the negation of the options above, as in no, ns, np,
|
||
|
// but these aren't currently very useful (just equivalent to omitting the
|
||
|
// corresponding option).
|
||
|
// [any of the above options can be prefixed by n to negate them, e.g. no,
|
||
|
// ns, ncs, np; but these aren't currently useful as you could just omit
|
||
|
// the option].
|
||
|
// bg means "background". It currently has no effect for random-access readers,
|
||
|
// but for sequential readers it will cause it to "read ahead" to the next
|
||
|
// value, in a background thread. Recommended when reading larger objects
|
||
|
// such as neural-net training examples, especially when you want to
|
||
|
// maximize GPU usage.
|
||
|
//
|
||
|
// b is ignored [for scripting convenience]
|
||
|
// t is ignored [for scripting convenience]
|
||
|
//
|
||
|
//
|
||
|
// So for instance the following would be a valid rspecifier:
|
||
|
//
|
||
|
// "o, s, p, ark:gunzip -c foo.gz|"
|
||
|
|
||
|
struct RspecifierOptions {
|
||
|
// These options only make a difference for the RandomAccessTableReader class.
|
||
|
bool once; // we assert that the program will only ask for each key once.
|
||
|
bool sorted; // we assert that the keys are sorted.
|
||
|
bool called_sorted; // we assert that the (HasKey(), Value() functions will
|
||
|
// also be called in sorted order. [this implies "once" but not vice versa].
|
||
|
bool permissive; // If "permissive", when reading from scp files it treats
|
||
|
// scp files that can't be read as if the corresponding key were not there.
|
||
|
// For archive files it will suppress errors getting thrown if the archive
|
||
|
// is corrupted and can't be read to the end.
|
||
|
bool background; // For sequential readers, if the background option ("bg")
|
||
|
// is provided, it will read ahead to the next object in a
|
||
|
// background thread.
|
||
|
RspecifierOptions(): once(false), sorted(false),
|
||
|
called_sorted(false), permissive(false),
|
||
|
background(false) { }
|
||
|
};
|
||
|
|
||
|
enum RspecifierType {
|
||
|
kNoRspecifier,
|
||
|
kArchiveRspecifier,
|
||
|
kScriptRspecifier
|
||
|
};
|
||
|
|
||
|
RspecifierType ClassifyRspecifier(const std::string &rspecifier,
|
||
|
std::string *rxfilename,
|
||
|
RspecifierOptions *opts);
|
||
|
|
||
|
|
||
|
/// Allows random access to a collection
|
||
|
/// of objects in an archive or script file; see \ref io_sec_tables.
|
||
|
template<class Holder>
|
||
|
class RandomAccessTableReader {
|
||
|
public:
|
||
|
typedef typename Holder::T T;
|
||
|
|
||
|
RandomAccessTableReader(): impl_(NULL) { }
|
||
|
|
||
|
// This constructor is equivalent to default constructor + "open", but
|
||
|
// throws on error.
|
||
|
explicit RandomAccessTableReader(const std::string &rspecifier);
|
||
|
|
||
|
// Opens the table.
|
||
|
bool Open(const std::string &rspecifier);
|
||
|
|
||
|
// Returns true if table is open.
|
||
|
bool IsOpen() const { return (impl_ != NULL); }
|
||
|
|
||
|
// Close() will close the table [throws if it was not open],
|
||
|
// and returns true on success (false if we were reading an
|
||
|
// archive and we discovered an error in the archive).
|
||
|
bool Close();
|
||
|
|
||
|
// Says if it has this key.
|
||
|
// If you are using the "permissive" (p) read option,
|
||
|
// it will return false for keys whose corresponding entry
|
||
|
// in the scp file cannot be read.
|
||
|
|
||
|
bool HasKey(const std::string &key);
|
||
|
|
||
|
// Value() may throw if you are reading an scp file, you
|
||
|
// do not have the "permissive" (p) option, and an entry
|
||
|
// in the scp file cannot be read. Typically you won't
|
||
|
// want to catch this error.
|
||
|
const T &Value(const std::string &key);
|
||
|
|
||
|
~RandomAccessTableReader();
|
||
|
|
||
|
// Allow copy-constructor only for non-opened readers (needed for inclusion in
|
||
|
// stl vector)
|
||
|
RandomAccessTableReader(const RandomAccessTableReader<Holder>
|
||
|
&other):
|
||
|
impl_(NULL) { KALDI_ASSERT(other.impl_ == NULL); }
|
||
|
private:
|
||
|
// Disallow assignment.
|
||
|
RandomAccessTableReader &operator=(const RandomAccessTableReader<Holder>&);
|
||
|
void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error
|
||
|
// message and dies (with KALDI_ERR) if NULL.
|
||
|
RandomAccessTableReaderImplBase<Holder> *impl_;
|
||
|
};
|
||
|
|
||
|
|
||
|
|
||
|
/// A templated class for reading objects sequentially from an archive or script
|
||
|
/// file; see \ref io_sec_tables.
|
||
|
template<class Holder>
|
||
|
class SequentialTableReader {
|
||
|
public:
|
||
|
typedef typename Holder::T T;
|
||
|
|
||
|
SequentialTableReader(): impl_(NULL) { }
|
||
|
|
||
|
// This constructor equivalent to default constructor + "open", but
|
||
|
// throws on error.
|
||
|
explicit SequentialTableReader(const std::string &rspecifier);
|
||
|
|
||
|
// Opens the table. Returns exit status; but does throw if previously open
|
||
|
// stream was in error state. You can call Close to prevent this; anyway,
|
||
|
// calling Open more than once is not usually needed.
|
||
|
bool Open(const std::string &rspecifier);
|
||
|
|
||
|
// Returns true if we're done. It will also return true if there's some kind
|
||
|
// of error and we can't read any more; in this case, you can detect the
|
||
|
// error by calling Close and checking the return status; otherwise
|
||
|
// the destructor will throw.
|
||
|
inline bool Done();
|
||
|
|
||
|
// Only valid to call Key() if Done() returned false.
|
||
|
inline std::string Key();
|
||
|
|
||
|
// FreeCurrent() is provided as an optimization to save memory, for large
|
||
|
// objects. It instructs the class to deallocate the current value. The
|
||
|
// reference Value() will be invalidated by this.
|
||
|
void FreeCurrent();
|
||
|
|
||
|
// Return reference to the current value. It's only valid to call this if
|
||
|
// Done() returned false. The reference is valid till next call to this
|
||
|
// object. It will throw if you are reading an scp file, did not specify the
|
||
|
// "permissive" (p) option and the file cannot be read. [The permissive
|
||
|
// option makes it behave as if that key does not even exist, if the
|
||
|
// corresponding file cannot be read.] You probably wouldn't want to catch
|
||
|
// this exception; the user can just specify the p option in the rspecifier.
|
||
|
// We make this non-const to enable things like shallow swap on the held
|
||
|
// object in situations where this would avoid making a redundant copy.
|
||
|
T &Value();
|
||
|
|
||
|
// Next goes to the next key. It will not throw; any error will
|
||
|
// result in Done() returning true, and then the destructor will
|
||
|
// throw unless you call Close().
|
||
|
void Next();
|
||
|
|
||
|
// Returns true if table is open for reading (does not imply
|
||
|
// stream is in good state).
|
||
|
bool IsOpen() const;
|
||
|
|
||
|
// Close() will return false (failure) if Done() became true
|
||
|
// because of an error/ condition rather than because we are
|
||
|
// really done [e.g. because of an error or early termination
|
||
|
// in the archive].
|
||
|
// If there is an error and you don't call Close(), the destructor
|
||
|
// will fail.
|
||
|
// Close()
|
||
|
bool Close();
|
||
|
|
||
|
// The destructor may throw. This is the desired behaviour, as it's the way
|
||
|
// we signal the error to the user (to detect it, call Close(). The issue is
|
||
|
// that otherwise the user has no way to tell whether Done() returned true
|
||
|
// because we reached the end of the archive or script, or because there was
|
||
|
// an error that prevented further reading.
|
||
|
~SequentialTableReader();
|
||
|
|
||
|
// Allow copy-constructor only for non-opened readers (needed for inclusion in
|
||
|
// stl vector)
|
||
|
SequentialTableReader(const SequentialTableReader<Holder> &other):
|
||
|
impl_(NULL) { KALDI_ASSERT(other.impl_ == NULL); }
|
||
|
private:
|
||
|
// Disallow assignment.
|
||
|
SequentialTableReader &operator = (const SequentialTableReader<Holder>&);
|
||
|
void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error
|
||
|
// message and dies (with KALDI_ERR) if NULL.
|
||
|
SequentialTableReaderImplBase<Holder> *impl_;
|
||
|
};
|
||
|
|
||
|
|
||
|
/// A templated class for writing objects to an
|
||
|
/// archive or script file; see \ref io_sec_tables.
|
||
|
template<class Holder>
|
||
|
class TableWriter {
|
||
|
public:
|
||
|
typedef typename Holder::T T;
|
||
|
|
||
|
TableWriter(): impl_(NULL) { }
|
||
|
|
||
|
// This constructor equivalent to default constructor
|
||
|
// + "open", but throws on error. See docs for
|
||
|
// wspecifier above.
|
||
|
explicit TableWriter(const std::string &wspecifier);
|
||
|
|
||
|
// Opens the table. See docs for wspecifier above.
|
||
|
// If it returns true, it is open.
|
||
|
bool Open(const std::string &wspecifier);
|
||
|
|
||
|
// Returns true if open for writing.
|
||
|
bool IsOpen() const;
|
||
|
|
||
|
// Write the object. Throws KaldiFatalError on error via the KALDI_ERR macro.
|
||
|
inline void Write(const std::string &key, const T &value) const;
|
||
|
|
||
|
|
||
|
// Flush will flush any archive; it does not return error status
|
||
|
// or throw, any errors will be reported on the next Write or Close.
|
||
|
// Useful if we may be writing to a command in a pipe and want
|
||
|
// to ensure good CPU utilization.
|
||
|
void Flush();
|
||
|
|
||
|
// Close() is not necessary to call, as the destructor
|
||
|
// closes it; it's mainly useful if you want to handle
|
||
|
// error states because the destructor will throw on
|
||
|
// error if you do not call Close().
|
||
|
bool Close();
|
||
|
|
||
|
~TableWriter();
|
||
|
|
||
|
// Allow copy-constructor only for non-opened writers (needed for inclusion in
|
||
|
// stl vector)
|
||
|
TableWriter(const TableWriter &other): impl_(NULL) {
|
||
|
KALDI_ASSERT(other.impl_ == NULL);
|
||
|
}
|
||
|
private:
|
||
|
TableWriter &operator = (const TableWriter&); // Disallow assignment.
|
||
|
|
||
|
void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error
|
||
|
// message and dies (with KALDI_ERR) if NULL.
|
||
|
TableWriterImplBase<Holder> *impl_;
|
||
|
};
|
||
|
|
||
|
|
||
|
/// This class is for when you are reading something in random access, but
|
||
|
/// it may actually be stored per-speaker (or something similar) but the
|
||
|
/// keys you're using are per utterance. So you also provide an "rxfilename"
|
||
|
/// for a file containing lines like
|
||
|
/// utt1 spk1
|
||
|
/// utt2 spk1
|
||
|
/// utt3 spk1
|
||
|
/// and so on. Note: this is optional; if it is an empty string, we just won't
|
||
|
/// do the mapping. Also, "table_rxfilename" may be the empty string (as for
|
||
|
/// a regular table), in which case the table just won't be opened.
|
||
|
/// We provide only the most frequently used of the functions of
|
||
|
/// RandomAccessTableReader.
|
||
|
|
||
|
template<class Holder>
|
||
|
class RandomAccessTableReaderMapped {
|
||
|
public:
|
||
|
typedef typename Holder::T T;
|
||
|
/// Note: "utt2spk_rxfilename" will in the normal case be an rxfilename
|
||
|
/// for an utterance to speaker map, but this code is general; it accepts
|
||
|
/// a generic map.
|
||
|
RandomAccessTableReaderMapped(const std::string &table_rxfilename,
|
||
|
const std::string &utt2spk_rxfilename);
|
||
|
|
||
|
RandomAccessTableReaderMapped() {}
|
||
|
|
||
|
/// Note: when calling Open, utt2spk_rxfilename may be empty.
|
||
|
bool Open(const std::string &table_rxfilename,
|
||
|
const std::string &utt2spk_rxfilename);
|
||
|
|
||
|
bool HasKey(const std::string &key);
|
||
|
const T &Value(const std::string &key);
|
||
|
inline bool IsOpen() const { return reader_.IsOpen(); }
|
||
|
inline bool Close() { return reader_.Close(); }
|
||
|
|
||
|
|
||
|
|
||
|
// The default copy-constructor will do what we want: it will crash for
|
||
|
// already-opened readers, by calling the member-variable copy-constructors.
|
||
|
private:
|
||
|
// Disallow assignment.
|
||
|
RandomAccessTableReaderMapped &operator =
|
||
|
(const RandomAccessTableReaderMapped<Holder>&);
|
||
|
RandomAccessTableReader<Holder> reader_;
|
||
|
RandomAccessTableReader<TokenHolder> token_reader_;
|
||
|
std::string utt2spk_rxfilename_; // Used only in diagnostic messages.
|
||
|
};
|
||
|
|
||
|
|
||
|
/// @} end "addtogroup table_group"
|
||
|
} // end namespace kaldi
|
||
|
|
||
|
#include "util/kaldi-table-inl.h"
|
||
|
|
||
|
#endif // KALDI_UTIL_KALDI_TABLE_H_
|