@ -40,15 +40,7 @@ class ManifestDataset(Dataset):
def params ( cls , config : Optional [ CfgNode ] = None ) - > CfgNode :
def params ( cls , config : Optional [ CfgNode ] = None ) - > CfgNode :
default = CfgNode (
default = CfgNode (
dict (
dict (
train_manifest = " " ,
dev_manifest = " " ,
test_manifest = " " ,
manifest = " " ,
manifest = " " ,
unit_type = " char " ,
vocab_filepath = " " ,
spm_model_prefix = " " ,
mean_std_filepath = " " ,
augmentation_config = " " ,
max_input_len = 27.0 ,
max_input_len = 27.0 ,
min_input_len = 0.0 ,
min_input_len = 0.0 ,
max_output_len = float ( ' inf ' ) ,
max_output_len = float ( ' inf ' ) ,
@ -73,25 +65,10 @@ class ManifestDataset(Dataset):
"""
"""
assert ' manifest ' in config . data
assert ' manifest ' in config . data
assert config . data . manifest
assert config . data . manifest
assert ' keep_transcription_text ' in config . collator
if isinstance ( config . data . augmentation_config , ( str , bytes ) ) :
if config . data . augmentation_config :
aug_file = io . open (
config . data . augmentation_config , mode = ' r ' , encoding = ' utf8 ' )
else :
aug_file = io . StringIO ( initial_value = ' {} ' , newline = ' ' )
else :
aug_file = config . data . augmentation_config
assert isinstance ( aug_file , io . StringIO )
dataset = cls (
dataset = cls (
manifest_path = config . data . manifest ,
manifest_path = config . data . manifest ,
unit_type = config . data . unit_type ,
vocab_filepath = config . data . vocab_filepath ,
mean_std_filepath = config . data . mean_std_filepath ,
spm_model_prefix = config . data . spm_model_prefix ,
augmentation_config = aug_file . read ( ) ,
max_input_len = config . data . max_input_len ,
max_input_len = config . data . max_input_len ,
min_input_len = config . data . min_input_len ,
min_input_len = config . data . min_input_len ,
max_output_len = config . data . max_output_len ,
max_output_len = config . data . max_output_len ,
@ -101,23 +78,8 @@ class ManifestDataset(Dataset):
)
)
return dataset
return dataset
def _read_vocab(self, vocab_filepath):
    """Load the vocabulary from a text file holding one token per line.

    Args:
        vocab_filepath (str): path of the vocabulary file, read as UTF-8.

    Returns:
        List[str]: the tokens in file order, without their line terminator.
    """
    with open(vocab_filepath, 'r', encoding='utf-8') as vocab_file:
        # Strip only the newline. Slicing off the last character blindly
        # would truncate the final token when the file does not end with a
        # newline, and stripping all whitespace would destroy tokens that
        # are themselves whitespace (e.g. a space token).
        return [line.rstrip('\n') for line in vocab_file]
def __init__ ( self ,
def __init__ ( self ,
manifest_path ,
manifest_path ,
unit_type ,
vocab_filepath ,
mean_std_filepath ,
spm_model_prefix = None ,
augmentation_config = ' {} ' ,
max_input_len = float ( ' inf ' ) ,
max_input_len = float ( ' inf ' ) ,
min_input_len = 0.0 ,
min_input_len = 0.0 ,
max_output_len = float ( ' inf ' ) ,
max_output_len = float ( ' inf ' ) ,
@ -128,34 +90,16 @@ class ManifestDataset(Dataset):
Args :
Args :
manifest_path ( str ) : manifest json file path
manifest_path ( str ) : manifest json file path
unit_type ( str ) : token unit type , e . g . char , word , spm
vocab_filepath ( str ) : vocab file path .
mean_std_filepath ( str ) : mean and std file path , which suffix is * . npy
spm_model_prefix ( str ) : spm model prefix , need if ` unit_type ` is spm .
augmentation_config ( str , optional ) : augmentation json str . Defaults to ' {} ' .
max_input_len ( float , optional ) : maximum input seq length , in seconds for raw wav , in frame numbers for feature data . Defaults to float ( ' inf ' ) .
max_input_len ( float , optional ) : maximum input seq length , in seconds for raw wav , in frame numbers for feature data . Defaults to float ( ' inf ' ) .
min_input_len ( float , optional ) : minimum input seq length , in seconds for raw wav , in frame numbers for feature data . Defaults to 0.0 .
min_input_len ( float , optional ) : minimum input seq length , in seconds for raw wav , in frame numbers for feature data . Defaults to 0.0 .
max_output_len ( float , optional ) : maximum output seq length , in modeling units . Defaults to float ( ' inf ' ) .
max_output_len ( float , optional ) : maximum output seq length , in modeling units . Defaults to float ( ' inf ' ) .
min_output_len ( float , optional ) : minimum output seq length , in modeling units . Defaults to 0.0 .
min_output_len ( float , optional ) : minimum output seq length , in modeling units . Defaults to 0.0 .
max_output_input_ratio ( float , optional ) : maximum output seq length / input seq length ratio . Defaults to 10.0 .
max_output_input_ratio ( float , optional ) : maximum output seq length / input seq length ratio . Defaults to 10.0 .
min_output_input_ratio ( float , optional ) : minimum output seq length / input seq length ratio . Defaults to 0.05 .
min_output_input_ratio ( float , optional ) : minimum output seq length / input seq length ratio . Defaults to 0.05 .
stride_ms ( float , optional ) : stride size in ms . Defaults to 10.0 .
window_ms ( float , optional ) : window size in ms . Defaults to 20.0 .
n_fft ( int , optional ) : fft points for rfft . Defaults to None .
max_freq ( int , optional ) : max cut freq . Defaults to None .
target_sample_rate ( int , optional ) : target sample rate which used for training . Defaults to 16000.
specgram_type ( str , optional ) : ' linear ' , ' mfcc ' or ' fbank ' . Defaults to ' linear ' .
feat_dim ( int , optional ) : audio feature dim , using by ' mfcc ' or ' fbank ' . Defaults to None .
delta_delta ( bool , optional ) : audio feature with delta - delta , using by ' fbank ' or ' mfcc ' . Defaults to False .
use_dB_normalization ( bool , optional ) : do dB normalization . Defaults to True .
target_dB ( int , optional ) : target dB . Defaults to - 20.
random_seed ( int , optional ) : for random generator . Defaults to 0.
keep_transcription_text ( bool , optional ) : True , when not in training mode , will not do tokenizer ; Defaults to False .
"""
"""
super ( ) . __init__ ( )
super ( ) . __init__ ( )
# self._rng = np.random.RandomState(random_seed)
# read manifest
# read manifest
self . _manifest = read_manifest (
self . _manifest = read_manifest (
manifest_path = manifest_path ,
manifest_path = manifest_path ,
@ -167,51 +111,6 @@ class ManifestDataset(Dataset):
min_output_input_ratio = min_output_input_ratio )
min_output_input_ratio = min_output_input_ratio )
self . _manifest . sort ( key = lambda x : x [ " feat_shape " ] [ 0 ] )
self . _manifest . sort ( key = lambda x : x [ " feat_shape " ] [ 0 ] )
# self._vocab_list = self._read_vocab(vocab_filepath)
# @property
# def manifest(self):
# return self._manifest
# @property
# def vocab_size(self):
# """Return the vocabulary size.
# Returns:
# int: Vocabulary size.
# """
# return len(self._vocab_list)
# @property
# def vocab_list(self):
# """Return the vocabulary in list.
# Returns:
# List[str]:
# """
# return self._vocab_list
# @property
# def vocab_dict(self):
# """Return the vocabulary in dict.
# Returns:
# Dict[str, int]:
# """
# vocab_dict = dict(
# [(token, idx) for (idx, token) in enumerate(self._vocab_list)])
# return vocab_dict
# @property
# def feature_size(self):
# """Return the audio feature size.
# Returns:
# int: audio feature size.
# """
# return self._manifest[0]["feat_shape"][-1]
def __len__ ( self ) :
def __len__ ( self ) :
return len ( self . _manifest )
return len ( self . _manifest )