refactor audio feat

pull/578/head
Hui Zhang 5 years ago
parent 177f463daa
commit b767486608

@@ -307,6 +307,8 @@
" max_freq=config.data.max_freq,\n",
" target_sample_rate=config.data.target_sample_rate,\n",
" specgram_type=config.data.specgram_type,\n",
" feat_dim=config.data.feat_dim,\n",
" delta_delta=config.data.delat_delta,\n",
" use_dB_normalization=config.data.use_dB_normalization,\n",
" target_dB=config.data.target_dB,\n",
" random_seed=config.data.random_seed,\n",

@@ -98,6 +98,8 @@ def start_server(config, args):
max_freq=config.data.max_freq,
target_sample_rate=config.data.target_sample_rate,
specgram_type=config.data.specgram_type,
feat_dim=config.data.feat_dim,
delta_delta=config.data.delta_delta,
use_dB_normalization=config.data.use_dB_normalization,
target_dB=config.data.target_dB,
random_seed=config.data.random_seed,

@@ -50,6 +50,8 @@ def start_server(config, args):
max_freq=config.data.max_freq,
target_sample_rate=config.data.target_sample_rate,
specgram_type=config.data.specgram_type,
feat_dim=config.data.feat_dim,
delta_delta=config.data.delta_delta,
use_dB_normalization=config.data.use_dB_normalization,
target_dB=config.data.target_dB,
random_seed=config.data.random_seed,

@@ -56,6 +56,8 @@ def tune(config, args):
max_freq=config.data.max_freq,
target_sample_rate=config.data.target_sample_rate,
specgram_type=config.data.specgram_type,
feat_dim=config.data.feat_dim,
delta_delta=config.data.delta_delta,
use_dB_normalization=config.data.use_dB_normalization,
target_dB=config.data.target_dB,
random_seed=config.data.random_seed,

@@ -32,8 +32,10 @@ _C.data = CN(
window_ms=20.0, # ms
n_fft=None, # fft points
max_freq=None, # None for samplerate/2
specgram_type='linear', # 'linear', 'mfcc'
target_sample_rate=16000, # sample rate
specgram_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # feature dim for 'mfcc' and 'fbank'
delta_delta=False, # append delta-delta for 'mfcc' and 'fbank'
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
random_seed=0,
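
For reference, a minimal sketch of the extended data config after this hunk (illustrative only, not part of the commit; the CN(dict(...)) wrapper and any keys outside the hunk are assumed):

from yacs.config import CfgNode as CN

# Sketch only: mirrors the defaults shown in the hunk above.
_C = CN()
_C.data = CN(
    dict(
        stride_ms=10.0,  # ms
        window_ms=20.0,  # ms
        n_fft=None,  # fft points
        max_freq=None,  # None for samplerate/2
        specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
        feat_dim=0,  # feature dim for 'mfcc' and 'fbank'
        delta_delta=False,  # append delta-delta for 'mfcc' and 'fbank'
        target_sample_rate=16000,  # target sample rate
        use_dB_normalization=True,
        target_dB=-20,
        random_seed=0,
    ))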

@@ -163,6 +163,8 @@ class DeepSpeech2Trainer(Trainer):
max_freq=config.data.max_freq,
target_sample_rate=config.data.target_sample_rate,
specgram_type=config.data.specgram_type,
feat_dim=config.data.feat_dim,
delta_delta=config.data.delta_delta,
use_dB_normalization=config.data.use_dB_normalization,
target_dB=config.data.target_dB,
random_seed=config.data.random_seed,
@@ -183,6 +185,8 @@ class DeepSpeech2Trainer(Trainer):
max_freq=config.data.max_freq,
target_sample_rate=config.data.target_sample_rate,
specgram_type=config.data.specgram_type,
feat_dim=config.data.feat_dim,
delta_delta=config.data.delta_delta,
use_dB_normalization=config.data.use_dB_normalization,
target_dB=config.data.target_dB,
random_seed=config.data.random_seed,
@@ -378,6 +382,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
max_freq=config.data.max_freq,
target_sample_rate=config.data.target_sample_rate,
specgram_type=config.data.specgram_type,
feat_dim=config.data.feat_dim,
delta_delta=config.data.delta_delta,
use_dB_normalization=config.data.use_dB_normalization,
target_dB=config.data.target_dB,
random_seed=config.data.random_seed,

@@ -61,7 +61,9 @@ class AudioFeaturizer(object):
use_dB_normalization=True,
target_dB=-20):
self._specgram_type = specgram_type
# mfcc and fbank use `feat_dim`
self._feat_dim = feat_dim
# mfcc and fbank use `delta_delta`
self._delta_delta = delta_delta
self._stride_ms = stride_ms
self._window_ms = window_ms
@@ -130,25 +132,28 @@ class AudioFeaturizer(object):
"""Extract various audio features."""
if self._specgram_type == 'linear':
return self._compute_linear_specgram(
samples, sample_rate, self._stride_ms, self._window_ms,
self._max_freq)
samples,
sample_rate,
stride_ms=self._stride_ms,
window_ms=self._window_ms,
max_freq=self._max_freq)
elif self._specgram_type == 'mfcc':
return self._compute_mfcc(
samples,
sample_rate,
self._stride_ms,
self._feat_dim,
self._window_ms,
self._max_freq,
feat_dim=self._feat_dim,
stride_ms=self._stride_ms,
window_ms=self._window_ms,
max_freq=self._max_freq,
delta_delta=self._delta_delta)
elif self._specgram_type == 'fbank':
return self._compute_fbank(
samples,
sample_rate,
self._stride_ms,
self._feat_dim,
self._window_ms,
self._max_freq,
feat_dim=self._feat_dim,
stride_ms=self._stride_ms,
window_ms=self._window_ms,
max_freq=self._max_freq,
delta_delta=self._delta_delta)
else:
raise ValueError("Unknown specgram_type %s. "
@@ -323,10 +328,9 @@ class AudioFeaturizer(object):
winstep=0.001 * stride_ms,
nfilt=feat_dim,
nfft=512,
lowfreq=max_freq,
highfreq=None,
preemph=0.97,
winfunc=lambda x: np.ones((x, )))
lowfreq=0,
highfreq=max_freq,
preemph=0.97)
fbank_feat = np.transpose(fbank_feat)
if delta_delta:
fbank_feat = self._concat_delta_delta(fbank_feat)
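
A minimal sketch of the fbank path that the rewritten call above exercises, assuming `python_speech_features.logfbank`; the `winlen` argument and the `_concat_delta_delta` helper are not shown in this diff and are assumed here:

import numpy as np
from python_speech_features import logfbank, delta

# Illustrative sketch only, not part of this commit.
def compute_fbank(samples, sample_rate, feat_dim=80, stride_ms=10.0,
                  window_ms=20.0, max_freq=None, delta_delta=False):
    fbank_feat = logfbank(
        signal=samples,
        samplerate=sample_rate,
        winlen=0.001 * window_ms,   # assumed, not visible in the hunk
        winstep=0.001 * stride_ms,
        nfilt=feat_dim,
        nfft=512,
        lowfreq=0,
        highfreq=max_freq,          # None falls back to samplerate/2
        preemph=0.97)
    fbank_feat = np.transpose(fbank_feat)  # (feat_dim, num_frames)
    if delta_delta:
        # Assumed equivalent of _concat_delta_delta: stack static, delta and
        # delta-delta features along the feature axis.
        d = delta(fbank_feat.T, 2).T
        dd = delta(d.T, 2).T
        fbank_feat = np.concatenate([fbank_feat, d, dd], axis=0)
    return fbank_feat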

@@ -56,8 +56,8 @@ class SpeechFeaturizer(object):
vocab_filepath,
spm_model_prefix=None,
specgram_type='linear',
feat_dim=13,
delta_delta=True,
feat_dim=None,
delta_delta=False,
stride_ms=10.0,
window_ms=20.0,
n_fft=None,
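
With the new defaults (`feat_dim=None`, `delta_delta=False`), callers that want mfcc or fbank features now pass `feat_dim` explicitly; a hedged usage sketch (import path and vocab path are assumptions):

# Illustrative only; module path and vocab path are placeholders.
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer

featurizer = SpeechFeaturizer(
    vocab_filepath='data/vocab.txt',
    specgram_type='fbank',
    feat_dim=80,
    delta_delta=False,
    stride_ms=10.0,
    window_ms=20.0)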

@@ -43,6 +43,15 @@ class TextFeaturizer(object):
self.sp = spm.SentencePieceProcessor()
self.sp.Load(spm_model)
def tokenize(self, text):
if self.unit_type == 'char':
tokens = self.char_tokenize(text)
elif self.unit_type == 'word':
tokens = self.word_tokenize(text)
else: # spm
tokens = self.spm_tokenize(text)
return tokens
def featurize(self, text):
"""Convert text string to a list of token indices in char-level.Note
that the token indexing order follows the given vocabulary file.
@@ -52,13 +61,7 @@ class TextFeaturizer(object):
:return: List of char-level token indices.
:rtype: List[int]
"""
if self.unit_type == 'char':
tokens = self.char_tokenize(text)
elif self.unit_type == 'word':
tokens = self.word_tokenize(text)
else:
tokens = self.spm_tokenize(text)
tokens = self.tokenize(text)
ids = []
for token in tokens:
token = token if token in self._vocab_dict else self.unk
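
A short usage sketch of the refactored TextFeaturizer, where `featurize` now delegates to the new shared `tokenize` helper (import path, vocab path and sample text are placeholders):

# Illustrative only, not part of this commit.
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

text_feature = TextFeaturizer('char', 'data/vocab.txt')
tokens = text_feature.tokenize("hello world")  # dispatches on unit type: char/word/spm
ids = text_feature.featurize("hello world")    # tokenize, then map to vocab ids with <unk> fallback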

@@ -55,6 +55,8 @@ def create_dataloader(manifest_path,
window_ms=window_ms,
max_freq=max_freq,
specgram_type=specgram_type,
feat_dim=config.data.feat_dim,
delta_delta=config.data.delta_delta,
use_dB_normalization=use_dB_normalization,
random_seed=random_seed,
keep_transcription_text=keep_transcription_text)

@@ -51,6 +51,8 @@ class ManifestDataset(Dataset):
max_freq=None,
target_sample_rate=16000,
specgram_type='linear',
feat_dim=None,
delta_delta=False,
use_dB_normalization=True,
target_dB=-20,
random_seed=0,
@@ -71,7 +73,9 @@ class ManifestDataset(Dataset):
n_fft (int, optional): fft points for rfft. Defaults to None.
max_freq (int, optional): max cut freq. Defaults to None.
target_sample_rate (int, optional): target sample rate used for training. Defaults to 16000.
specgram_type (str, optional): 'linear' or 'mfcc'. Defaults to 'linear'.
specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
feat_dim (int, optional): audio feature dim, used by 'mfcc' and 'fbank'. Defaults to None.
delta_delta (bool, optional): append delta and delta-delta features, used by 'mfcc' and 'fbank'. Defaults to False.
use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
target_dB (int, optional): target dB. Defaults to -20.
random_seed (int, optional): for random generator. Defaults to 0.
@@ -89,6 +93,8 @@ class ManifestDataset(Dataset):
vocab_filepath=vocab_filepath,
spm_model_prefix=spm_model_prefix,
specgram_type=specgram_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
stride_ms=stride_ms,
window_ms=window_ms,
n_fft=n_fft,
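
A hedged sketch of constructing the dataset with the new fbank-related arguments (import path and the `manifest_path`/`vocab_filepath` keyword names are assumed; file paths are placeholders):

# Illustrative only, not part of this commit.
from deepspeech.io.dataset import ManifestDataset

dataset = ManifestDataset(
    manifest_path='data/manifest.train',
    vocab_filepath='data/vocab.txt',
    specgram_type='fbank',
    feat_dim=80,
    delta_delta=False,
    target_sample_rate=16000,
    use_dB_normalization=True,
    target_dB=-20,
    random_seed=0)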

@@ -40,7 +40,9 @@ fi
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.tiny.raw" \
--num_samples=64 \
--specgram_type="linear" \
--specgram_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--output_path="data/mean_std.npz"
if [ $? -ne 0 ]; then

@@ -54,17 +54,13 @@ add_arg('spm_model_prefix', str, "spm_model_%(spm_mode)_%(count_threshold)",
args = parser.parse_args()
def count_manifest(counter, manifest_path):
def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
if args.unit_type == 'char':
for char in line_json['text']:
counter.update(char)
elif args.unit_type == 'word':
for word in line_json['text'].split():
counter.update(word)
def read_text_manifest(fileobj, manifest_path):
line = text_feature.tokenize(line_json['text'])
counter.update(line)
def dump_text_manifest(fileobj, manifest_path):
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
fileobj.write(line_json['text'] + "\n")
@@ -77,9 +73,11 @@ def main():
fout.write(UNK + '\n') # <unk> must be 1
if args.unit_type != 'spm':
text_feature = TextFeaturizer(args.unit_type, args.vocab_path)
counter = Counter()
for manifest_path in args.manifest_paths:
count_manifest(counter, manifest_path)
count_manifest(counter, text_feature, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
for char, count in count_sorted:
@@ -93,7 +91,7 @@ def main():
fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
for manifest_path in args.manifest_paths:
read_text_manifest(fp, manifest_path)
dump_text_manifest(fp, manifest_path)
fp.close()
# train
spm.SentencePieceTrainer.Train(
@@ -108,20 +106,29 @@ def main():
# encode
text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
vocabs = set()
# vocabs = set()
# for manifest_path in args.manifest_paths:
# manifest_jsons = read_manifest(manifest_path)
# for line_json in manifest_jsons:
# line = line_json['text']
# enc_line = text_feature.spm_tokenize(line)
# for code in enc_line:
# vocabs.add(code)
# #print(" ".join(enc_line))
# vocabs_sorted = sorted(vocabs)
# for unit in vocabs_sorted:
# fout.write(unit + "\n")
counter = Counter()
for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
line = line_json['text']
enc_line = text_feature.spm_tokenize(line)
for code in enc_line:
vocabs.add(code)
#print(" ".join(enc_line))
vocabs_sorted = sorted(vocabs)
for unit in vocabs_sorted:
fout.write(unit + "\n")
print(f"spm vocab size: {len(vocabs_sorted)}")
count_manifest(counter, text_feature, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
for token, count in count_sorted:
fout.write(token + '\n')
print(f"spm vocab size: {len(count_sorted)}")
fout.write(SOS + "\n") # <sos/eos>
fout.close()
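
Taken together, the build_vocab changes collapse the per-unit-type branches into one counting path; a hedged sketch of the resulting flow (import paths and file paths are assumptions):

from collections import Counter

# Illustrative only; read_manifest and TextFeaturizer come from the repo,
# the import paths below are assumed.
from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

def count_manifest(counter, text_feature, manifest_path):
    # Tokenize every transcript with the shared featurizer and count the tokens.
    for line_json in read_manifest(manifest_path):
        counter.update(text_feature.tokenize(line_json['text']))

counter = Counter()
text_feature = TextFeaturizer('char', 'data/vocab.txt')
count_manifest(counter, text_feature, 'data/manifest.train')
for token, count in sorted(counter.items(), key=lambda x: x[1], reverse=True):
    print(token, count)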

@@ -28,12 +28,13 @@ add_arg('specgram_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int,
13,
"Audio feature dim.")
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool,
False,
"Audio feature with delta delta.")
add_arg('stride_ms', float, 10.0, "stride length in ms.")
add_arg('window_ms', float, 20.0, "window length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('manifest_path', str,
'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.")
@@ -51,7 +52,14 @@ def main():
audio_featurizer = AudioFeaturizer(
specgram_type=args.specgram_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta)
delta_delta=args.delta_delta,
stride_ms=args.stride_ms,
window_ms=args.window_ms,
n_fft=None,
max_freq=None,
target_sample_rate=args.sample_rate,
use_dB_normalization=True,
target_dB=-20)
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
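
For context, the body of `augment_and_featurize` is cut off by the hunk boundary; its usual continuation (assumed, not part of this diff) simply feeds the augmented segment to the featurizer configured above:

# Illustrative completion, assumed from the surrounding file:
def augment_and_featurize(audio_segment):
    augmentation_pipeline.transform_audio(audio_segment)
    return audio_featurizer.featurize(audio_segment)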
