pull/673/head
Haoxin Ma 4 years ago
parent 60ac4bc2d8
commit ae566f667b

@@ -93,7 +93,29 @@ class AugmentationPipeline():
self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
augmentation_config, 'feature')
def transform_audio(self, audio_segment, single=True):
def randomize_parameters_audio_transform(self):
"""Run the pre-processing pipeline for data augmentation.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to process.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
for augmentor, rate in zip(self._augmentors, self._rates):
augmentor.randomize_parameters()
def randomize_parameters_feature_transform(self, audio):
"""Run the pre-processing pipeline for data augmentation.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to process.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
for augmentor, rate in zip(self._augmentors, self._rates):
augmentor.randomize_parameters(audio)
def apply_audio_transform(self, audio_segment):
"""Run the pre-processing pipeline for data augmentation.
Note that this is an in-place transformation.
@@ -103,9 +125,9 @@ class AugmentationPipeline():
"""
for augmentor, rate in zip(self._augmentors, self._rates):
if self._rng.uniform(0., 1.) < rate:
augmentor.transform_audio(audio_segment, single)
augmentor.apply(audio_segment)
def transform_feature(self, spec_segment, single=True):
def apply_feature_transform(self, spec_segment):
"""spectrogram augmentation.
Args:
@@ -113,9 +135,32 @@ class AugmentationPipeline():
"""
for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
if self._rng.uniform(0., 1.) < rate:
spec_segment = augmentor.transform_feature(spec_segment, single)
spec_segment = augmentor.apply(spec_segment)
return spec_segment
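Taken together, the new methods split augmentation into a randomize step and a deterministic apply step. A toy, self-contained sketch of that contract (GainAugmentor and every name in it are illustrative, not part of this commit):

import numpy as np

class GainAugmentor:
    """Toy augmentor illustrating the randomize-once / apply-many contract."""

    def __init__(self, rng):
        self._rng = rng
        self._gain_db = 0.0

    def randomize_parameters(self):
        # Draw the random gain once; apply() reuses it unchanged.
        self._gain_db = self._rng.uniform(-6.0, 6.0)

    def apply(self, samples):
        # Deterministic given the last randomize_parameters() call.
        samples *= 10.0 ** (self._gain_db / 20.0)
        return samples

rng = np.random.RandomState(0)
aug = GainAugmentor(rng)
aug.randomize_parameters()                 # one draw for the whole batch
batch = [np.ones(4), np.ones(4)]
assert np.allclose(aug.apply(batch[0]), aug.apply(batch[1]))  # same gain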
def _parse_pipeline_from(self, config_json, aug_type='audio'):
"""Parse the config json to build a augmentation pipelien."""
assert aug_type in ('audio', 'feature'), aug_type
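For reference, a hedged example of the kind of JSON such a parser typically consumes in DeepSpeech-style pipelines; the exact keys ("type", "params", "prob") and augmentor names are assumptions, not shown in this diff:

import json

# Hypothetical augmentation config (keys and augmentor names are assumptions).
augmentation_config = json.dumps([
    {"type": "shift", "params": {"min_shift_ms": -5, "max_shift_ms": 5}, "prob": 1.0},
    {"type": "specaug", "params": {"F": 27, "T": 40}, "prob": 1.0},
])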

@@ -124,9 +124,9 @@ class SpecAugmentor(AugmentorBase):
def time_warp(xs, W=40):
raise NotImplementedError
def randomize_parameters(self, xs):
n_bins = xs.shape[0]
n_frames = xs.shape[1]
def randomize_parameters(self, n_bins, n_frames):
self.f = []
self.f_0 = []
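For orientation, a self-contained sketch of a masker that pre-samples its parameters this way; only the f/f_0 lists appear in the diff, so n_masks, F, T, the time-mask lists, and apply are assumptions:

import numpy as np

class FreqTimeMasker:
    """Toy SpecAugment-style masker that pre-samples its parameters so one
    draw can be reused across a batch (mirrors the f / f_0 lists above)."""

    def __init__(self, n_masks=2, F=27, T=40, seed=0):
        self.n_masks = n_masks            # masks per axis (assumption)
        self.F = F                        # max frequency-mask width (assumption)
        self.T = T                        # max time-mask width (assumption)
        self._rng = np.random.RandomState(seed)
        self.f, self.f_0 = [], []         # frequency mask widths / starts
        self.t, self.t_0 = [], []         # time mask widths / starts

    def randomize_parameters(self, n_bins, n_frames):
        """Draw mask widths and offsets for a (n_bins, n_frames) feature."""
        self.f = [int(self._rng.randint(0, self.F)) for _ in range(self.n_masks)]
        self.f_0 = [int(self._rng.randint(0, max(1, n_bins - w))) for w in self.f]
        self.t = [int(self._rng.randint(0, self.T)) for _ in range(self.n_masks)]
        self.t_0 = [int(self._rng.randint(0, max(1, n_frames - w))) for w in self.t]

    def apply(self, spec):
        """Zero the pre-sampled bands of a (n_bins, n_frames) array, in place."""
        for w, start in zip(self.f, self.f_0):
            spec[start:start + w, :] = 0.0
        for w, start in zip(self.t, self.t_0):
            spec[:, start:start + w] = 0.0
        return spec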

@@ -215,7 +215,21 @@ class SpeechCollator():
return self._local_data.tar2object[tarpath].extractfile(
self._local_data.tar2info[tarpath][filename])
def process_utterance(self, audio_file, transcript, single=True):
def load_audio(self, audio_file, transcript):
if isinstance(audio_file, str) and audio_file.startswith('tar:'):
speech_segment = SpeechSegment.from_file(
self._subfile_from_tar(audio_file), transcript)
else:
speech_segment = SpeechSegment.from_file(audio_file, transcript)
return speech_segment
def randomize_audio_parameters(self):
self._augmentation_pipeline.randomize_parameters_audio_transform()
def randomize_feature_parameters(self, n_bins, n_frames):
self._augmentation_pipeline.randomize_parameters_feature_transform(n_bins, n_frames)
def process_utterance(self, audio_file, transcript):
"""Load, augment, featurize and normalize for speech data.
:param audio_file: Filepath or file object of audio file.
@@ -226,25 +240,56 @@ class SpeechCollator():
where transcription part could be token ids or text.
:rtype: tuple of (2darray, list)
"""
if isinstance(audio_file, str) and audio_file.startswith('tar:'):
speech_segment = SpeechSegment.from_file(
self._subfile_from_tar(audio_file), transcript)
else:
speech_segment = SpeechSegment.from_file(audio_file, transcript)
speech_segment = self.load_audio(audio_file, transcript)
# audio augment
self._augmentation_pipeline.transform_audio(speech_segment)
# apply audio augment
self._augmentation_pipeline.apply_audio_transform(speech_segment)
# Spectrum transform
specgram, transcript_part = self._speech_featurizer.featurize(
speech_segment, self._keep_transcription_text)
if self._normalizer:
specgram = self._normalizer.apply(specgram)
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
# specgram augmentation now happens per batch in __call__ (after padding),
# via apply_feature_transform, so it is intentionally skipped here
return specgram, transcript_part
def __call__(self, batch):
"""batch examples
@@ -269,10 +314,11 @@ class SpeechCollator():
resample=True
self.randomize_audio_parameters()
for utt, audio, text in batch:
audio, text = self.process_utterance(audio, text, single=resample)
# resample=False
if not self.config.randomize_each_batch:
self.randomize_audio_parameters()
audio, text = self.process_utterance(audio, text)
#utt
utts.append(utt)
# audio
@@ -298,6 +344,15 @@ class SpeechCollator():
padded_texts = pad_sequence(
texts, padding_value=IGNORE_ID).astype(np.int64)
text_lens = np.array(text_lens).astype(np.int64)
# spec augment
n_bins = padded_audios[0].shape[0]  # feature dim D of a (D, T) spectrogram
self.randomize_feature_parameters(n_bins, min(audio_lens))
for i in range(len(padded_audios)):
if not self.config.randomize_each_batch:
self.randomize_feature_parameters(n_bins, audio_lens[i])
padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i])
return utts, padded_audios, audio_lens, padded_texts, text_lens
@property
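To see why the shared draw above uses min(audio_lens): masks sampled once per batch must fit the shortest utterance in it. A toy batch loop mirroring __call__, reusing the FreqTimeMasker sketch from earlier (the shapes and the flag value are assumptions):

import numpy as np

padded = np.zeros((4, 80, 200), dtype=np.float32)   # (batch, D, padded T)
lens = np.array([200, 180, 150, 120])               # true frame counts
masker = FreqTimeMasker(seed=1)                     # from the sketch above

randomize_each_batch = True                         # mirrors config.randomize_each_batch
masker.randomize_parameters(80, int(lens.min()))    # shared draw fits the shortest
for i in range(len(padded)):
    if not randomize_each_batch:                    # else: re-draw per utterance
        masker.randomize_parameters(80, int(lens[i]))
    padded[i] = masker.apply(padded[i])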
