From d59b8ca97e36a41ecda84e2079fa0bad585230dc Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Mon, 22 May 2017 21:00:37 +0800
Subject: [PATCH 01/55] Add deep_speech_2 folder.

---
 README.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 00000000..a0990367
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+TBD

From 3fc94427db7395a1b7f9ab1013ca32218830a101 Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Thu, 25 May 2017 01:17:18 +0800
Subject: [PATCH 02/55] Add librispeech dataset, audio data provider and
 simplified DeepSpeech2 model configuration. A bug exists when running
 training.

---
 README.md           |   8 +-
 audio_data_utils.py | 159 +++++++++++++++++++++++++++++++++++++
 eng_vocab.txt       |  28 +++++++
 librispeech.py      |  97 +++++++++++++++++++++++
 requirements.sh     |   5 ++
 train.py            | 188 ++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 484 insertions(+), 1 deletion(-)
 create mode 100644 audio_data_utils.py
 create mode 100644 eng_vocab.txt
 create mode 100644 librispeech.py
 create mode 100644 requirements.sh
 create mode 100644 train.py

diff --git a/README.md b/README.md
index a0990367..fcadf568 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,7 @@
-TBD
+# Deep Speech 2 on PaddlePaddle
+
+```
+sh requirements.sh
+python librispeech.py
+python train.py
+```
diff --git a/audio_data_utils.py b/audio_data_utils.py
new file mode 100644
index 00000000..2f7bfcf7
--- /dev/null
+++ b/audio_data_utils.py
@@ -0,0 +1,159 @@
+import paddle.v2 as paddle
+import logging
+import json
+import random
+import soundfile
+import numpy as np
+import os
+
+# TODO: add z-score normalization.
+
+ENGLISH_CHAR_VOCAB_FILEPATH = "eng_vocab.txt"
+
+logger = logging.getLogger(__name__)
+
+
+def spectrogram_from_file(filename,
+                          stride_ms=10,
+                          window_ms=20,
+                          max_freq=None,
+                          eps=1e-14):
+    """
+    Calculate the log of the linear spectrogram from FFT energy.
+    Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
+    """
+    audio, sample_rate = soundfile.read(filename)
+    if audio.ndim >= 2:
+        audio = np.mean(audio, 1)
+    if max_freq is None:
+        max_freq = sample_rate / 2
+    if max_freq > sample_rate / 2:
+        raise ValueError("max_freq must not be greater than half of "
+                         "the sample rate.")
+    if stride_ms > window_ms:
+        raise ValueError("Stride size must not be greater than window size.")
+    stride_size = int(0.001 * sample_rate * stride_ms)
+    window_size = int(0.001 * sample_rate * window_ms)
+    spectrogram, freqs = extract_spectrogram(
+        audio,
+        window_size=window_size,
+        stride_size=stride_size,
+        sample_rate=sample_rate)
+    ind = np.where(freqs <= max_freq)[0][-1] + 1
+    return np.log(spectrogram[:ind, :] + eps)
+
+
+def extract_spectrogram(samples, window_size, stride_size, sample_rate):
+    """
+    Compute the spectrogram for a real discrete signal.
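+    The samples are sliced into overlapping, Hann-weighted windows, and the
+    squared magnitude of an rFFT is taken over each window.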
+ Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech + """ + # extract strided windows + truncate_size = (len(samples) - window_size) % stride_size + samples = samples[:len(samples) - truncate_size] + nshape = (window_size, (len(samples) - window_size) // stride_size + 1) + nstrides = (samples.strides[0], samples.strides[0] * stride_size) + windows = np.lib.stride_tricks.as_strided( + samples, shape=nshape, strides=nstrides) + assert np.all( + windows[:, 1] == samples[stride_size:(stride_size + window_size)]) + # window weighting, compute squared Fast Fourier Transform (fft), scaling + weighting = np.hanning(window_size)[:, None] + fft = np.fft.rfft(windows * weighting, axis=0) + fft = np.absolute(fft)**2 + scale = np.sum(weighting**2) * sample_rate + fft[1:-1, :] *= (2.0 / scale) + fft[(0, -1), :] /= scale + # prepare fft frequency list + freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) + return fft, freqs + + +def vocabulary_from_file(vocabulary_path): + """ + Load vocabulary from file. + """ + if os.path.exists(vocabulary_path): + vocab_lines = [] + with open(vocabulary_path, 'r') as file: + vocab_lines.extend(file.readlines()) + vocab_list = [line[:-1] for line in vocab_lines] + vocab_dict = dict( + [(token, id) for (id, token) in enumerate(vocab_list)]) + return vocab_dict, vocab_list + else: + raise ValueError("Vocabulary file %s not found.", vocabulary_path) + + +def get_vocabulary_size(): + vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH) + return len(vocab_dict) + + +def parse_transcript(text, vocabulary): + """ + Convert the transcript text string to list of token index integers.. + """ + return [vocabulary[w] for w in text] + + +def reader_creator(manifest_path, + sort_by_duration=True, + shuffle=False, + max_duration=10.0, + min_duration=0.0): + if sort_by_duration and shuffle: + sort_by_duration = False + logger.warn("When shuffle set to true, " + "sort_by_duration is forced to set False.") + vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH) + + def reader(): + # read manifest + manifest_data = [] + for json_line in open(manifest_path): + try: + json_data = json.loads(json_line) + except Exception as e: + raise ValueError("Error reading manifest: %s" % str(e)) + if (json_data["duration"] <= max_duration and + json_data["duration"] >= min_duration): + manifest_data.append(json_data) + # sort (by duration) or shuffle manifest + if sort_by_duration: + manifest_data.sort(key=lambda x: x["duration"]) + if shuffle: + random.shuffle(manifest_data) + # extract spectrogram feature + for instance in manifest_data: + spectrogram = spectrogram_from_file(instance["audio_filepath"]) + text = parse_transcript(instance["text"], vocab_dict) + yield (spectrogram, text) + + return reader + + +def padding_batch_reader(batch_reader, padding=[-1, -1], flatten=True): + def padding_batch(batch): + new_batch = [] + # get target shape within batch + nshape_list = [padding] + for audio, text in batch: + nshape_list.append(audio.shape) + target_shape = np.array(nshape_list).max(axis=0) + # padding + for audio, text in batch: + pad_shape = target_shape - audio.shape + assert np.all(pad_shape >= 0) + padded_audio = np.pad( + audio, [(0, pad_shape[0]), (0, pad_shape[1])], mode="constant") + if flatten: + padded_audio = padded_audio.flatten() + new_batch.append((padded_audio, text)) + return new_batch + + def new_batch_reader(): + for batch in batch_reader(): + yield padding_batch(batch) + + return new_batch_reader diff --git 
a/eng_vocab.txt b/eng_vocab.txt new file mode 100644 index 00000000..8268f3f3 --- /dev/null +++ b/eng_vocab.txt @@ -0,0 +1,28 @@ +' + +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z diff --git a/librispeech.py b/librispeech.py new file mode 100644 index 00000000..fc7b9822 --- /dev/null +++ b/librispeech.py @@ -0,0 +1,97 @@ +import paddle.v2 as paddle +import os +import wget +import tarfile +import argparse +import soundfile +import json + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_TEST = "http://www.openslr.org/resources/12/test-clean.tar.gz" +URL_DEV = "http://www.openslr.org/resources/12/dev-clean.tar.gz" +URL_TRAIN = "http://www.openslr.org/resources/12/train-clean-100.tar.gz" + +parser = argparse.ArgumentParser( + description='Downloads and prepare LibriSpeech dataset.') +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/Libri", + type=str, + help="Directory to save the dataset.") +parser.add_argument( + "--manifest", + default="./libri.manifest", + type=str, + help="Filepath prefix of output manifests.") +args = parser.parse_args() + + +def download(url, target_dir): + if not os.path.exists(target_dir): + os.makedirs(target_dir) + filepath = os.path.join(target_dir, url.split("/")[-1]) + if not os.path.exists(filepath): + print("Downloading %s ..." % url) + wget.download(url, target_dir) + print("") + return filepath + + +def unpack(filepath, target_dir): + print("Unpacking %s ..." % filepath) + tar = tarfile.open(filepath) + tar.extractall(target_dir) + tar.close() + return target_dir + + +def create_manifest(data_dir, manifest_path): + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + for subfolder, _, filelist in os.walk(data_dir): + text_filelist = [ + filename for filename in filelist if filename.endswith('trans.txt') + ] + if len(text_filelist) > 0: + text_filepath = os.path.join(data_dir, subfolder, text_filelist[0]) + for line in open(text_filepath): + segments = line.strip().split() + text = ' '.join(segments[1:]).lower() + audio_filepath = os.path.join(data_dir, subfolder, + segments[0] + '.flac') + audio_data, samplerate = soundfile.read(audio_filepath) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': audio_filepath, + 'duration': duration, + 'text': text + })) + with open(manifest_path, 'w') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_dataset(url, target_dir, manifest_path): + filepath = download(url, target_dir) + unpacked_dir = unpack(filepath, target_dir) + create_manifest(unpacked_dir, manifest_path) + + +def main(): + prepare_dataset( + url=URL_TEST, + target_dir=os.path.join(args.target_dir), + manifest_path=args.manifest + ".test") + prepare_dataset( + url=URL_DEV, + target_dir=os.path.join(args.target_dir), + manifest_path=args.manifest + ".dev") + #prepare_dataset(url=URL_TRAIN, +#target_dir=os.path.join(args.target_dir), +#manifest_path=args.manifest + ".train") + + +if __name__ == '__main__': + main() diff --git a/requirements.sh b/requirements.sh new file mode 100644 index 00000000..7a089169 --- /dev/null +++ b/requirements.sh @@ -0,0 +1,5 @@ +pip install wget +pip install soundfile + +# For Linux only +apt-get install libsndfile1 diff --git a/train.py b/train.py new file mode 100644 index 00000000..083a718d --- /dev/null +++ b/train.py @@ -0,0 +1,188 @@ +import paddle.v2 as paddle +import audio_data_utils +import argparse + +parser = argparse.ArgumentParser( + 
description='Simpled version of DeepSpeech2 trainer.') +parser.add_argument( + "--batch_size", default=512, type=int, help="Minibatch size.") +parser.add_argument("--trainer", default=1, type=int, help="Trainer number.") +parser.add_argument( + "--num_passes", default=20, type=int, help="Training pass number.") +args = parser.parse_args() + + +def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, + padding, act): + conv_layer = paddle.layer.img_conv( + input=input, + filter_size=filter_size, + num_channels=num_channels_in, + num_filters=num_channels_out, + stride=stride, + padding=padding, + act=paddle.activation.Linear(), + bias_attr=False) + return paddle.layer.batch_norm(input=conv_layer, act=act) + + +def bidirectonal_simple_rnn_bn_layer(name, input, size, act): + def __simple_rnn_step__(input): + last_state = paddle.layer.memory(name=name + "_state", size=size) + input_fc = paddle.layer.fc( + input=input, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + input_fc_bn = paddle.layer.batch_norm( + input=input_fc, act=paddle.activation.Linear()) + state_fc = paddle.layer.fc( + input=last_state, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + return paddle.layer.addto( + name=name + "_state", input=[input_fc_bn, state_fc], act=act) + + forward = paddle.layer.recurrent_group( + step=__simple_rnn_step__, input=input) + return forward + # argument reverse is not exposed in V2 recurrent_group + #backward = paddle.layer.recurrent_group( + + +#step=__simple_rnn_step__, +#input=input, +#reverse=True) +#return paddle.layer.concat(input=[forward, backward]) + + +def conv_group(input): + conv1 = conv_bn_layer( + input=input, + filter_size=(11, 41), + num_channels_in=1, + num_channels_out=32, + stride=(3, 2), + padding=(5, 20), + act=paddle.activation.BRelu()) + conv2 = conv_bn_layer( + input=conv1, + filter_size=(11, 21), + num_channels_in=32, + num_channels_out=32, + stride=(1, 2), + padding=(5, 10), + act=paddle.activation.BRelu()) + conv3 = conv_bn_layer( + input=conv2, + filter_size=(11, 21), + num_channels_in=32, + num_channels_out=32, + stride=(1, 2), + padding=(5, 10), + act=paddle.activation.BRelu()) + return conv3 + + +def rnn_group(input, size, num_stacks): + output = input + for i in xrange(num_stacks): + output = bidirectonal_simple_rnn_bn_layer( + name=str(i), input=output, size=size, act=paddle.activation.BRelu()) + return output + + +def deep_speech2(audio_data, text_data, dict_size): + conv_group_output = conv_group(input=audio_data) + conv2seq = paddle.layer.block_expand( + input=conv_group_output, + num_channels=32, + stride_x=1, + stride_y=1, + block_x=1, + block_y=21) + rnn_group_output = rnn_group(input=conv2seq, size=256, num_stacks=5) + fc = paddle.layer.fc( + input=rnn_group_output, + size=dict_size + 1, + act=paddle.activation.Linear(), + bias_attr=True) + cost = paddle.layer.warp_ctc( + input=fc, + label=text_data, + size=dict_size + 1, + blank=dict_size, + norm_by_times=True) + return cost + + +def train(): + # create network config + dict_size = audio_data_utils.get_vocabulary_size() + audio_data = paddle.layer.data( + name="audio_spectrogram", + height=161, + width=1000, + type=paddle.data_type.dense_vector(161000)) + text_data = paddle.layer.data( + name="transcript_text", + type=paddle.data_type.integer_value_sequence(dict_size)) + cost = deep_speech2(audio_data, text_data, dict_size) + + # create parameters and optimizer + parameters = paddle.parameters.create(cost) + optimizer = paddle.optimizer.Adam( + 
+        learning_rate=5e-5,
+        gradient_clipping_threshold=5,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+    return
+
+    # create data readers
+    feeding = {
+        "audio_spectrogram": 0,
+        "transcript_text": 1,
+    }
+    train_batch_reader = audio_data_utils.padding_batch_reader(
+        paddle.batch(
+            audio_data_utils.reader_creator("./libri.manifest.dev"),
+            batch_size=args.batch_size // args.trainer),
+        padding=[-1, 1000])
+    test_batch_reader = audio_data_utils.padding_batch_reader(
+        paddle.batch(
+            audio_data_utils.reader_creator("./libri.manifest.test"),
+            batch_size=args.batch_size // args.trainer),
+        padding=[-1, 1000])
+
+    # create event handler
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 10 == 0:
+                print "Pass: %d, Batch: %d, TrainCost: %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=test_batch_reader, feeding=feeding)
+            print "Pass: %d, TestCost: %f, %s" % (event.pass_id, event.cost,
+                                                  result.metrics)
+            with gzip.open("params.tar.gz", 'w') as f:
+                parameters.to_tar(f)
+
+    # run train
+    trainer.train(
+        reader=train_batch_reader,
+        event_handler=event_handler,
+        num_passes=10,
+        feeding=feeding)
+
+
+def main():
+    train()
+
+
+if __name__ == '__main__':
+    main()

From 70a343a4991c13120589e7419fa5c3c8551c190d Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Thu, 25 May 2017 16:10:23 +0800
Subject: [PATCH 03/55] Add inference and add SortaGrad for only the first
 pass.

---
 README.md           |   2 +
 audio_data_utils.py |   4 ++
 infer.py            |  94 +++++++++++++++++++++++
 librispeech.py      |   2 +-
 model.py            | 106 ++++++++++++++++++++++++++
 requirements.sh     |   2 +-
 train.py            | 152 ++++++++++++--------------------------------
 7 files changed, 248 insertions(+), 114 deletions(-)
 create mode 100644 infer.py
 create mode 100644 model.py

diff --git a/README.md b/README.md
index fcadf568..1f7e0384 100644
--- a/README.md
+++ b/README.md
@@ -5,3 +5,5 @@ sh requirements.sh
 python librispeech.py
 python train.py
 ```
+
+Please add warp-ctc library path (usually $PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib) to LD_LIBRARY_PATH.
diff --git a/audio_data_utils.py b/audio_data_utils.py
index 2f7bfcf7..6dedfbf9 100644
--- a/audio_data_utils.py
+++ b/audio_data_utils.py
@@ -90,6 +90,10 @@ def get_vocabulary_size():
     return len(vocab_dict)
 
 
+def get_vocabulary():
+    return vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
+
+
 def parse_transcript(text, vocabulary):
     """
     Convert the transcript text string to list of token index integers..
diff --git a/infer.py b/infer.py
new file mode 100644
index 00000000..7b16c838
--- /dev/null
+++ b/infer.py
@@ -0,0 +1,94 @@
+import paddle.v2 as paddle
+import audio_data_utils
+import argparse
+from model import deep_speech2
+import gzip
+from itertools import groupby
+
+parser = argparse.ArgumentParser(
+    description='Simpled version of DeepSpeech2 inference.')
+parser.add_argument(
+    "--num_samples", default=10, type=int, help="Number of inference samples.")
+parser.add_argument(
+    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
+parser.add_argument(
+    "--num_rnn_layers", default=3, type=int, help="RNN layer number.")
+parser.add_argument(
+    "--rnn_layer_size", default=512, type=int, help="RNN layer cell number.")
+parser.add_argument(
+    "--use_gpu", default=True, type=bool, help="Use gpu or not.")
+args = parser.parse_args()
+
+
+def remove_duplicate_and_blank(id_list, blank_id):
+    # remove consecutive duplicate tokens
+    id_list = [x[0] for x in groupby(id_list)]
+    # remove blank
+    return [id for id in id_list if id != blank_id]
+
+
+def max_infer():
+    # create network config
+    _, vocab_list = audio_data_utils.get_vocabulary()
+    dict_size = len(vocab_list)
+    audio_data = paddle.layer.data(
+        name="audio_spectrogram",
+        height=161,
+        width=1000,
+        type=paddle.data_type.dense_vector(161000))
+    text_data = paddle.layer.data(
+        name="transcript_text",
+        type=paddle.data_type.integer_value_sequence(dict_size))
+    _, max_id = deep_speech2(
+        audio_data=audio_data,
+        text_data=text_data,
+        dict_size=dict_size,
+        num_conv_layers=args.num_conv_layers,
+        num_rnn_layers=args.num_rnn_layers,
+        rnn_size=args.rnn_layer_size)
+
+    # load parameters
+    parameters = paddle.parameters.Parameters.from_tar(
+        gzip.open("params.tar.gz"))
+
+    # prepare infer data
+    feeding = {
+        "audio_spectrogram": 0,
+        "transcript_text": 1,
+    }
+    test_batch_reader = audio_data_utils.padding_batch_reader(
+        paddle.batch(
+            audio_data_utils.reader_creator(
+                manifest_path="./libri.manifest.test", sort_by_duration=False),
+            batch_size=args.num_samples),
+        padding=[-1, 1000])
+    infer_data = test_batch_reader().next()
+
+    # run inference
+    max_id_results = paddle.infer(
+        output_layer=max_id,
+        parameters=parameters,
+        input=infer_data,
+        field=['id'])
+
+    # postprocess
+    instance_length = len(max_id_results) / args.num_samples
+    instance_list = [
+        max_id_results[i:i + instance_length]
+        for i in xrange(0, len(max_id_results), instance_length)
+    ]
+    for i, instance in enumerate(instance_list):
+        id_list = remove_duplicate_and_blank(instance, dict_size)
+        output_transcript = ''.join([vocab_list[id] for id in id_list])
+        target_transcript = ''.join([vocab_list[id] for id in infer_data[i][1]])
+        print("Target Transcript: %s \nOutput Transcript: %s \n" %
+              (target_transcript, output_transcript))
+
+
+def main():
+    paddle.init(use_gpu=args.use_gpu, trainer_count=1)
+    max_infer()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/librispeech.py b/librispeech.py
index fc7b9822..0d82e19f 100644
--- a/librispeech.py
+++ b/librispeech.py
@@ -23,7 +23,7 @@ parser.add_argument(
     "--manifest",
     default="./libri.manifest",
     type=str,
-    help="Filepath prefix of output manifests.")
+    help="Filepath prefix for output manifests.")
 args = parser.parse_args()
 
 
diff --git a/model.py b/model.py
new file mode 100644
index 00000000..67bee5f7
--- /dev/null
+++ b/model.py
@@ -0,0 +1,106 @@
+import paddle.v2 as paddle
+
+
+def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
+                  padding, act):
+    conv_layer =
paddle.layer.img_conv( + input=input, + filter_size=filter_size, + num_channels=num_channels_in, + num_filters=num_channels_out, + stride=stride, + padding=padding, + act=paddle.activation.Linear(), + bias_attr=False) + return paddle.layer.batch_norm(input=conv_layer, act=act) + + +def bidirectonal_simple_rnn_bn_layer(name, input, size, act): + def __simple_rnn_step__(input): + last_state = paddle.layer.memory(name=name + "_state", size=size) + input_fc = paddle.layer.fc( + input=input, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + input_fc_bn = paddle.layer.batch_norm( + input=input_fc, act=paddle.activation.Linear()) + state_fc = paddle.layer.fc( + input=last_state, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + return paddle.layer.addto( + name=name + "_state", input=[input_fc_bn, state_fc], act=act) + + forward = paddle.layer.recurrent_group( + step=__simple_rnn_step__, input=input) + return forward + # argument reverse is not exposed in V2 recurrent_group + #backward = paddle.layer.recurrent_group( + + +#step=__simple_rnn_step__, +#input=input, +#reverse=True) +#return paddle.layer.concat(input=[forward, backward]) + + +def conv_group(input, num_stacks): + conv = conv_bn_layer( + input=input, + filter_size=(11, 41), + num_channels_in=1, + num_channels_out=32, + stride=(3, 2), + padding=(5, 20), + act=paddle.activation.BRelu()) + for i in xrange(num_stacks - 1): + conv = conv_bn_layer( + input=conv, + filter_size=(11, 21), + num_channels_in=32, + num_channels_out=32, + stride=(1, 2), + padding=(5, 10), + act=paddle.activation.BRelu()) + return conv + + +def rnn_group(input, size, num_stacks): + output = input + for i in xrange(num_stacks): + output = bidirectonal_simple_rnn_bn_layer( + name=str(i), input=output, size=size, act=paddle.activation.BRelu()) + return output + + +def deep_speech2(audio_data, + text_data, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=256): + conv_group_output = conv_group(input=audio_data, num_stacks=num_conv_layers) + conv2seq = paddle.layer.block_expand( + input=conv_group_output, + num_channels=32, + stride_x=1, + stride_y=1, + block_x=1, + block_y=21) + rnn_group_output = rnn_group( + input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) + fc = paddle.layer.fc( + input=rnn_group_output, + size=dict_size + 1, + act=paddle.activation.Linear(), + bias_attr=True) + cost = paddle.layer.warp_ctc( + input=fc, + label=text_data, + size=dict_size + 1, + blank=dict_size, + norm_by_times=True) + max_id = paddle.layer.max_id(input=fc) + return cost, max_id diff --git a/requirements.sh b/requirements.sh index 7a089169..bb1f261d 100644 --- a/requirements.sh +++ b/requirements.sh @@ -1,5 +1,5 @@ pip install wget pip install soundfile -# For Linux only +# For Ubuntu only apt-get install libsndfile1 diff --git a/train.py b/train.py index 083a718d..64be4033 100644 --- a/train.py +++ b/train.py @@ -1,6 +1,8 @@ import paddle.v2 as paddle import audio_data_utils import argparse +from model import deep_speech2 +import gzip parser = argparse.ArgumentParser( description='Simpled version of DeepSpeech2 trainer.') @@ -9,114 +11,19 @@ parser.add_argument( parser.add_argument("--trainer", default=1, type=int, help="Trainer number.") parser.add_argument( "--num_passes", default=20, type=int, help="Training pass number.") +parser.add_argument( + "--num_conv_layers", default=2, type=int, help="Convolution layer number.") +parser.add_argument( + "--num_rnn_layers", default=3, type=int, help="RNN layer number.") 
+parser.add_argument( + "--rnn_layer_size", default=256, type=int, help="RNN layer cell number.") +parser.add_argument( + "--use_gpu", default=True, type=bool, help="Use gpu or not.") +parser.add_argument( + "--trainer_count", default=8, type=int, help="Trainer number.") args = parser.parse_args() -def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, - padding, act): - conv_layer = paddle.layer.img_conv( - input=input, - filter_size=filter_size, - num_channels=num_channels_in, - num_filters=num_channels_out, - stride=stride, - padding=padding, - act=paddle.activation.Linear(), - bias_attr=False) - return paddle.layer.batch_norm(input=conv_layer, act=act) - - -def bidirectonal_simple_rnn_bn_layer(name, input, size, act): - def __simple_rnn_step__(input): - last_state = paddle.layer.memory(name=name + "_state", size=size) - input_fc = paddle.layer.fc( - input=input, - size=size, - act=paddle.activation.Linear(), - bias_attr=False) - input_fc_bn = paddle.layer.batch_norm( - input=input_fc, act=paddle.activation.Linear()) - state_fc = paddle.layer.fc( - input=last_state, - size=size, - act=paddle.activation.Linear(), - bias_attr=False) - return paddle.layer.addto( - name=name + "_state", input=[input_fc_bn, state_fc], act=act) - - forward = paddle.layer.recurrent_group( - step=__simple_rnn_step__, input=input) - return forward - # argument reverse is not exposed in V2 recurrent_group - #backward = paddle.layer.recurrent_group( - - -#step=__simple_rnn_step__, -#input=input, -#reverse=True) -#return paddle.layer.concat(input=[forward, backward]) - - -def conv_group(input): - conv1 = conv_bn_layer( - input=input, - filter_size=(11, 41), - num_channels_in=1, - num_channels_out=32, - stride=(3, 2), - padding=(5, 20), - act=paddle.activation.BRelu()) - conv2 = conv_bn_layer( - input=conv1, - filter_size=(11, 21), - num_channels_in=32, - num_channels_out=32, - stride=(1, 2), - padding=(5, 10), - act=paddle.activation.BRelu()) - conv3 = conv_bn_layer( - input=conv2, - filter_size=(11, 21), - num_channels_in=32, - num_channels_out=32, - stride=(1, 2), - padding=(5, 10), - act=paddle.activation.BRelu()) - return conv3 - - -def rnn_group(input, size, num_stacks): - output = input - for i in xrange(num_stacks): - output = bidirectonal_simple_rnn_bn_layer( - name=str(i), input=output, size=size, act=paddle.activation.BRelu()) - return output - - -def deep_speech2(audio_data, text_data, dict_size): - conv_group_output = conv_group(input=audio_data) - conv2seq = paddle.layer.block_expand( - input=conv_group_output, - num_channels=32, - stride_x=1, - stride_y=1, - block_x=1, - block_y=21) - rnn_group_output = rnn_group(input=conv2seq, size=256, num_stacks=5) - fc = paddle.layer.fc( - input=rnn_group_output, - size=dict_size + 1, - act=paddle.activation.Linear(), - bias_attr=True) - cost = paddle.layer.warp_ctc( - input=fc, - label=text_data, - size=dict_size + 1, - blank=dict_size, - norm_by_times=True) - return cost - - def train(): # create network config dict_size = audio_data_utils.get_vocabulary_size() @@ -128,7 +35,13 @@ def train(): text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(dict_size)) - cost = deep_speech2(audio_data, text_data, dict_size) + cost, _ = deep_speech2( + audio_data=audio_data, + text_data=text_data, + dict_size=dict_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_size=args.rnn_layer_size) # create parameters and optimizer parameters = 
paddle.parameters.create(cost) @@ -138,21 +51,30 @@ def train(): regularization=paddle.optimizer.L2Regularization(rate=8e-4)) trainer = paddle.trainer.SGD( cost=cost, parameters=parameters, update_equation=optimizer) - return # create data readers feeding = { "audio_spectrogram": 0, "transcript_text": 1, } - train_batch_reader = audio_data_utils.padding_batch_reader( + train_batch_reader_with_sortagrad = audio_data_utils.padding_batch_reader( paddle.batch( - audio_data_utils.reader_creator("./libri.manifest.dev"), + audio_data_utils.reader_creator( + manifest_path="./libri.manifest.dev", sort_by_duration=True), + batch_size=args.batch_size // args.trainer), + padding=[-1, 1000]) + train_batch_reader_without_sortagrad = audio_data_utils.padding_batch_reader( + paddle.batch( + audio_data_utils.reader_creator( + manifest_path="./libri.manifest.dev", + sort_by_duration=False, + shuffle=True), batch_size=args.batch_size // args.trainer), padding=[-1, 1000]) test_batch_reader = audio_data_utils.padding_batch_reader( paddle.batch( - audio_data_utils.reader_creator("./libri.manifest.test"), + audio_data_utils.reader_creator( + manifest_path="./libri.manifest.test", sort_by_duration=False), batch_size=args.batch_size // args.trainer), padding=[-1, 1000]) @@ -174,13 +96,19 @@ def train(): # run train trainer.train( - reader=train_batch_reader, + reader=train_batch_reader_with_sortagrad, + event_handler=event_handler, + num_passes=1, + feeding=feeding) + trainer.train( + reader=train_batch_reader_without_sortagrad, event_handler=event_handler, - num_passes=10, + num_passes=self.num_passes - 1, feeding=feeding) def main(): + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) train() From 0babc5c4d73a3fab976a46d49e473c556e946f7a Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 25 May 2017 23:36:06 +0800 Subject: [PATCH 04/55] Add function docs. --- audio_data_utils.py | 56 +++++++++++++++++++++++++++++++++++++++++++-- infer.py | 29 +++++++++++++++++------ librispeech.py | 15 +++++++++--- model.py | 49 +++++++++++++++++++++++++++++++++++++++ train.py | 23 +++++++++++++------ 5 files changed, 153 insertions(+), 19 deletions(-) diff --git a/audio_data_utils.py b/audio_data_utils.py index 6dedfbf9..a3a397e9 100644 --- a/audio_data_utils.py +++ b/audio_data_utils.py @@ -1,3 +1,6 @@ +""" + Audio data preprocessing tools and reader creators. +""" import paddle.v2 as paddle import logging import json @@ -86,18 +89,24 @@ def vocabulary_from_file(vocabulary_path): def get_vocabulary_size(): + """ + Get vocabulary size. + """ vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH) return len(vocab_dict) def get_vocabulary(): + """ + Get vocabulary. + """ return vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH) def parse_transcript(text, vocabulary): """ - Convert the transcript text string to list of token index integers.. - """ + Convert the transcript text string to list of token index integers. + """ return [vocabulary[w] for w in text] @@ -106,6 +115,28 @@ def reader_creator(manifest_path, shuffle=False, max_duration=10.0, min_duration=0.0): + """ + Audio data reader creator. + + Instance: a tuple of a numpy ndarray of audio spectrogram and a list of + tokenized transcription text. + + :param manifest_path: Filepath for Manifest of audio clip files. + :type manifest_path: basestring + :param sort_by_duration: Sort the audio clips by duration if set True. + For SortaGrad. + :type sort_by_duration: bool + :param shuffle: Shuffle the audio clips if set True. 
+    :type shuffle: bool
+    :param max_duration: Audio clips with duration (in seconds) greater than
+                         this will be discarded.
+    :type max_duration: float
+    :param min_duration: Audio clips with duration (in seconds) smaller than
+                         this will be discarded.
+    :type min_duration: float
+    :return: Data reader function.
+    :rtype: callable
+    """
     if sort_by_duration and shuffle:
         sort_by_duration = False
         logger.warn("When shuffle set to true, "
@@ -138,6 +169,27 @@ def reader_creator(manifest_path,
 
 
 def padding_batch_reader(batch_reader, padding=[-1, -1], flatten=True):
+    """
+    Padding for batches. Return a batch reader.
+
+    Each instance in a batch will be padded to be of a same target shape.
+    The target shape is the largest shape among all the batch instances and
+    'padding' argument. Therefore, if padding is set [-1, -1], instance will be
+    padded to have the same shape just within each batch and the shape will
+    be different across batches; if padding is set
+    [VERY_LARGE_NUM, VERY_LARGE_NUM], instances in all batches will be padded to
+    have the same shape of [VERY_LARGE_NUM, VERY_LARGE_NUM].
+
+    :param batch_reader: Input batch reader.
+    :type batch_reader: callable
+    :param padding: Padding pattern. Details please refer to the above.
+    :type padding: list
+    :param flatten: Flatten the tensor to be one dimension.
+    :type flatten: bool
+    :return: Batch reader function.
+    :rtype: callable
+    """
+
     def padding_batch(batch):
         new_batch = []
         # get target shape within batch
diff --git a/infer.py b/infer.py
index 7b16c838..1f13956e 100644
--- a/infer.py
+++ b/infer.py
@@ -1,14 +1,21 @@
+"""
+    Inference for a simplified version of Baidu DeepSpeech2 model.
+"""
+
 import paddle.v2 as paddle
-import audio_data_utils
+from itertools import groupby
 import argparse
-from model import deep_speech2
 import gzip
-from itertools import groupby
+import audio_data_utils
+from model import deep_speech2
 
 parser = argparse.ArgumentParser(
-    description='Simpled version of DeepSpeech2 inference.')
+    description='Simplified version of DeepSpeech2 inference.')
 parser.add_argument(
-    "--num_samples", default=10, type=int, help="Number of inference samples.")
+    "--num_samples",
+    default=10,
+    type=int,
+    help="Number of samples for inference.")
 parser.add_argument(
     "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
 parser.add_argument(
     "--num_rnn_layers", default=3, type=int, help="RNN layer number.")
@@ -21,13 +28,21 @@ args = parser.parse_args()
 
 
 def remove_duplicate_and_blank(id_list, blank_id):
+    """
+    Postprocessing for max-ctc-decoder.
+    - remove consecutive duplicate tokens.
+    - remove blanks.
+    """
     # remove consecutive duplicate tokens
     id_list = [x[0] for x in groupby(id_list)]
-    # remove blank
+    # remove blanks
     return [id for id in id_list if id != blank_id]
 
 
 def max_infer():
+    """
+    Max-ctc-decoding for DeepSpeech2.
+    """
     # create network config
     _, vocab_list = audio_data_utils.get_vocabulary()
     dict_size = len(vocab_list)
@@ -64,7 +79,7 @@ def max_infer():
         padding=[-1, 1000])
     infer_data = test_batch_reader().next()
 
-    # run inference
+    # run max-ctc-decoding
     max_id_results = paddle.infer(
         output_layer=max_id,
         parameters=parameters,
diff --git a/librispeech.py b/librispeech.py
index 0d82e19f..8f82a288 100644
--- a/librispeech.py
+++ b/librispeech.py
@@ -1,3 +1,11 @@
+"""
+    Download, unpack and create manifest for LibriSpeech dataset.
+
+    Manifest is a json file with each line containing one audio clip filepath,
+    its transcription text string, and its duration. It serves as a unified
+    interface to organize different data sets.
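+
+    A sample manifest line (with hypothetical values) looks like:
+
+    {"audio_filepath": "/path/to/LibriSpeech/dev-clean/xx/yy/xx-yy-0000.flac",
+     "duration": 3.52, "text": "some transcription text"}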
+""" + import paddle.v2 as paddle import os import wget @@ -88,9 +96,10 @@ def main(): url=URL_DEV, target_dir=os.path.join(args.target_dir), manifest_path=args.manifest + ".dev") - #prepare_dataset(url=URL_TRAIN, -#target_dir=os.path.join(args.target_dir), -#manifest_path=args.manifest + ".train") + prepare_dataset( + url=URL_TRAIN, + target_dir=os.path.join(args.target_dir), + manifest_path=args.manifest + ".train") if __name__ == '__main__': diff --git a/model.py b/model.py index 67bee5f7..de6357f4 100644 --- a/model.py +++ b/model.py @@ -1,8 +1,17 @@ +""" + A simplifed version of Baidu DeepSpeech2 model. +""" + import paddle.v2 as paddle +#TODO: add bidirectional rnn. + def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, padding, act): + """ + Convolution layer with batch normalization. + """ conv_layer = paddle.layer.img_conv( input=input, filter_size=filter_size, @@ -16,6 +25,15 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, def bidirectonal_simple_rnn_bn_layer(name, input, size, act): + """ + Bidirectonal simple rnn layer with batch normalization. + The batch normalization is only performed on input-state projection + (sequence-wise normalization). + + Question: does mean and variance statistics computed over the whole sequence + or just on each individual time steps? + """ + def __simple_rnn_step__(input): last_state = paddle.layer.memory(name=name + "_state", size=size) input_fc = paddle.layer.fc( @@ -23,6 +41,7 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act): size=size, act=paddle.activation.Linear(), bias_attr=False) + # batch norm is only performed on input-state projection input_fc_bn = paddle.layer.batch_norm( input=input_fc, act=paddle.activation.Linear()) state_fc = paddle.layer.fc( @@ -47,6 +66,9 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act): def conv_group(input, num_stacks): + """ + Convolution group with several stacking convolution layers. + """ conv = conv_bn_layer( input=input, filter_size=(11, 41), @@ -68,6 +90,9 @@ def conv_group(input, num_stacks): def rnn_group(input, size, num_stacks): + """ + RNN group with several stacking RNN layers. + """ output = input for i in xrange(num_stacks): output = bidirectonal_simple_rnn_bn_layer( @@ -81,7 +106,27 @@ def deep_speech2(audio_data, num_conv_layers=2, num_rnn_layers=3, rnn_size=256): + """ + The whole DeepSpeech2 model structure (a simplified version). + + :param audio_data: Audio spectrogram data layer. + :type audio_data: LayerOutput + :param text_data: Transcription text data layer. + :type text_data: LayerOutput + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (number of RNN cells). + :type rnn_size: int + :return: Tuple of the cost layer and the max_id decoder layer. 
+    :rtype: tuple of LayerOutput
+    """
+    # convolution group
     conv_group_output = conv_group(input=audio_data, num_stacks=num_conv_layers)
+    # convert data from convolution feature map to sequence of vectors
     conv2seq = paddle.layer.block_expand(
         input=conv_group_output,
         num_channels=32,
@@ -89,18 +134,22 @@ def deep_speech2(audio_data,
         stride_y=1,
         block_x=1,
         block_y=21)
+    # rnn group
     rnn_group_output = rnn_group(
         input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
+    # output token distribution
     fc = paddle.layer.fc(
         input=rnn_group_output,
         size=dict_size + 1,
         act=paddle.activation.Linear(),
         bias_attr=True)
+    # ctc cost
     cost = paddle.layer.warp_ctc(
         input=fc,
         label=text_data,
         size=dict_size + 1,
         blank=dict_size,
         norm_by_times=True)
+    # max decoder
     max_id = paddle.layer.max_id(input=fc)
     return cost, max_id
diff --git a/train.py b/train.py
index 64be4033..d929297b 100644
--- a/train.py
+++ b/train.py
@@ -1,20 +1,27 @@
+"""
+    Trainer for a simplified version of Baidu DeepSpeech2 model.
+"""
+
 import paddle.v2 as paddle
-import audio_data_utils
 import argparse
-from model import deep_speech2
 import gzip
+import sys
+from model import deep_speech2
+import audio_data_utils
+
+#TODO: add WER metric
 
 parser = argparse.ArgumentParser(
-    description='Simpled version of DeepSpeech2 trainer.')
+    description='Simplified version of DeepSpeech2 trainer.')
 parser.add_argument(
     "--batch_size", default=512, type=int, help="Minibatch size.")
 parser.add_argument("--trainer", default=1, type=int, help="Trainer number.")
 parser.add_argument(
     "--num_passes", default=20, type=int, help="Training pass number.")
 parser.add_argument(
-    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
+    "--num_conv_layers", default=3, type=int, help="Convolution layer number.")
 parser.add_argument(
-    "--num_rnn_layers", default=3, type=int, help="RNN layer number.")
+    "--num_rnn_layers", default=5, type=int, help="RNN layer number.")
 parser.add_argument(
     "--rnn_layer_size", default=256, type=int, help="RNN layer cell number.")
 parser.add_argument(
@@ -25,6 +32,9 @@ args = parser.parse_args()
 
 
 def train():
+    """
+    DeepSpeech2 training.
+    """
     # create network config
     dict_size = audio_data_utils.get_vocabulary_size()
     audio_data = paddle.layer.data(
@@ -89,8 +99,7 @@ def train():
             sys.stdout.flush()
         if isinstance(event, paddle.event.EndPass):
             result = trainer.test(reader=test_batch_reader, feeding=feeding)
-            print "Pass: %d, TestCost: %f, %s" % (event.pass_id, event.cost,
-                                                  result.metrics)
+            print "Pass: %d, TestMetric: %s" % (event.pass_id, result.metrics)
             with gzip.open("params.tar.gz", 'w') as f:
                 parameters.to_tar(f)

From 9c3cd3c704dd079cf00c97d09d7f921c6f20344b Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Fri, 26 May 2017 17:47:24 +0800
Subject: [PATCH 05/55] Update some parameters and comments.
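
This patch raises the learning rate to 5e-4, replaces the small clipping
threshold and L2 regularization with gradient_clipping_threshold=400,
points the readers at the train manifest, and makes SortaGrad optional.
In outline, the pass scheduling in the diff below is:

    if args.use_sortagrad:
        trainer.train(reader=train_batch_reader_with_sortagrad,
                      event_handler=event_handler, num_passes=1,
                      feeding=feeding)
        args.num_passes -= 1
    trainer.train(reader=train_batch_reader_without_sortagrad,
                  event_handler=event_handler, num_passes=args.num_passes,
                  feeding=feeding)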
--- train.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/train.py b/train.py index d929297b..0d7dd816 100644 --- a/train.py +++ b/train.py @@ -26,6 +26,8 @@ parser.add_argument( "--rnn_layer_size", default=256, type=int, help="RNN layer cell number.") parser.add_argument( "--use_gpu", default=True, type=bool, help="Use gpu or not.") +parser.add_argument( + "--use_sortagrad", default=False, type=bool, help="Use sortagrad or not.") parser.add_argument( "--trainer_count", default=8, type=int, help="Trainer number.") args = parser.parse_args() @@ -56,12 +58,9 @@ def train(): # create parameters and optimizer parameters = paddle.parameters.create(cost) optimizer = paddle.optimizer.Adam( - learning_rate=5e-5, - gradient_clipping_threshold=5, - regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + learning_rate=5e-4, gradient_clipping_threshold=400) trainer = paddle.trainer.SGD( cost=cost, parameters=parameters, update_equation=optimizer) - # create data readers feeding = { "audio_spectrogram": 0, @@ -70,13 +69,13 @@ def train(): train_batch_reader_with_sortagrad = audio_data_utils.padding_batch_reader( paddle.batch( audio_data_utils.reader_creator( - manifest_path="./libri.manifest.dev", sort_by_duration=True), + manifest_path="./libri.manifest.train", sort_by_duration=True), batch_size=args.batch_size // args.trainer), padding=[-1, 1000]) train_batch_reader_without_sortagrad = audio_data_utils.padding_batch_reader( paddle.batch( audio_data_utils.reader_creator( - manifest_path="./libri.manifest.dev", + manifest_path="./libri.manifest.train", sort_by_duration=False, shuffle=True), batch_size=args.batch_size // args.trainer), @@ -84,7 +83,7 @@ def train(): test_batch_reader = audio_data_utils.padding_batch_reader( paddle.batch( audio_data_utils.reader_creator( - manifest_path="./libri.manifest.test", sort_by_duration=False), + manifest_path="./libri.manifest.dev", sort_by_duration=False), batch_size=args.batch_size // args.trainer), padding=[-1, 1000]) @@ -92,27 +91,31 @@ def train(): def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 10 == 0: - print "Pass: %d, Batch: %d, TrainCost: %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) + print "/nPass: %d, Batch: %d, TrainCost: %f" % ( + event.pass_id, event.batch_id, event.cost) else: sys.stdout.write('.') sys.stdout.flush() if isinstance(event, paddle.event.EndPass): result = trainer.test(reader=test_batch_reader, feeding=feeding) - print "Pass: %d, TestMetric: %s" % (event.pass_id, result.metrics) + print "Pass: %d, TestCost: %s" % (event.pass_id, result.cost) with gzip.open("params.tar.gz", 'w') as f: parameters.to_tar(f) # run train - trainer.train( - reader=train_batch_reader_with_sortagrad, - event_handler=event_handler, - num_passes=1, - feeding=feeding) + # first pass with sortagrad + if args.use_sortagrad: + trainer.train( + reader=train_batch_reader_with_sortagrad, + event_handler=event_handler, + num_passes=1, + feeding=feeding) + args.num_passes -= 1 + # other passes without sortagrad trainer.train( reader=train_batch_reader_without_sortagrad, event_handler=event_handler, - num_passes=self.num_passes - 1, + num_passes=args.num_passes, feeding=feeding) From e6a349992bd894663d3e9bed107a8543d478b735 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 30 May 2017 20:34:03 +0800 Subject: [PATCH 06/55] Refactor data utils into a class and add feature normalization. 
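
The new DataGenerator samples instances from a normalizer manifest,
collects per-frequency-bin mean and stddev, and z-score normalizes every
spectrogram. In outline (mirroring the methods in the diff below, where
`sampled_spectrograms` stands in for the basic featurization step):

    features = np.hstack(sampled_spectrograms)
    mean = np.mean(features, axis=1).reshape([-1, 1])
    std = np.std(features, axis=1).reshape([-1, 1])
    normalized = (spectrogram - mean) / (std + eps)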
--- audio_data_utils.py | 512 +++++++++++++++++++++++++++++--------------- train.py | 85 ++++---- 2 files changed, 389 insertions(+), 208 deletions(-) diff --git a/audio_data_utils.py b/audio_data_utils.py index a3a397e9..7d09d612 100644 --- a/audio_data_utils.py +++ b/audio_data_utils.py @@ -1,5 +1,6 @@ """ - Audio data preprocessing tools and reader creators. + Providing basic audio data preprocessing pipeline, and offering + both instance-level and batch-level data reader interfaces. """ import paddle.v2 as paddle import logging @@ -9,143 +10,201 @@ import soundfile import numpy as np import os -# TODO: add z-score normalization. - -ENGLISH_CHAR_VOCAB_FILEPATH = "eng_vocab.txt" - +RANDOM_SEED = 0 logger = logging.getLogger(__name__) -def spectrogram_from_file(filename, - stride_ms=10, - window_ms=20, - max_freq=None, - eps=1e-14): - """ - Calculate the log of linear spectrogram from FFT energy - Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech - """ - audio, sample_rate = soundfile.read(filename) - if audio.ndim >= 2: - audio = np.mean(audio, 1) - if max_freq is None: - max_freq = sample_rate / 2 - if max_freq > sample_rate / 2: - raise ValueError("max_freq must be greater than half of " - "sample rate.") - if stride_ms > window_ms: - raise ValueError("Stride size must not be greater than window size.") - stride_size = int(0.001 * sample_rate * stride_ms) - window_size = int(0.001 * sample_rate * window_ms) - spectrogram, freqs = extract_spectrogram( - audio, - window_size=window_size, - stride_size=stride_size, - sample_rate=sample_rate) - ind = np.where(freqs <= max_freq)[0][-1] + 1 - return np.log(spectrogram[:ind, :] + eps) - - -def extract_spectrogram(samples, window_size, stride_size, sample_rate): +class DataGenerator(object): """ - Compute the spectrogram for a real discrete signal. - Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech - """ - # extract strided windows - truncate_size = (len(samples) - window_size) % stride_size - samples = samples[:len(samples) - truncate_size] - nshape = (window_size, (len(samples) - window_size) // stride_size + 1) - nstrides = (samples.strides[0], samples.strides[0] * stride_size) - windows = np.lib.stride_tricks.as_strided( - samples, shape=nshape, strides=nstrides) - assert np.all( - windows[:, 1] == samples[stride_size:(stride_size + window_size)]) - # window weighting, compute squared Fast Fourier Transform (fft), scaling - weighting = np.hanning(window_size)[:, None] - fft = np.fft.rfft(windows * weighting, axis=0) - fft = np.absolute(fft)**2 - scale = np.sum(weighting**2) * sample_rate - fft[1:-1, :] *= (2.0 / scale) - fft[(0, -1), :] /= scale - # prepare fft frequency list - freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) - return fft, freqs - - -def vocabulary_from_file(vocabulary_path): - """ - Load vocabulary from file. + DataGenerator provides basic audio data preprocessing pipeline, and offer + both instance-level and batch-level data reader interfaces. + Normalized FFT are used as audio features here. + + :param vocab_filepath: Vocabulary file path for indexing tokenized + transcriptions. + :type vocab_filepath: basestring + :param normalizer_manifest_path: Manifest filepath for collecting feature + normalization statistics, e.g. mean, std. + :type normalizer_manifest_path: basestring + :param normalizer_num_samples: Number of instances sampled for collecting + feature normalization statistics. + Default is 100. 
+    :type normalizer_num_samples: int
+    :param max_duration: Audio clips with duration (in seconds) greater than
+                         this will be discarded. Default is 20.0.
+    :type max_duration: float
+    :param min_duration: Audio clips with duration (in seconds) smaller than
+                         this will be discarded. Default is 0.0.
+    :type min_duration: float
+    :param stride_ms: Striding size (in milliseconds) for generating frames.
+                      Default is 10.0.
+    :type stride_ms: float
+    :param window_ms: Window size (in milliseconds) for frames. Default is 20.0.
+    :type window_ms: float
+    :param max_frequency: Maximum frequency for FFT features. FFT features of
+                          frequency larger than this will be discarded.
+                          If set None, all features will be kept.
+                          Default is None.
+    :type max_frequency: float
+    """
+
+    def __init__(self,
+                 vocab_filepath,
+                 normalizer_manifest_path,
+                 normalizer_num_samples=100,
+                 max_duration=20.0,
+                 min_duration=0.0,
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 max_frequency=None):
+        self.__max_duration__ = max_duration
+        self.__min_duration__ = min_duration
+        self.__stride_ms__ = stride_ms
+        self.__window_ms__ = window_ms
+        self.__max_frequency__ = max_frequency
+        self.__random__ = random.Random(RANDOM_SEED)
+        # load vocabulary (dictionary)
+        self.__vocab_dict__, self.__vocab_list__ = \
+            self.__load_vocabulary_from_file__(vocab_filepath)
+        # collect normalizer statistics
+        self.__mean__, self.__std__ = self.__collect_normalizer_statistics__(
+            manifest_path=normalizer_manifest_path,
+            num_samples=normalizer_num_samples)
+
+    def __audio_featurize__(self, audio_filename):
+        """
+        Preprocess audio data, including feature extraction, normalization, etc.
+        """
+        features = self.__audio_basic_featurize__(audio_filename)
+        return self.__normalize__(features)
+
+    def __text_featurize__(self, text):
+        """
+        Preprocess text data, including tokenizing and token indexing, etc.
+        """
+        return self.__convert_text_to_char_index__(
+            text=text, vocabulary=self.__vocab_dict__)
+
+    def __audio_basic_featurize__(self, audio_filename):
+        """
+        Compute basic (without normalization etc.) features for audio data.
+        """
+        return self.__spectrogram_from_file__(
+            filename=audio_filename,
+            stride_ms=self.__stride_ms__,
+            window_ms=self.__window_ms__,
+            max_freq=self.__max_frequency__)
+
+    def __collect_normalizer_statistics__(self, manifest_path, num_samples=100):
+        """
+        Compute feature normalization statistics, i.e. mean and stddev.
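+        Statistics are gathered per frequency bin, over spectrograms computed
+        from `num_samples` randomly sampled manifest instances.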
+ """ + # read manifest + manifest = self.__read_manifest__( + manifest_path=manifest_path, + max_duration=self.__max_duration__, + min_duration=self.__min_duration__) + # sample for statistics + sampled_manifest = self.__random__.sample(manifest, num_samples) + # extract spectrogram feature + features = [] + for instance in sampled_manifest: + spectrogram = self.__audio_basic_featurize__( + instance["audio_filepath"]) + features.append(spectrogram) + features = np.hstack(features) + mean = np.mean(features, axis=1).reshape([-1, 1]) + std = np.std(features, axis=1).reshape([-1, 1]) + return mean, std -def parse_transcript(text, vocabulary): - """ - Convert the transcript text string to list of token index integers. - """ - return [vocabulary[w] for w in text] + def __normalize__(self, features, eps=1e-14): + """ + Normalize features to be of zero mean and unit stddev. + """ + return (features - self.__mean__) / (self.__std__ + eps) + def __spectrogram_from_file__(self, + filename, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + eps=1e-14): + """ + Laod audio data and calculate the log of spectrogram by FFT. + Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech + """ + audio, sample_rate = soundfile.read(filename) + if audio.ndim >= 2: + audio = np.mean(audio, 1) + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + stride_size = int(0.001 * sample_rate * stride_ms) + window_size = int(0.001 * sample_rate * window_ms) + spectrogram, freqs = self.__extract_spectrogram__( + audio, + window_size=window_size, + stride_size=stride_size, + sample_rate=sample_rate) + ind = np.where(freqs <= max_freq)[0][-1] + 1 + return np.log(spectrogram[:ind, :] + eps) -def reader_creator(manifest_path, - sort_by_duration=True, - shuffle=False, - max_duration=10.0, - min_duration=0.0): - """ - Audio data reader creator. - - Instance: a tuple of a numpy ndarray of audio spectrogram and a list of - tokenized transcription text. - - :param manifest_path: Filepath for Manifest of audio clip files. - :type manifest_path: basestring - :param sort_by_duration: Sort the audio clips by duration if set True. - For SortaGrad. - :type sort_by_duration: bool - :param shuffle: Shuffle the audio clips if set True. - :type shuffle: bool - :param max_duration: Audio clips with duration (in seconds) greater than - this will be discarded. - :type max_duration: float - :param min_duration: Audio clips with duration (in seconds) smaller than - this will be discarded. - :type min_duration: float - :return: Data reader function. - :rtype: callable - """ - if sort_by_duration and shuffle: - sort_by_duration = False - logger.warn("When shuffle set to true, " - "sort_by_duration is forced to set False.") - vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH) + def __extract_spectrogram__(self, samples, window_size, stride_size, + sample_rate): + """ + Compute the spectrogram by FFT for a discrete real signal. 
+        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
+        """
+        # extract strided windows
+        truncate_size = (len(samples) - window_size) % stride_size
+        samples = samples[:len(samples) - truncate_size]
+        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
+        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
+        windows = np.lib.stride_tricks.as_strided(
+            samples, shape=nshape, strides=nstrides)
+        assert np.all(
+            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
+        # window weighting, squared Fast Fourier Transform (fft), scaling
+        weighting = np.hanning(window_size)[:, None]
+        fft = np.fft.rfft(windows * weighting, axis=0)
+        fft = np.absolute(fft)**2
+        scale = np.sum(weighting**2) * sample_rate
+        fft[1:-1, :] *= (2.0 / scale)
+        fft[(0, -1), :] /= scale
+        # prepare fft frequency list
+        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
+        return fft, freqs
+
+    def __load_vocabulary_from_file__(self, vocabulary_path):
+        """
+        Load vocabulary from file.
+        """
+        if not os.path.exists(vocabulary_path):
+            raise ValueError("Vocabulary file %s not found." % vocabulary_path)
+        vocab_lines = []
+        with open(vocabulary_path, 'r') as file:
+            vocab_lines.extend(file.readlines())
+        vocab_list = [line[:-1] for line in vocab_lines]
+        vocab_dict = dict(
+            [(token, id) for (id, token) in enumerate(vocab_list)])
+        return vocab_dict, vocab_list
+
+    def __convert_text_to_char_index__(self, text, vocabulary):
+        """
+        Convert text string to a list of character index integers.
+        """
+        return [vocabulary[w] for w in text]
+
+    def __read_manifest__(self, manifest_path, max_duration, min_duration):
+        """
+        Load and parse manifest file.
+        """
+        manifest = []
         for json_line in open(manifest_path):
             try:
                 json_data = json.loads(json_line)
             except Exception as e:
                 raise ValueError("Error reading manifest: %s" % str(e))
             if (json_data["duration"] <= max_duration and
                     json_data["duration"] >= min_duration):
-            manifest_data.append(json_data)
-        # sort (by duration) or shuffle manifest
-        if sort_by_duration:
-            manifest_data.sort(key=lambda x: x["duration"])
-        if shuffle:
-            random.shuffle(manifest_data)
-        # extract spectrogram feature
-        for instance in manifest_data:
-            spectrogram = spectrogram_from_file(instance["audio_filepath"])
-            text = parse_transcript(instance["text"], vocab_dict)
-            yield (spectrogram, text)
+                manifest.append(json_data)
+        return manifest
 
-    return reader
 
+    def __padding_batch__(self, batch, padding_to=-1, flatten=False):
+        """
+        Padding audio part of features (only in the time axis -- column axis)
+        with zeros, to make each instance in the batch share the same
+        audio feature shape.
 
-def padding_batch_reader(batch_reader, padding=[-1, -1], flatten=True):
-    """
-    Padding for batches. Return a batch reader.
-
-    Each instance in a batch will be padded to be of a same target shape.
-    The target shape is the largest shape among all the batch instances and
-    'padding' argument.
-    Therefore, if padding is set [-1, -1], instance will be
-    padded to have the same shape just within each batch and the shape will
-    be different across batches; if padding is set
-    [VERY_LARGE_NUM, VERY_LARGE_NUM], instances in all batches will be padded to
-    have the same shape of [VERY_LARGE_NUM, VERY_LARGE_NUM].
-
-    :param batch_reader: Input batch reader.
-    :type batch_reader: callable
-    :param padding: Padding pattern. Details please refer to the above.
-    :type padding: list
-    :param flatten: Flatten the tensor to be one dimension.
-    :type flatten: bool
-    :return: Batch reader function.
-    :rtype: callable
-    """
-
-    def padding_batch(batch):
+        If `padding_to` is set -1, the maximum column numbers in the batch will
+        be used as the target size. Otherwise, `padding_to` will be the target
+        size. Default is -1.
+
+        If `flatten` is set True, audio data will be flattened to be a 1-dim
+        ndarray. Default is False.
+        """
         new_batch = []
-        # get target shape within batch
-        nshape_list = [padding]
-        for audio, text in batch:
-            nshape_list.append(audio.shape)
-        target_shape = np.array(nshape_list).max(axis=0)
+        # get target shape
+        max_length = max([audio.shape[1] for audio, text in batch])
+        if padding_to != -1:
+            if padding_to < max_length:
+                raise ValueError("If padding_to is not -1, it should be greater"
+                                 " or equal to the original instance length.")
+            max_length = padding_to
         # padding
         for audio, text in batch:
-            pad_shape = target_shape - audio.shape
-            assert np.all(pad_shape >= 0)
-            padded_audio = np.pad(
-                audio, [(0, pad_shape[0]), (0, pad_shape[1])], mode="constant")
+            padded_audio = np.zeros([audio.shape[0], max_length])
+            padded_audio[:, :audio.shape[1]] = audio
             if flatten:
                 padded_audio = padded_audio.flatten()
             new_batch.append((padded_audio, text))
         return new_batch
 
-    def new_batch_reader():
-        for batch in batch_reader():
-            yield padding_batch(batch)
+    def instance_reader_creator(self,
+                                manifest_path,
+                                sort_by_duration=True,
+                                shuffle=False):
+        """
+        Instance reader creator for audio data. Create a callable function to
+        produce instances of data.
+
+        Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
+        tokenized and indexed transcription text.
+
+        :param manifest_path: Filepath of manifest for audio clip files.
+        :type manifest_path: basestring
+        :param sort_by_duration: Sort the audio clips by duration if set True
+                                 (for SortaGrad).
+        :type sort_by_duration: bool
+        :param shuffle: Shuffle the audio clips if set True.
+        :type shuffle: bool
+        :return: Data reader function.
+        :rtype: callable
+        """
+        if sort_by_duration and shuffle:
+            sort_by_duration = False
+            logger.warn("When shuffle is set to True, "
+                        "sort_by_duration is forced to False.")
+
+        def reader():
+            # read manifest
+            manifest = self.__read_manifest__(
+                manifest_path=manifest_path,
+                max_duration=self.__max_duration__,
+                min_duration=self.__min_duration__)
+            # sort (by duration) or shuffle manifest
+            if sort_by_duration:
+                manifest.sort(key=lambda x: x["duration"])
+            if shuffle:
+                self.__random__.shuffle(manifest)
+            # extract spectrogram feature
+            for instance in manifest:
+                spectrogram = self.__audio_featurize__(
+                    instance["audio_filepath"])
+                transcript = self.__text_featurize__(instance["text"])
+                yield (spectrogram, transcript)
+
+        return reader
+
+    def batch_reader_creator(self,
+                             manifest_path,
+                             batch_size,
+                             padding_to=-1,
+                             flatten=False,
+                             sort_by_duration=True,
+                             shuffle=False):
+        """
+        Batch data reader creator for audio data. Create a callable function to
+        produce batches of data.
+
+        Audio features will be padded with zeros to make each instance in the
+        batch share the same audio feature shape.
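+
+        For example (hypothetical sizes): with padding_to=-1, a batch of
+        spectrograms with 910, 940 and 1000 time steps is zero-padded so that
+        all three have 1000 columns.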
+ + :param manifest_path: Filepath of manifest for audio clip files. + :type manifest_path: basestring + :param batch_size: Instance number in a batch. + :type batch_size: int + :param padding_to: If set -1, the maximun column numbers in the batch + will be used as the target size for padding. + Otherwise, `padding_to` will be the target size. + Default is -1. + :type padding_to: int + :param flatten: If set True, audio data will be flatten to be a 1-dim + ndarray. Otherwise, 2-dim ndarray. Default is False. + :type flatten: bool + :param sort_by_duration: Sort the audio clips by duration if set True + (for SortaGrad). + :type sort_by_duration: bool + :param shuffle: Shuffle the audio clips if set True. + :type shuffle: bool + :return: Batch reader function, producing batches of data when called. + :rtype: callable + """ + + def batch_reader(): + instance_reader = self.instance_reader_creator( + manifest_path=manifest_path, + sort_by_duration=sort_by_duration, + shuffle=shuffle) + batch = [] + for instance in instance_reader(): + batch.append(instance) + if len(batch) == batch_size: + yield self.__padding_batch__(batch, padding_to, flatten) + batch = [] + if len(batch) > 0: + yield self.__padding_batch__(batch, padding_to, flatten) + + return batch_reader + + def vocabulary_size(self): + """ + Get vocabulary size. + + :return: Vocabulary size. + :rtype: int + """ + return len(self.__vocab_list__) + + def vocabulary_dict(self): + """ + Get vocabulary in dict. + + :return: Vocabulary in dict. + :rtype: dict + """ + return self.__vocab_dict__ + + def vocabulary_list(self): + """ + Get vocabulary in list. + + :return: Vocabulary in list + :rtype: list + """ + return self.__vocab_list__ + + def data_name_feeding(self): + """ + Get feeddings (data field name and corresponding field id). - return new_batch_reader + :return: Feeding dict. + :rtype: dict + """ + feeding = { + "audio_spectrogram": 0, + "transcript_text": 1, + } + return feeding diff --git a/train.py b/train.py index 0d7dd816..89dcf35c 100644 --- a/train.py +++ b/train.py @@ -5,16 +5,18 @@ import paddle.v2 as paddle import argparse import gzip +import time import sys from model import deep_speech2 -import audio_data_utils +from audio_data_utils import DataGenerator +import numpy as np #TODO: add WER metric parser = argparse.ArgumentParser( description='Simplified version of DeepSpeech2 trainer.') parser.add_argument( - "--batch_size", default=512, type=int, help="Minibatch size.") + "--batch_size", default=32, type=int, help="Minibatch size.") parser.add_argument("--trainer", default=1, type=int, help="Trainer number.") parser.add_argument( "--num_passes", default=20, type=int, help="Training pass number.") @@ -23,7 +25,7 @@ parser.add_argument( parser.add_argument( "--num_rnn_layers", default=5, type=int, help="RNN layer number.") parser.add_argument( - "--rnn_layer_size", default=256, type=int, help="RNN layer cell number.") + "--rnn_layer_size", default=512, type=int, help="RNN layer cell number.") parser.add_argument( "--use_gpu", default=True, type=bool, help="Use gpu or not.") parser.add_argument( @@ -37,13 +39,45 @@ def train(): """ DeepSpeech2 training. 
""" + # create data readers + data_generator = DataGenerator( + vocab_filepath='eng_vocab.txt', + normalizer_manifest_path='./libri.manifest.train', + normalizer_num_samples=200, + max_duration=20.0, + min_duration=0.0, + stride_ms=10, + window_ms=20) + train_batch_reader_sortagrad = data_generator.batch_reader_creator( + manifest_path='./libri.manifest.dev.small', + batch_size=args.batch_size // args.trainer, + padding_to=2000, + flatten=True, + sort_by_duration=True, + shuffle=False) + train_batch_reader_nosortagrad = data_generator.batch_reader_creator( + manifest_path='./libri.manifest.dev.small', + batch_size=args.batch_size // args.trainer, + padding_to=2000, + flatten=True, + sort_by_duration=False, + shuffle=True) + test_batch_reader = data_generator.batch_reader_creator( + manifest_path='./libri.manifest.test', + batch_size=args.batch_size // args.trainer, + padding_to=2000, + flatten=True, + sort_by_duration=False, + shuffle=False) + feeding = data_generator.data_name_feeding() + # create network config - dict_size = audio_data_utils.get_vocabulary_size() + dict_size = data_generator.vocabulary_size() audio_data = paddle.layer.data( name="audio_spectrogram", height=161, - width=1000, - type=paddle.data_type.dense_vector(161000)) + width=2000, + type=paddle.data_type.dense_vector(322000)) text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(dict_size)) @@ -58,47 +92,26 @@ def train(): # create parameters and optimizer parameters = paddle.parameters.create(cost) optimizer = paddle.optimizer.Adam( - learning_rate=5e-4, gradient_clipping_threshold=400) + learning_rate=5e-5, gradient_clipping_threshold=400) trainer = paddle.trainer.SGD( cost=cost, parameters=parameters, update_equation=optimizer) - # create data readers - feeding = { - "audio_spectrogram": 0, - "transcript_text": 1, - } - train_batch_reader_with_sortagrad = audio_data_utils.padding_batch_reader( - paddle.batch( - audio_data_utils.reader_creator( - manifest_path="./libri.manifest.train", sort_by_duration=True), - batch_size=args.batch_size // args.trainer), - padding=[-1, 1000]) - train_batch_reader_without_sortagrad = audio_data_utils.padding_batch_reader( - paddle.batch( - audio_data_utils.reader_creator( - manifest_path="./libri.manifest.train", - sort_by_duration=False, - shuffle=True), - batch_size=args.batch_size // args.trainer), - padding=[-1, 1000]) - test_batch_reader = audio_data_utils.padding_batch_reader( - paddle.batch( - audio_data_utils.reader_creator( - manifest_path="./libri.manifest.dev", sort_by_duration=False), - batch_size=args.batch_size // args.trainer), - padding=[-1, 1000]) # create event handler def event_handler(event): + global start_time if isinstance(event, paddle.event.EndIteration): if event.batch_id % 10 == 0: - print "/nPass: %d, Batch: %d, TrainCost: %f" % ( + print "\nPass: %d, Batch: %d, TrainCost: %f" % ( event.pass_id, event.batch_id, event.cost) else: sys.stdout.write('.') sys.stdout.flush() + if isinstance(event, paddle.event.BeginPass): + start_time = time.time() if isinstance(event, paddle.event.EndPass): result = trainer.test(reader=test_batch_reader, feeding=feeding) - print "Pass: %d, TestCost: %s" % (event.pass_id, result.cost) + print "\n------- Time: %d, Pass: %d, TestCost: %s" % ( + time.time() - start_time, event.pass_id, result.cost) with gzip.open("params.tar.gz", 'w') as f: parameters.to_tar(f) @@ -106,14 +119,14 @@ def train(): # first pass with sortagrad if args.use_sortagrad: trainer.train( - 
reader=train_batch_reader_with_sortagrad, + reader=train_batch_reader_sortagrad, event_handler=event_handler, num_passes=1, feeding=feeding) args.num_passes -= 1 # other passes without sortagrad trainer.train( - reader=train_batch_reader_without_sortagrad, + reader=train_batch_reader_nosortagrad, event_handler=event_handler, num_passes=args.num_passes, feeding=feeding) From 8313895e858ad7da2f45e373446dd0c11e923431 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Fri, 2 Jun 2017 21:07:58 +0800 Subject: [PATCH 07/55] 1. Fix incorrect decoder result printing. 2. Fix incorrect batch-norm usage in RNN. 3. Fix overlapping train/dev/test manfests. 4. Update README.md and requirements.txt. 5. Expose more arguments to users in argparser. 6. Update all other details. --- README.md | 55 +++++++++++++++++-- audio_data_utils.py | 2 +- infer.py | 83 ++++++++++++++++++++--------- librispeech.py | 80 +++++++++++++++++++--------- model.py | 65 ++++++++--------------- requirements.sh | 5 -- requirements.txt | 2 + train.py | 125 ++++++++++++++++++++++++++++++-------------- 8 files changed, 280 insertions(+), 137 deletions(-) delete mode 100644 requirements.sh create mode 100644 requirements.txt diff --git a/README.md b/README.md index 1f7e0384..48ee9f9a 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,58 @@ # Deep Speech 2 on PaddlePaddle +## Quick Start + +### Installation + +Please replace `$PADDLE_INSTALL_DIR` with your paddle installation directory. + +``` +pip install -r requirements.txt +export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH +``` + +For some machines, we also need to install libsndfile1. Details to be added. + +### Preparing Dataset(s) + ``` -sh requirements.sh python librispeech.py -python train.py ``` -Please add warp-ctc library path (usually $PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib) to LD_LIBRARY_PATH. +More help for arguments: + +``` +python librispeech.py --help +``` + +### Traininig + +For GPU Training: + +``` +CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 +``` + +For CPU Training: + +``` +python train.py --trainer_count 8 --use_gpu False +``` + +More help for arguments: + +``` +python train.py --help +``` + +### Inferencing + +``` +python infer.py +``` + +More help for arguments: + +``` +python infer.py --help +``` diff --git a/audio_data_utils.py b/audio_data_utils.py index 7d09d612..c717bcf1 100644 --- a/audio_data_utils.py +++ b/audio_data_utils.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) class DataGenerator(object): """ - DataGenerator provides basic audio data preprocessing pipeline, and offer + DataGenerator provides basic audio data preprocessing pipeline, and offers both instance-level and batch-level data reader interfaces. Normalized FFT are used as audio features here. diff --git a/infer.py b/infer.py index 1f13956e..1c52c98f 100644 --- a/infer.py +++ b/infer.py @@ -4,9 +4,10 @@ import paddle.v2 as paddle from itertools import groupby +import distutils.util import argparse import gzip -import audio_data_utils +from audio_data_utils import DataGenerator from model import deep_speech2 parser = argparse.ArgumentParser( @@ -15,15 +16,42 @@ parser.add_argument( "--num_samples", default=10, type=int, - help="Number of samples for inference.") + help="Number of samples for inference. 
(default: %(default)s)") parser.add_argument( - "--num_conv_layers", default=2, type=int, help="Convolution layer number.") + "--num_conv_layers", + default=2, + type=int, + help="Convolution layer number. (default: %(default)s)") +parser.add_argument( + "--num_rnn_layers", + default=3, + type=int, + help="RNN layer number. (default: %(default)s)") +parser.add_argument( + "--rnn_layer_size", + default=512, + type=int, + help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gpu", + default=True, + type=distutils.util.strtobool, + help="Use gpu or not. (default: %(default)s)") parser.add_argument( - "--num_rnn_layers", default=3, type=int, help="RNN layer number.") + "--normalizer_manifest_path", + default='./manifest.libri.train-clean-100', + type=str, + help="Manifest path for normalizer. (default: %(default)s)") parser.add_argument( - "--rnn_layer_size", default=512, type=int, help="RNN layer cell number.") + "--decode_manifest_path", + default='./manifest.libri.test-clean', + type=str, + help="Manifest path for decoding. (default: %(default)s)") parser.add_argument( - "--use_gpu", default=True, type=bool, help="Use gpu or not.") + "--model_filepath", + default='./params.tar.gz', + type=str, + help="Model filepath. (default: %(default)s)") args = parser.parse_args() @@ -39,18 +67,27 @@ def remove_duplicate_and_blank(id_list, blank_id): return [id for id in id_list if id != blank_id] -def max_infer(): +def best_path_decode(): """ Max-ctc-decoding for DeepSpeech2. """ + # initialize data generator + data_generator = DataGenerator( + vocab_filepath='eng_vocab.txt', + normalizer_manifest_path=args.normalizer_manifest_path, + normalizer_num_samples=200, + max_duration=20.0, + min_duration=0.0, + stride_ms=10, + window_ms=20) # create network config - _, vocab_list = audio_data_utils.get_vocabulary() - dict_size = len(vocab_list) + dict_size = data_generator.vocabulary_size() + vocab_list = data_generator.vocabulary_list() audio_data = paddle.layer.data( name="audio_spectrogram", height=161, - width=1000, - type=paddle.data_type.dense_vector(161000)) + width=2000, + type=paddle.data_type.dense_vector(322000)) text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(dict_size)) @@ -64,19 +101,17 @@ def max_infer(): # load parameters parameters = paddle.parameters.Parameters.from_tar( - gzip.open("params.tar.gz")) + gzip.open(args.model_filepath)) # prepare infer data - feeding = { - "audio_spectrogram": 0, - "transcript_text": 1, - } - test_batch_reader = audio_data_utils.padding_batch_reader( - paddle.batch( - audio_data_utils.reader_creator( - manifest_path="./libri.manifest.test", sort_by_duration=False), - batch_size=args.num_samples), - padding=[-1, 1000]) + feeding = data_generator.data_name_feeding() + test_batch_reader = data_generator.batch_reader_creator( + manifest_path=args.decode_manifest_path, + batch_size=args.num_samples, + padding_to=2000, + flatten=True, + sort_by_duration=False, + shuffle=False) infer_data = test_batch_reader().next() # run max-ctc-decoding @@ -89,7 +124,7 @@ def max_infer(): # postprocess instance_length = len(max_id_results) / args.num_samples instance_list = [ - max_id_results[i:i + instance_length] + max_id_results[i * instance_length:(i + 1) * instance_length] for i in xrange(0, args.num_samples) ] for i, instance in enumerate(instance_list): @@ -102,7 +137,7 @@ def max_infer(): def main(): paddle.init(use_gpu=args.use_gpu, trainer_count=1) - max_infer() + best_path_decode() if 
__name__ == '__main__': diff --git a/librispeech.py b/librispeech.py index 8f82a288..676bbec5 100644 --- a/librispeech.py +++ b/librispeech.py @@ -7,6 +7,7 @@ """ import paddle.v2 as paddle +from paddle.v2.dataset.common import md5file import os import wget import tarfile @@ -14,11 +15,22 @@ import argparse import soundfile import json -DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') +DATA_HOME = os.path.expanduser('~/.cache2/paddle/dataset/speech') -URL_TEST = "http://www.openslr.org/resources/12/test-clean.tar.gz" -URL_DEV = "http://www.openslr.org/resources/12/dev-clean.tar.gz" -URL_TRAIN = "http://www.openslr.org/resources/12/train-clean-100.tar.gz" +URL_ROOT = "http://www.openslr.org/resources/12" +URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz" +URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz" +URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz" +URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz" +URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz" +URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz" +URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" + +MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" +MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" +MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" +MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" +MD5_TRAIN_CLEAN_500 = "d1a0fd59409feb2c614ce4d30c387708" parser = argparse.ArgumentParser( description='Downloads and prepare LibriSpeech dataset.') @@ -26,27 +38,33 @@ parser.add_argument( "--target_dir", default=DATA_HOME + "/Libri", type=str, - help="Directory to save the dataset.") + help="Directory to save the dataset. (default: %(default)s)") parser.add_argument( - "--manifest", - default="./libri.manifest", + "--manifest_prefix", + default="manifest.libri", type=str, - help="Filepath prefix for output manifests.") + help="Filepath prefix for output manifests. (default: %(default)s)") args = parser.parse_args() -def download(url, target_dir): - if not os.path.exists(target_dir): - os.makedirs(target_dir) +def download(url, md5sum, target_dir): + """ + Download file from url to target_dir, and check md5sum. + """ + if not os.path.exists(target_dir): os.makedirs(target_dir) filepath = os.path.join(target_dir, url.split("/")[-1]) - if not os.path.exists(filepath): + if not (os.path.exists(filepath) and md5file(filepath) == md5sum): print("Downloading %s ..." % url) wget.download(url, target_dir) - print("") + print("\nMD5 Chesksum %s ..." % filepath) + assert md5file(filepath) == md5sum, "MD5 checksum failed." return filepath def unpack(filepath, target_dir): + """ + Unpack the file to the target_dir. + """ print("Unpacking %s ..." % filepath) tar = tarfile.open(filepath) tar.extractall(target_dir) @@ -55,6 +73,14 @@ def unpack(filepath, target_dir): def create_manifest(data_dir, manifest_path): + """ + Create a manifest file summarizing the dataset (list of filepath and meta + data). + + Each line of the manifest contains one audio clip filepath, its + transcription text string, and its duration. Manifest file servers as a + unified interfance to organize data sets. + """ print("Creating manifest %s ..." 
% manifest_path) json_lines = [] for subfolder, _, filelist in os.walk(data_dir): @@ -81,25 +107,31 @@ def create_manifest(data_dir, manifest_path): out_file.write(line + '\n') -def prepare_dataset(url, target_dir, manifest_path): - filepath = download(url, target_dir) +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """ + Download, unpack and create summmary manifest file. + """ + filepath = download(url, md5sum, target_dir) unpacked_dir = unpack(filepath, target_dir) create_manifest(unpacked_dir, manifest_path) def main(): prepare_dataset( - url=URL_TEST, - target_dir=os.path.join(args.target_dir), - manifest_path=args.manifest + ".test") + url=URL_TEST_CLEAN, + md5sum=MD5_TEST_CLEAN, + target_dir=os.path.join(args.target_dir, "test-clean"), + manifest_path=args.manifest_prefix + ".test-clean") prepare_dataset( - url=URL_DEV, - target_dir=os.path.join(args.target_dir), - manifest_path=args.manifest + ".dev") + url=URL_DEV_CLEAN, + md5sum=MD5_DEV_CLEAN, + target_dir=os.path.join(args.target_dir, "dev-clean"), + manifest_path=args.manifest_prefix + ".dev-clean") prepare_dataset( - url=URL_TRAIN, - target_dir=os.path.join(args.target_dir), - manifest_path=args.manifest + ".train") + url=URL_TRAIN_CLEAN_100, + md5sum=MD5_TRAIN_CLEAN_100, + target_dir=os.path.join(args.target_dir, "train-clean-100"), + manifest_path=args.manifest_prefix + ".train-clean-100") if __name__ == '__main__': diff --git a/model.py b/model.py index de6357f4..6b396900 100644 --- a/model.py +++ b/model.py @@ -24,45 +24,23 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, return paddle.layer.batch_norm(input=conv_layer, act=act) -def bidirectonal_simple_rnn_bn_layer(name, input, size, act): +def bidirectional_simple_rnn_bn_layer(name, input, size, act): """ - Bidirectonal simple rnn layer with batch normalization. - The batch normalization is only performed on input-state projection - (sequence-wise normalization). - - Question: does mean and variance statistics computed over the whole sequence - or just on each individual time steps? + Bidirectonal simple rnn layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. """ - - def __simple_rnn_step__(input): - last_state = paddle.layer.memory(name=name + "_state", size=size) - input_fc = paddle.layer.fc( - input=input, - size=size, - act=paddle.activation.Linear(), - bias_attr=False) - # batch norm is only performed on input-state projection - input_fc_bn = paddle.layer.batch_norm( - input=input_fc, act=paddle.activation.Linear()) - state_fc = paddle.layer.fc( - input=last_state, - size=size, - act=paddle.activation.Linear(), - bias_attr=False) - return paddle.layer.addto( - name=name + "_state", input=[input_fc_bn, state_fc], act=act) - - forward = paddle.layer.recurrent_group( - step=__simple_rnn_step__, input=input) - return forward - # argument reverse is not exposed in V2 recurrent_group - #backward = paddle.layer.recurrent_group( - - -#step=__simple_rnn_step__, -#input=input, -#reverse=True) -#return paddle.layer.concat(input=[forward, backward]) + # input-hidden weights shared across bi-direcitonal rnn. 
+ input_proj = paddle.layer.fc( + input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) + # batch norm is only performed on input-state projection + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, act=paddle.activation.Linear()) + # forward and backward in time + forward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=False) + backward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=True) + return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) def conv_group(input, num_stacks): @@ -86,7 +64,9 @@ def conv_group(input, num_stacks): stride=(1, 2), padding=(5, 10), act=paddle.activation.BRelu()) - return conv + output_num_channels = 32 + output_height = 160 // pow(2, num_stacks) + 1 + return conv, output_num_channels, output_height def rnn_group(input, size, num_stacks): @@ -95,7 +75,7 @@ def rnn_group(input, size, num_stacks): """ output = input for i in xrange(num_stacks): - output = bidirectonal_simple_rnn_bn_layer( + output = bidirectional_simple_rnn_bn_layer( name=str(i), input=output, size=size, act=paddle.activation.BRelu()) return output @@ -125,15 +105,16 @@ def deep_speech2(audio_data, :rtype: tuple of LayerOutput """ # convolution group - conv_group_output = conv_group(input=audio_data, num_stacks=num_conv_layers) + conv_group_output, conv_group_num_channels, conv_group_height = conv_group( + input=audio_data, num_stacks=num_conv_layers) # convert data form convolution feature map to sequence of vectors conv2seq = paddle.layer.block_expand( input=conv_group_output, - num_channels=32, + num_channels=conv_group_num_channels, stride_x=1, stride_y=1, block_x=1, - block_y=21) + block_y=conv_group_height) # rnn group rnn_group_output = rnn_group( input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) diff --git a/requirements.sh b/requirements.sh deleted file mode 100644 index bb1f261d..00000000 --- a/requirements.sh +++ /dev/null @@ -1,5 +0,0 @@ -pip install wget -pip install soundfile - -# For Ubuntu only -apt-get install libsndfile1 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..58a93deb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +SoundFile==0.9.0.post1 +wget==3.2 diff --git a/train.py b/train.py index 89dcf35c..ad6e5ffd 100644 --- a/train.py +++ b/train.py @@ -3,6 +3,7 @@ """ import paddle.v2 as paddle +import distutils.util import argparse import gzip import time @@ -17,21 +18,61 @@ parser = argparse.ArgumentParser( description='Simplified version of DeepSpeech2 trainer.') parser.add_argument( "--batch_size", default=32, type=int, help="Minibatch size.") -parser.add_argument("--trainer", default=1, type=int, help="Trainer number.") parser.add_argument( - "--num_passes", default=20, type=int, help="Training pass number.") + "--num_passes", + default=20, + type=int, + help="Training pass number. (default: %(default)s)") parser.add_argument( - "--num_conv_layers", default=3, type=int, help="Convolution layer number.") + "--num_conv_layers", + default=2, + type=int, + help="Convolution layer number. (default: %(default)s)") parser.add_argument( - "--num_rnn_layers", default=5, type=int, help="RNN layer number.") + "--num_rnn_layers", + default=3, + type=int, + help="RNN layer number. (default: %(default)s)") parser.add_argument( - "--rnn_layer_size", default=512, type=int, help="RNN layer cell number.") + "--rnn_layer_size", + default=512, + type=int, + help="RNN layer cell number. 
(default: %(default)s)") parser.add_argument( - "--use_gpu", default=True, type=bool, help="Use gpu or not.") + "--adam_learning_rate", + default=5e-4, + type=float, + help="Learning rate for ADAM Optimizer. (default: %(default)s)") parser.add_argument( - "--use_sortagrad", default=False, type=bool, help="Use sortagrad or not.") + "--use_gpu", + default=True, + type=distutils.util.strtobool, + help="Use gpu or not. (default: %(default)s)") parser.add_argument( - "--trainer_count", default=8, type=int, help="Trainer number.") + "--use_sortagrad", + default=False, + type=distutils.util.strtobool, + help="Use sortagrad or not. (default: %(default)s)") +parser.add_argument( + "--trainer_count", + default=4, + type=int, + help="Trainer number. (default: %(default)s)") +parser.add_argument( + "--normalizer_manifest_path", + default='./manifest.libri.train-clean-100', + type=str, + help="Manifest path for normalizer. (default: %(default)s)") +parser.add_argument( + "--train_manifest_path", + default='./manifest.libri.train-clean-100', + type=str, + help="Manifest path for training. (default: %(default)s)") +parser.add_argument( + "--dev_manifest_path", + default='./manifest.libri.dev-clean', + type=str, + help="Manifest path for validation. (default: %(default)s)") args = parser.parse_args() @@ -39,37 +80,15 @@ def train(): """ DeepSpeech2 training. """ - # create data readers + # initialize data generator data_generator = DataGenerator( vocab_filepath='eng_vocab.txt', - normalizer_manifest_path='./libri.manifest.train', + normalizer_manifest_path=args.normalizer_manifest_path, normalizer_num_samples=200, max_duration=20.0, min_duration=0.0, stride_ms=10, window_ms=20) - train_batch_reader_sortagrad = data_generator.batch_reader_creator( - manifest_path='./libri.manifest.dev.small', - batch_size=args.batch_size // args.trainer, - padding_to=2000, - flatten=True, - sort_by_duration=True, - shuffle=False) - train_batch_reader_nosortagrad = data_generator.batch_reader_creator( - manifest_path='./libri.manifest.dev.small', - batch_size=args.batch_size // args.trainer, - padding_to=2000, - flatten=True, - sort_by_duration=False, - shuffle=True) - test_batch_reader = data_generator.batch_reader_creator( - manifest_path='./libri.manifest.test', - batch_size=args.batch_size // args.trainer, - padding_to=2000, - flatten=True, - sort_by_duration=False, - shuffle=False) - feeding = data_generator.data_name_feeding() # create network config dict_size = data_generator.vocabulary_size() @@ -92,28 +111,58 @@ def train(): # create parameters and optimizer parameters = paddle.parameters.create(cost) optimizer = paddle.optimizer.Adam( - learning_rate=5e-5, gradient_clipping_threshold=400) + learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400) trainer = paddle.trainer.SGD( cost=cost, parameters=parameters, update_equation=optimizer) + # prepare data reader + train_batch_reader_sortagrad = data_generator.batch_reader_creator( + manifest_path=args.train_manifest_path, + batch_size=args.batch_size // args.trainer_count, + padding_to=2000, + flatten=True, + sort_by_duration=True, + shuffle=False) + train_batch_reader_nosortagrad = data_generator.batch_reader_creator( + manifest_path=args.train_manifest_path, + batch_size=args.batch_size // args.trainer_count, + padding_to=2000, + flatten=True, + sort_by_duration=False, + shuffle=True) + test_batch_reader = data_generator.batch_reader_creator( + manifest_path=args.dev_manifest_path, + batch_size=args.batch_size // args.trainer_count, + 
padding_to=2000, + flatten=True, + sort_by_duration=False, + shuffle=False) + feeding = data_generator.data_name_feeding() + # create event handler def event_handler(event): global start_time + global cost_sum + global cost_counter if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 10 == 0: + cost_sum += event.cost + cost_counter += 1 + if event.batch_id % 50 == 0: print "\nPass: %d, Batch: %d, TrainCost: %f" % ( - event.pass_id, event.batch_id, event.cost) + event.pass_id, event.batch_id, cost_sum / cost_counter) + cost_sum, cost_counter = 0.0, 0 + with gzip.open("params.tar.gz", 'w') as f: + parameters.to_tar(f) else: sys.stdout.write('.') sys.stdout.flush() if isinstance(event, paddle.event.BeginPass): start_time = time.time() + cost_sum, cost_counter = 0.0, 0 if isinstance(event, paddle.event.EndPass): result = trainer.test(reader=test_batch_reader, feeding=feeding) - print "\n------- Time: %d, Pass: %d, TestCost: %s" % ( + print "\n------- Time: %d sec, Pass: %d, ValidationCost: %s" % ( time.time() - start_time, event.pass_id, result.cost) - with gzip.open("params.tar.gz", 'w') as f: - parameters.to_tar(f) # run train # first pass with sortagrad From 2a834865009ff52524a70a97f13d7d2ec78a61c9 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Sat, 3 Jun 2017 14:52:02 +0800 Subject: [PATCH 08/55] Refactor decoder interfaces and add ./data directory. --- README.md | 2 + eng_vocab.txt => data/eng_vocab.txt | 0 librispeech.py => data/librispeech.py | 0 decoder.py | 60 +++++++++++++++++++++++ infer.py | 69 ++++++++++++--------------- model.py | 34 ++++++++----- train.py | 28 ++++++----- 7 files changed, 130 insertions(+), 63 deletions(-) rename eng_vocab.txt => data/eng_vocab.txt (100%) rename librispeech.py => data/librispeech.py (100%) create mode 100755 decoder.py diff --git a/README.md b/README.md index 48ee9f9a..b20c75f9 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,9 @@ For some machines, we also need to install libsndfile1. Details to be added. ### Preparing Dataset(s) ``` +cd data python librispeech.py +cd .. ``` More help for arguments: diff --git a/eng_vocab.txt b/data/eng_vocab.txt similarity index 100% rename from eng_vocab.txt rename to data/eng_vocab.txt diff --git a/librispeech.py b/data/librispeech.py similarity index 100% rename from librispeech.py rename to data/librispeech.py diff --git a/decoder.py b/decoder.py new file mode 100755 index 00000000..7c4b9526 --- /dev/null +++ b/decoder.py @@ -0,0 +1,60 @@ +""" + CTC-like decoder utilitis. +""" + +from itertools import groupby +import numpy as np + + +def ctc_best_path_decode(probs_seq, vocabulary): + """ + Best path decoding, also called argmax decoding or greedy decoding. + Path consisting of the most probable tokens are further post-processed to + remove consecutive repetitions and all blanks. + + :param probs_seq: 2-D list of probabilities over the vocabulary for each + character. Each element is a list of float probabilities + for one character. + :type probs_seq: list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :return: Decoding result string. 
+    :rtype: basestring
+    """
+    # dimension verification
+    for probs in probs_seq:
+        if not len(probs) == len(vocabulary) + 1:
+            raise ValueError("probs_seq dimension mismatched with vocabulary")
+    # argmax to get the best index for each time step
+    max_index_list = list(np.array(probs_seq).argmax(axis=1))
+    # remove consecutive duplicate indexes
+    index_list = [index_group[0] for index_group in groupby(max_index_list)]
+    # remove blank indexes
+    blank_index = len(vocabulary)
+    index_list = [index for index in index_list if index != blank_index]
+    # convert index list to string
+    return ''.join([vocabulary[index] for index in index_list])
+
+
+def ctc_decode(probs_seq, vocabulary, method):
+    """
+    CTC-like sequence decoding from a sequence of likelihood probabilities.
+
+    :param probs_seq: 2-D list of probabilities over the vocabulary for each
+                      character. Each element is a list of float probabilities
+                      for one character.
+    :type probs_seq: list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param method: Decoding method name, with options: "best_path".
+    :type method: basestring
+    :return: Decoding result string.
+    :rtype: basestring
+    """
+    for prob_list in probs_seq:
+        if not len(prob_list) == len(vocabulary) + 1:
+            raise ValueError("probs dimension mismatched with vocabulary")
+    if method == "best_path":
+        return ctc_best_path_decode(probs_seq, vocabulary)
+    else:
+        raise ValueError("Decoding method [%s] is not supported." % method)
diff --git a/infer.py b/infer.py
index 1c52c98f..598c348b 100644
--- a/infer.py
+++ b/infer.py
@@ -3,12 +3,12 @@
 """
 import paddle.v2 as paddle
-from itertools import groupby
 import distutils.util
 import argparse
 import gzip
 from audio_data_utils import DataGenerator
 from model import deep_speech2
+from decoder import ctc_decode
 
 parser = argparse.ArgumentParser(
     description='Simplified version of DeepSpeech2 inference.')
@@ -39,12 +39,12 @@ parser.add_argument(
     help="Use gpu or not. (default: %(default)s)")
 parser.add_argument(
     "--normalizer_manifest_path",
-    default='./manifest.libri.train-clean-100',
+    default='data/manifest.libri.train-clean-100',
     type=str,
     help="Manifest path for normalizer. (default: %(default)s)")
 parser.add_argument(
     "--decode_manifest_path",
-    default='./manifest.libri.test-clean',
+    default='data/manifest.libri.test-clean',
     type=str,
     help="Manifest path for decoding. (default: %(default)s)")
 parser.add_argument(
@@ -52,34 +52,28 @@ parser.add_argument(
     default='./params.tar.gz',
     type=str,
     help="Model filepath. (default: %(default)s)")
+parser.add_argument(
+    "--vocab_filepath",
+    default='data/eng_vocab.txt',
+    type=str,
+    help="Vocabulary filepath. (default: %(default)s)")
 args = parser.parse_args()
 
 
-def remove_duplicate_and_blank(id_list, blank_id):
-    """
-    Postprocessing for max-ctc-decoder.
-    - remove consecutive duplicate tokens.
-    - remove blanks.
-    """
-    # remove consecutive duplicate tokens
-    id_list = [x[0] for x in groupby(id_list)]
-    # remove blanks
-    return [id for id in id_list if id != blank_id]
-
-
-def best_path_decode():
+def infer():
     """
     Max-ctc-decoding for DeepSpeech2.
""" # initialize data generator data_generator = DataGenerator( - vocab_filepath='eng_vocab.txt', + vocab_filepath=args.vocab_filepath, normalizer_manifest_path=args.normalizer_manifest_path, normalizer_num_samples=200, max_duration=20.0, min_duration=0.0, stride_ms=10, window_ms=20) + # create network config dict_size = data_generator.vocabulary_size() vocab_list = data_generator.vocabulary_list() @@ -91,13 +85,14 @@ def best_path_decode(): text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(dict_size)) - _, max_id = deep_speech2( + output_probs = deep_speech2( audio_data=audio_data, text_data=text_data, dict_size=dict_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size) + rnn_size=args.rnn_layer_size, + is_inference=True) # load parameters parameters = paddle.parameters.Parameters.from_tar( @@ -114,30 +109,28 @@ def best_path_decode(): shuffle=False) infer_data = test_batch_reader().next() - # run max-ctc-decoding - max_id_results = paddle.infer( - output_layer=max_id, - parameters=parameters, - input=infer_data, - field=['id']) - - # postprocess - instance_length = len(max_id_results) / args.num_samples - instance_list = [ - max_id_results[i * instance_length:(i + 1) * instance_length] - for i in xrange(0, args.num_samples) + # run inference + infer_results = paddle.infer( + output_layer=output_probs, parameters=parameters, input=infer_data) + num_steps = len(infer_results) / len(infer_data) + probs_split = [ + infer_results[i * num_steps:(i + 1) * num_steps] + for i in xrange(0, len(infer_data)) ] - for i, instance in enumerate(instance_list): - id_list = remove_duplicate_and_blank(instance, dict_size) - output_transcript = ''.join([vocab_list[id] for id in id_list]) - target_transcript = ''.join([vocab_list[id] for id in infer_data[i][1]]) - print("Target Transcript: %s \nOutput Transcript: %s \n" % - (target_transcript, output_transcript)) + + # decode and print + for i, probs in enumerate(probs_split): + output_transcription = ctc_decode( + probs_seq=probs, vocabulary=vocab_list, method="best_path") + target_transcription = ''.join( + [vocab_list[index] for index in infer_data[i][1]]) + print("Target Transcription: %s \nOutput Transcription: %s \n" % + (target_transcription, output_transcription)) def main(): paddle.init(use_gpu=args.use_gpu, trainer_count=1) - best_path_decode() + infer() if __name__ == '__main__': diff --git a/model.py b/model.py index 6b396900..13ff829b 100644 --- a/model.py +++ b/model.py @@ -85,7 +85,8 @@ def deep_speech2(audio_data, dict_size, num_conv_layers=2, num_rnn_layers=3, - rnn_size=256): + rnn_size=256, + is_inference=False): """ The whole DeepSpeech2 model structure (a simplified version). @@ -101,7 +102,12 @@ def deep_speech2(audio_data, :type num_rnn_layers: int :param rnn_size: RNN layer size (number of RNN cells). :type rnn_size: int - :return: Tuple of the cost layer and the max_id decoder layer. + :param is_inference: False in the training mode, and True in the + inferene mode. + :type is_inference: bool + :return: If is_inference set False, return a ctc cost layer; + if is_inference set True, return a sequence layer of output + probability distribution. 
:rtype: tuple of LayerOutput """ # convolution group @@ -118,19 +124,21 @@ def deep_speech2(audio_data, # rnn group rnn_group_output = rnn_group( input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) - # output token distribution fc = paddle.layer.fc( input=rnn_group_output, size=dict_size + 1, act=paddle.activation.Linear(), bias_attr=True) - # ctc cost - cost = paddle.layer.warp_ctc( - input=fc, - label=text_data, - size=dict_size + 1, - blank=dict_size, - norm_by_times=True) - # max decoder - max_id = paddle.layer.max_id(input=fc) - return cost, max_id + if is_inference: + # probability distribution with softmax + return paddle.layer.mixed( + input=paddle.layer.identity_projection(input=fc), + act=paddle.activation.Softmax()) + else: + # ctc cost + return paddle.layer.warp_ctc( + input=fc, + label=text_data, + size=dict_size + 1, + blank=dict_size, + norm_by_times=True) diff --git a/train.py b/train.py index ad6e5ffd..e6a7d076 100644 --- a/train.py +++ b/train.py @@ -60,19 +60,24 @@ parser.add_argument( help="Trainer number. (default: %(default)s)") parser.add_argument( "--normalizer_manifest_path", - default='./manifest.libri.train-clean-100', + default='data/manifest.libri.train-clean-100', type=str, help="Manifest path for normalizer. (default: %(default)s)") parser.add_argument( "--train_manifest_path", - default='./manifest.libri.train-clean-100', + default='data/manifest.libri.train-clean-100', type=str, help="Manifest path for training. (default: %(default)s)") parser.add_argument( "--dev_manifest_path", - default='./manifest.libri.dev-clean', + default='data/manifest.libri.dev-clean', type=str, help="Manifest path for validation. (default: %(default)s)") +parser.add_argument( + "--vocab_filepath", + default='data/eng_vocab.txt', + type=str, + help="Vocabulary filepath. 
(default: %(default)s)") args = parser.parse_args() @@ -82,7 +87,7 @@ def train(): """ # initialize data generator data_generator = DataGenerator( - vocab_filepath='eng_vocab.txt', + vocab_filepath=args.vocab_filepath, normalizer_manifest_path=args.normalizer_manifest_path, normalizer_num_samples=200, max_duration=20.0, @@ -100,13 +105,14 @@ def train(): text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(dict_size)) - cost, _ = deep_speech2( + cost = deep_speech2( audio_data=audio_data, text_data=text_data, dict_size=dict_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size) + rnn_size=args.rnn_layer_size, + is_inference=False) # create parameters and optimizer parameters = paddle.parameters.create(cost) @@ -118,21 +124,21 @@ def train(): # prepare data reader train_batch_reader_sortagrad = data_generator.batch_reader_creator( manifest_path=args.train_manifest_path, - batch_size=args.batch_size // args.trainer_count, + batch_size=args.batch_size, padding_to=2000, flatten=True, sort_by_duration=True, shuffle=False) train_batch_reader_nosortagrad = data_generator.batch_reader_creator( manifest_path=args.train_manifest_path, - batch_size=args.batch_size // args.trainer_count, + batch_size=args.batch_size, padding_to=2000, flatten=True, sort_by_duration=False, shuffle=True) test_batch_reader = data_generator.batch_reader_creator( manifest_path=args.dev_manifest_path, - batch_size=args.batch_size // args.trainer_count, + batch_size=args.batch_size, padding_to=2000, flatten=True, sort_by_duration=False, @@ -141,9 +147,7 @@ def train(): # create event handler def event_handler(event): - global start_time - global cost_sum - global cost_counter + global start_time, cost_sum, cost_counter if isinstance(event, paddle.event.EndIteration): cost_sum += event.cost cost_counter += 1 From 730d5c4dd3fa3583202189636ff852f78a76b5da Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Sat, 3 Jun 2017 15:18:33 +0800 Subject: [PATCH 09/55] Update DS2 README.md and fix bug in librispeech.py --- README.md | 10 +++++----- data/librispeech.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index b20c75f9..bb1815c0 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,8 @@ # Deep Speech 2 on PaddlePaddle -## Quick Start +## Installation -### Installation - -Please replace `$PADDLE_INSTALL_DIR` with your paddle installation directory. +Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory. ``` pip install -r requirements.txt @@ -13,7 +11,9 @@ export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/li For some machines, we also need to install libsndfile1. Details to be added. 
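(On Ubuntu, the line removed from `requirements.sh` above still applies: `apt-get install libsndfile1`; other platforms need the equivalent libsndfile package before `soundfile` can read audio files.)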
-### Preparing Dataset(s) +## Usage + +### Preparing Data ``` cd data diff --git a/data/librispeech.py b/data/librispeech.py index 676bbec5..838fee59 100644 --- a/data/librispeech.py +++ b/data/librispeech.py @@ -15,7 +15,7 @@ import argparse import soundfile import json -DATA_HOME = os.path.expanduser('~/.cache2/paddle/dataset/speech') +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') URL_ROOT = "http://www.openslr.org/resources/12" URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz" @@ -30,7 +30,7 @@ MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" -MD5_TRAIN_CLEAN_500 = "d1a0fd59409feb2c614ce4d30c387708" +MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" parser = argparse.ArgumentParser( description='Downloads and prepare LibriSpeech dataset.') From d2e467385d8367ac072a7d98688466d74661cc4b Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 5 Jun 2017 21:00:15 +0800 Subject: [PATCH 10/55] Add loading model function for train.py. --- train.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index e6a7d076..14c7cf63 100644 --- a/train.py +++ b/train.py @@ -11,6 +11,7 @@ import sys from model import deep_speech2 from audio_data_utils import DataGenerator import numpy as np +import os #TODO: add WER metric @@ -78,6 +79,11 @@ parser.add_argument( default='data/eng_vocab.txt', type=str, help="Vocabulary filepath. (default: %(default)s)") +parser.add_argument( + "--init_model_path", + default='models/params.tar.gz', + type=str, + help="Model path for initialization. (default: %(default)s)") args = parser.parse_args() @@ -114,8 +120,13 @@ def train(): rnn_size=args.rnn_layer_size, is_inference=False) - # create parameters and optimizer - parameters = paddle.parameters.create(cost) + # create/load parameters and optimizer + if args.init_model_path is None: + parameters = paddle.parameters.create(cost) + else: + assert os.path.isfile(args.init_model_path), "Invalid model." + parameters = paddle.parameters.Parameters.from_tar( + gzip.open(args.init_model_path)) optimizer = paddle.optimizer.Adam( learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400) trainer = paddle.trainer.SGD( From 7c85e0fdb5ffac76df6f3d99519e344be7c9b5dd Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Wed, 7 Jun 2017 16:37:13 +0800 Subject: [PATCH 11/55] Support variable input batch and sortagrad. --- audio_data_utils.py | 56 +++++++++++++++++++++++++++++------------ train.py | 61 ++++++++++++++++----------------------------- 2 files changed, 62 insertions(+), 55 deletions(-) diff --git a/audio_data_utils.py b/audio_data_utils.py index c717bcf1..abb7f1e9 100644 --- a/audio_data_utils.py +++ b/audio_data_utils.py @@ -8,6 +8,7 @@ import json import random import soundfile import numpy as np +import itertools import os RANDOM_SEED = 0 @@ -62,6 +63,7 @@ class DataGenerator(object): self.__stride_ms__ = stride_ms self.__window_ms__ = window_ms self.__max_frequency__ = max_frequency + self.__epoc__ = 0 self.__random__ = random.Random(RANDOM_SEED) # load vocabulary (dictionary) self.__vocab_dict__, self.__vocab_list__ = \ @@ -245,9 +247,33 @@ class DataGenerator(object): new_batch.append((padded_audio, text)) return new_batch + def __batch_shuffle__(self, manifest, batch_size): + """ + 1. Sort the audio clips by duration. + 2. Generate a random number `k`, k in [0, batch_size). 
+ 3. Randomly remove `k` instances in order to make different mini-batches, + then make minibatches and each minibatch size is batch_size. + 4. Shuffle the minibatches. + + :param manifest: manifest file. + :type manifest: list + :param batch_size: batch size. + :type batch_size: int + """ + manifest.sort(key=lambda x: x["duration"]) + shift_len = self.__random__.randint(0, batch_size - 1) + batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size) + self.__random__.shuffle(batch_manifest) + batch_manifest = list(sum(batch_manifest, ())) + res_len = len(manifest) - shift_len - len(batch_manifest) + batch_manifest.extend(manifest[-res_len:]) + batch_manifest.extend(manifest[0:shift_len]) + return batch_manifest + def instance_reader_creator(self, manifest_path, - sort_by_duration=True, + batch_size, + sortagrad=True, shuffle=False): """ Instance reader creator for audio data. Creat a callable function to @@ -258,18 +284,14 @@ class DataGenerator(object): :param manifest_path: Filepath of manifest for audio clip files. :type manifest_path: basestring - :param sort_by_duration: Sort the audio clips by duration if set True - (for SortaGrad). - :type sort_by_duration: bool + :param sortagrad: Sort the audio clips by duration in the first epoc + if set True. + :type sortagrad: bool :param shuffle: Shuffle the audio clips if set True. :type shuffle: bool :return: Data reader function. :rtype: callable """ - if sort_by_duration and shuffle: - sort_by_duration = False - logger.warn("When shuffle set to true, " - "sort_by_duration is forced to set False.") def reader(): # read manifest @@ -278,16 +300,17 @@ class DataGenerator(object): max_duration=self.__max_duration__, min_duration=self.__min_duration__) # sort (by duration) or shuffle manifest - if sort_by_duration: + if self.__epoc__ == 0 and sortagrad: manifest.sort(key=lambda x: x["duration"]) - if shuffle: - self.__random__.shuffle(manifest) + elif shuffle: + manifest = self.__batch_shuffle__(manifest, batch_size) # extract spectrogram feature for instance in manifest: spectrogram = self.__audio_featurize__( instance["audio_filepath"]) transcript = self.__text_featurize__(instance["text"]) yield (spectrogram, transcript) + self.__epoc__ += 1 return reader @@ -296,7 +319,7 @@ class DataGenerator(object): batch_size, padding_to=-1, flatten=False, - sort_by_duration=True, + sortagrad=False, shuffle=False): """ Batch data reader creator for audio data. Creat a callable function to @@ -317,9 +340,9 @@ class DataGenerator(object): :param flatten: If set True, audio data will be flatten to be a 1-dim ndarray. Otherwise, 2-dim ndarray. Default is False. :type flatten: bool - :param sort_by_duration: Sort the audio clips by duration if set True - (for SortaGrad). - :type sort_by_duration: bool + :param sortagrad: Sort the audio clips by duration in the first epoc + if set True. + :type sortagrad: bool :param shuffle: Shuffle the audio clips if set True. :type shuffle: bool :return: Batch reader function, producing batches of data when called. @@ -329,7 +352,8 @@ class DataGenerator(object): def batch_reader(): instance_reader = self.instance_reader_creator( manifest_path=manifest_path, - sort_by_duration=sort_by_duration, + batch_size=batch_size, + sortagrad=sortagrad, shuffle=shuffle) batch = [] for instance in instance_reader(): diff --git a/train.py b/train.py index e6a7d076..55577b0d 100644 --- a/train.py +++ b/train.py @@ -85,23 +85,27 @@ def train(): """ DeepSpeech2 training. 
""" + # initialize data generator - data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - normalizer_manifest_path=args.normalizer_manifest_path, - normalizer_num_samples=200, - max_duration=20.0, - min_duration=0.0, - stride_ms=10, - window_ms=20) + def data_generator(): + return DataGenerator( + vocab_filepath=args.vocab_filepath, + normalizer_manifest_path=args.normalizer_manifest_path, + normalizer_num_samples=200, + max_duration=20.0, + min_duration=0.0, + stride_ms=10, + window_ms=20) + train_generator = data_generator() + test_generator = data_generator() # create network config - dict_size = data_generator.vocabulary_size() + dict_size = train_generator.vocabulary_size() + # paddle.data_type.dense_array is used for variable batch input. + # the size 161 * 161 is only an placeholder value and the real shape + # of input batch data will be set at each batch. audio_data = paddle.layer.data( - name="audio_spectrogram", - height=161, - width=2000, - type=paddle.data_type.dense_vector(322000)) + name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(dict_size)) @@ -122,28 +126,16 @@ def train(): cost=cost, parameters=parameters, update_equation=optimizer) # prepare data reader - train_batch_reader_sortagrad = data_generator.batch_reader_creator( - manifest_path=args.train_manifest_path, - batch_size=args.batch_size, - padding_to=2000, - flatten=True, - sort_by_duration=True, - shuffle=False) - train_batch_reader_nosortagrad = data_generator.batch_reader_creator( + train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest_path, batch_size=args.batch_size, - padding_to=2000, - flatten=True, - sort_by_duration=False, + sortagrad=True, shuffle=True) - test_batch_reader = data_generator.batch_reader_creator( + test_batch_reader = test_generator.batch_reader_creator( manifest_path=args.dev_manifest_path, batch_size=args.batch_size, - padding_to=2000, - flatten=True, - sort_by_duration=False, shuffle=False) - feeding = data_generator.data_name_feeding() + feeding = train_generator.data_name_feeding() # create event handler def event_handler(event): @@ -169,17 +161,8 @@ def train(): time.time() - start_time, event.pass_id, result.cost) # run train - # first pass with sortagrad - if args.use_sortagrad: - trainer.train( - reader=train_batch_reader_sortagrad, - event_handler=event_handler, - num_passes=1, - feeding=feeding) - args.num_passes -= 1 - # other passes without sortagrad trainer.train( - reader=train_batch_reader_nosortagrad, + reader=train_batch_reader, event_handler=event_handler, num_passes=args.num_passes, feeding=feeding) From d3eeb7fd76f8b9f86ca01e80f524dde652211428 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 7 Jun 2017 17:44:11 +0800 Subject: [PATCH 12/55] Refine librispeech.py for DeepSpeech2. Summary: 1. Add manifest line check. 2. Avoid re-unpacking if unpacked data already exists. 3. Add full_download (download all 7 sub-datasets of LibriSpeech). --- README.md | 5 ++- data/librispeech.py | 90 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index bb1815c0..403511d5 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ For some machines, we also need to install libsndfile1. Details to be added. ``` cd data python librispeech.py +cat manifest.libri.train-* > manifest.libri.train-all cd .. 
``` @@ -32,13 +33,13 @@ python librispeech.py --help For GPU Training: ``` -CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 +CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all ``` For CPU Training: ``` -python train.py --trainer_count 8 --use_gpu False +python train.py --trainer_count 8 --use_gpu False -- train_manifest_path ./data/manifest.libri.train-all ``` More help for arguments: diff --git a/data/librispeech.py b/data/librispeech.py index 838fee59..8bc33575 100644 --- a/data/librispeech.py +++ b/data/librispeech.py @@ -1,13 +1,15 @@ """ - Download, unpack and create manifest for Librespeech dataset. + Download, unpack and create manifest file for the Librespeech dataset. - Manifest is a json file with each line containing one audio clip filepath, - its transcription text string, and its duration. It servers as a unified - interfance to organize different data sets. + A manifest file is a dataset summarization, with each line a json format + string containing meta data for one audio clip, including its filepath, + transcription string, and duration. It serves as a unified interface for + different data sets. """ import paddle.v2 as paddle from paddle.v2.dataset.common import md5file +import distutils.util import os import wget import tarfile @@ -27,11 +29,21 @@ URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz" URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" +MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135" MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" +MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931" MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" +NUM_LINES_TEST_CLEAN = 2620 +NUM_LINES_TEST_OTHER = 2939 +NUM_LINES_DEV_CLEAN = 2703 +NUM_LINES_DEV_OTHER = 2864 +NUM_LINES_TRAIN_CLEAN_100 = 28539 +NUM_LINES_TRAIN_CLEAN_360 = 104014 +NUM_LINES_TRAIN_OTHER_500 = 148688 + parser = argparse.ArgumentParser( description='Downloads and prepare LibriSpeech dataset.') parser.add_argument( @@ -44,6 +56,13 @@ parser.add_argument( default="manifest.libri", type=str, help="Filepath prefix for output manifests. (default: %(default)s)") +parser.add_argument( + "--full_download", + default="True", + type=distutils.util.strtobool, + help="Download all datasets for Librispeech." + " If False, only download a minimal requirement (test-clean, dev-clean" + " train-clean-100). (default: %(default)s)") args = parser.parse_args() @@ -57,7 +76,10 @@ def download(url, md5sum, target_dir): print("Downloading %s ..." % url) wget.download(url, target_dir) print("\nMD5 Chesksum %s ..." % filepath) - assert md5file(filepath) == md5sum, "MD5 checksum failed." + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. (%s)" % filepath) return filepath @@ -69,7 +91,6 @@ def unpack(filepath, target_dir): tar = tarfile.open(filepath) tar.extractall(target_dir) tar.close() - return target_dir def create_manifest(data_dir, manifest_path): @@ -83,7 +104,7 @@ def create_manifest(data_dir, manifest_path): """ print("Creating manifest %s ..." 
% manifest_path) json_lines = [] - for subfolder, _, filelist in os.walk(data_dir): + for subfolder, _, filelist in sorted(os.walk(data_dir)): text_filelist = [ filename for filename in filelist if filename.endswith('trans.txt') ] @@ -107,13 +128,28 @@ def create_manifest(data_dir, manifest_path): out_file.write(line + '\n') -def prepare_dataset(url, md5sum, target_dir, manifest_path): +def verify_file_line_number(filepath, num_lines): + with open(filepath, 'r') as file: + return len(file.readlines()) == num_lines + + +def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines): """ Download, unpack and create summmary manifest file. """ + # download filepath = download(url, md5sum, target_dir) - unpacked_dir = unpack(filepath, target_dir) - create_manifest(unpacked_dir, manifest_path) + # unpack + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + unpack(filepath, target_dir) + else: + print("Unpacked data exists, skip unpacking.") + # create manifest and verify line number + create_manifest(target_dir, manifest_path) + if not verify_file_line_number(manifest_path, num_lines): + raise RuntimeError("Manifest line number check failed. " + "Please remove directory and try running the script " + "again.") def main(): @@ -121,17 +157,45 @@ def main(): url=URL_TEST_CLEAN, md5sum=MD5_TEST_CLEAN, target_dir=os.path.join(args.target_dir, "test-clean"), - manifest_path=args.manifest_prefix + ".test-clean") + manifest_path=args.manifest_prefix + ".test-clean", + num_lines=NUM_LINES_TEST_CLEAN) prepare_dataset( url=URL_DEV_CLEAN, md5sum=MD5_DEV_CLEAN, target_dir=os.path.join(args.target_dir, "dev-clean"), - manifest_path=args.manifest_prefix + ".dev-clean") + manifest_path=args.manifest_prefix + ".dev-clean", + num_lines=NUM_LINES_DEV_CLEAN) prepare_dataset( url=URL_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100, target_dir=os.path.join(args.target_dir, "train-clean-100"), - manifest_path=args.manifest_prefix + ".train-clean-100") + manifest_path=args.manifest_prefix + ".train-clean-100", + num_lines=NUM_LINES_TRAIN_CLEAN_100) + if args.full_download: + prepare_dataset( + url=URL_TEST_OTHER, + md5sum=MD5_TEST_OTHER, + target_dir=os.path.join(args.target_dir, "test-other"), + manifest_path=args.manifest_prefix + ".test-other", + num_lines=NUM_LINES_TEST_OTHER) + prepare_dataset( + url=URL_DEV_OTHER, + md5sum=MD5_DEV_OTHER, + target_dir=os.path.join(args.target_dir, "dev-other"), + manifest_path=args.manifest_prefix + ".dev-other", + num_lines=NUM_LINES_DEV_OTHER) + prepare_dataset( + url=URL_TRAIN_CLEAN_360, + md5sum=MD5_TRAIN_CLEAN_360, + target_dir=os.path.join(args.target_dir, "train-clean-360"), + manifest_path=args.manifest_prefix + ".train-clean-360", + num_lines=NUM_LINES_TRAIN_CLEAN_360) + prepare_dataset( + url=URL_TRAIN_OTHER_500, + md5sum=MD5_TRAIN_OTHER_500, + target_dir=os.path.join(args.target_dir, "train-other-500"), + manifest_path=args.manifest_prefix + ".train-other-500", + num_lines=NUM_LINES_TRAIN_OTHER_500) if __name__ == '__main__': From f49eab5fec2b478a7822f6459e4a8e7023f65df1 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 7 Jun 2017 19:11:21 +0800 Subject: [PATCH 13/55] Change assert to exception raising. --- train.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index 14c7cf63..89ab23c6 100644 --- a/train.py +++ b/train.py @@ -81,9 +81,11 @@ parser.add_argument( help="Vocabulary filepath. 
(default: %(default)s)") parser.add_argument( "--init_model_path", - default='models/params.tar.gz', + default=None, type=str, - help="Model path for initialization. (default: %(default)s)") + help="If set None, the training will start from scratch. " + "Otherwise, the training will resume from " + "the existing model of this path. (default: %(default)s)") args = parser.parse_args() @@ -124,7 +126,8 @@ def train(): if args.init_model_path is None: parameters = paddle.parameters.create(cost) else: - assert os.path.isfile(args.init_model_path), "Invalid model." + if not os.path.isfile(args.init_model_path): + raise IOError("Invalid model!") parameters = paddle.parameters.Parameters.from_tar( gzip.open(args.init_model_path)) optimizer = paddle.optimizer.Adam( From cc2a4d4e3df7eece1430d9a9ffdda9f104509154 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Thu, 8 Jun 2017 17:18:38 +0800 Subject: [PATCH 14/55] Add error rate calculation script. --- error_rate.py | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 error_rate.py diff --git a/error_rate.py b/error_rate.py new file mode 100644 index 00000000..4739238e --- /dev/null +++ b/error_rate.py @@ -0,0 +1,138 @@ +# -- * -- coding: utf-8 -- * -- +import numpy as np + + +def levenshtein_distance(ref, hyp): + ref_len = len(ref) + hyp_len = len(hyp) + + # special case + if ref == hyp: + return 0 + if ref_len == 0: + return hyp_len + if hyp_len == 0: + return ref_len + + distance = np.zeros((ref_len + 1) * (hyp_len + 1), dtype=np.int64) + distance = distance.reshape((ref_len + 1, hyp_len + 1)) + + # initialization distance matrix + for j in xrange(hyp_len + 1): + distance[0][j] = j + for i in xrange(ref_len + 1): + distance[i][0] = i + + # calculate levenshtein distance + for i in xrange(1, ref_len + 1): + for j in xrange(1, hyp_len + 1): + if ref[i - 1] == hyp[j - 1]: + distance[i][j] = distance[i - 1][j - 1] + else: + s_num = distance[i - 1][j - 1] + 1 + i_num = distance[i][j - 1] + 1 + d_num = distance[i - 1][j] + 1 + distance[i][j] = min(s_num, i_num, d_num) + + return distance[ref_len][hyp_len] + + +def wer(reference, hypophysis, delimiter=' ', filter_none=True): + """ + Calculate word error rate (WER). WER is a popular evaluation metric used + in speech recognition. It compares a reference to an hypophysis and + is defined like this: + + .. math:: + WER = (Sw + Dw + Iw) / Nw + + where + + .. code-block:: text + + Sw is the number of words subsituted, + Dw is the number of words deleted, + Iw is the number of words inserted, + Nw is the number of words in the reference + + We can use levenshtein distance to calculate WER. Take an attention that + this function will truncate the beginning and ending delimiter for + reference and hypophysis sentences before calculating WER. + + :param reference: The reference sentence. + :type reference: str + :param hypophysis: The hypophysis sentence. + :type reference: str + :param delimiter: Delimiter of input sentences. + :type delimiter: char + :param filter_none: Whether to remove None value when splitting sentence. 
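+                        For example, splitting "hi  there" on ' ' yields
+                        ['hi', '', 'there']; with filter_none set True the
+                        empty token is dropped before computing WER.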
+    :type filter_none: bool
+    :return: WER
+    :rtype: float
+    """
+
+    if len(reference.strip(delimiter)) == 0:
+        raise ValueError("Reference's word number should be greater than 0.")
+
+    if filter_none == True:
+        ref_words = filter(None, reference.strip(delimiter).split(delimiter))
+        hyp_words = filter(None, hypophysis.strip(delimiter).split(delimiter))
+    else:
+        ref_words = reference.strip(delimiter).split(delimiter)
+        hyp_words = hypophysis.strip(delimiter).split(delimiter)
+
+    edit_distance = levenshtein_distance(ref_words, hyp_words)
+    wer = float(edit_distance) / len(ref_words)
+    return wer
+
+
+def cer(reference, hypophysis, squeeze=True, ignore_case=False, strip_char=''):
+    """
+    Calculate character error rate (CER). CER compares reference text and
+    hypophysis text at char-level. CER is defined as:
+
+    .. math::
+        CER = (Sc + Dc + Ic) / Nc
+
+    where
+
+    .. code-block:: text
+
+        Sc is the number of characters substituted,
+        Dc is the number of characters deleted,
+        Ic is the number of characters inserted,
+        Nc is the number of characters in the reference
+
+    We can use levenshtein distance to calculate CER. Chinese input should be
+    encoded to unicode.
+
+    :param reference: The reference sentence.
+    :type reference: str
+    :param hypophysis: The hypophysis sentence.
+    :type reference: str
+    :param squeeze: If set true, consecutive space character
+                    will be squeezed to one
+    :type squeezed: bool
+    :param ignore_case: Whether ignoring character case.
+    :type ignore_case: bool
+    :param strip_char: If not set to '', strip_char in beginning and ending of
+                       sentence will be truncated.
+    :type strip_char: char
+    :return: CER
+    :rtype: float
+    """
+    if ignore_case == True:
+        reference = reference.lower()
+        hypophysis = hypophysis.lower()
+    if strip_char != '':
+        reference = reference.strip(strip_char)
+        hypophysis = hypophysis.strip(strip_char)
+    if squeeze == True:
+        reference = ' '.join(filter(None, reference.split(' ')))
+        hypophysis = ' '.join(filter(None, hypophysis.split(' ')))
+
+    if len(reference) == 0:
+        raise ValueError("Length of reference should be greater than 0.")
+    edit_distance = levenshtein_distance(reference, hypophysis)
+    cer = float(edit_distance) / len(reference)
+    return cer
From 3f63e069e098b94fc64d59ac2c297271242cb3c1 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Thu, 8 Jun 2017 21:35:17 +0800
Subject: [PATCH 15/55] Fix typos and follow comments.

---
 error_rate.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/error_rate.py b/error_rate.py
index 4739238e..f216177e 100644
--- a/error_rate.py
+++ b/error_rate.py
@@ -14,8 +14,7 @@ def levenshtein_distance(ref, hyp):
     if hyp_len == 0:
         return ref_len
 
-    distance = np.zeros((ref_len + 1) * (hyp_len + 1), dtype=np.int64)
-    distance = distance.reshape((ref_len + 1, hyp_len + 1))
+    distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int64)
 
     # initialization distance matrix
     for j in xrange(hyp_len + 1):
@@ -40,7 +39,7 @@
 def wer(reference, hypophysis, delimiter=' ', filter_none=True):
     """
     Calculate word error rate (WER). WER is a popular evaluation metric used
-    in speech recognition. It compares a reference to an hypophysis and
+    in speech recognition. It compares a reference with an hypophysis and
     is defined like this:
 
     .. math::
@@ -55,8 +54,8 @@ def wer(reference, hypophysis, delimiter=' ', filter_none=True):
         Iw is the number of words inserted,
         Nw is the number of words in the reference
 
-    We can use levenshtein distance to calculate WER. 
Take an attention that - this function will truncate the beginning and ending delimiter for + We can use levenshtein distance to calculate WER. Please draw an attention + that this function will truncate the beginning and ending delimiter for reference and hypophysis sentences before calculating WER. :param reference: The reference sentence. @@ -111,12 +110,12 @@ def cer(reference, hypophysis, squeeze=True, ignore_case=False, strip_char=''): :param hypophysis: The hypophysis sentence. :type reference: str :param squeeze: If set true, consecutive space character - will be squeezed to one - :type squeezed: bool - :param ignore_case: Whether ignoring character case. + will be squeezed to one + :type squeeze: bool + :param ignore_case: Whether case-sensitive or not. :type ignore_case: bool :param strip_char: If not set to '', strip_char in beginning and ending of - sentence will be truncated. + sentence will be truncated. :type strip_char: char :return: CER :rtype: float From 06e9f713899f2118c08753bfe40bd2abf4d152b2 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 8 Jun 2017 22:20:11 +0800 Subject: [PATCH 16/55] Remove manifest's line number check from librispeech.py and update README.md. --- README.md | 4 +++ data/librispeech.py | 69 ++++++++++++++------------------------------- 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 403511d5..7a372e9b 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,10 @@ cat manifest.libri.train-* > manifest.libri.train-all cd .. ``` +After running librispeech.py, we have several "manifest" json files named with a prefix `manifest.libri.`. A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcription text, audio duration) of each audio file within the data set, in json format. + +By `cat manifest.libri.train-* > manifest.libri.train-all`, we simply merge the three seperate sample sets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way for merging different data sets. + More help for arguments: ``` diff --git a/data/librispeech.py b/data/librispeech.py index 8bc33575..653caa92 100644 --- a/data/librispeech.py +++ b/data/librispeech.py @@ -1,10 +1,9 @@ """ - Download, unpack and create manifest file for the Librespeech dataset. + Download, unpack and create manifest json files for the Librespeech dataset. - A manifest file is a dataset summarization, with each line a json format - string containing meta data for one audio clip, including its filepath, - transcription string, and duration. It serves as a unified interface for - different data sets. + A manifest is a json file summarizing filelist in a data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file in the data set. 
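+
+    An illustrative manifest line (the filepath and values below are made-up
+    examples, not real dataset entries):
+
+    {"audio_filepath": "/path/to/LibriSpeech/dev-clean/xxx.flac",
+     "duration": 5.86, "text": "some transcription text"}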
""" import paddle.v2 as paddle @@ -36,14 +35,6 @@ MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" -NUM_LINES_TEST_CLEAN = 2620 -NUM_LINES_TEST_OTHER = 2939 -NUM_LINES_DEV_CLEAN = 2703 -NUM_LINES_DEV_OTHER = 2864 -NUM_LINES_TRAIN_CLEAN_100 = 28539 -NUM_LINES_TRAIN_CLEAN_360 = 104014 -NUM_LINES_TRAIN_OTHER_500 = 148688 - parser = argparse.ArgumentParser( description='Downloads and prepare LibriSpeech dataset.') parser.add_argument( @@ -95,12 +86,9 @@ def unpack(filepath, target_dir): def create_manifest(data_dir, manifest_path): """ - Create a manifest file summarizing the dataset (list of filepath and meta - data). - - Each line of the manifest contains one audio clip filepath, its - transcription text string, and its duration. Manifest file servers as a - unified interfance to organize data sets. + Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. """ print("Creating manifest %s ..." % manifest_path) json_lines = [] @@ -128,28 +116,20 @@ def create_manifest(data_dir, manifest_path): out_file.write(line + '\n') -def verify_file_line_number(filepath, num_lines): - with open(filepath, 'r') as file: - return len(file.readlines()) == num_lines - - -def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines): +def prepare_dataset(url, md5sum, target_dir, manifest_path): """ Download, unpack and create summmary manifest file. """ - # download - filepath = download(url, md5sum, target_dir) - # unpack if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + # download + filepath = download(url, md5sum, target_dir) + # unpack unpack(filepath, target_dir) else: - print("Unpacked data exists, skip unpacking.") - # create manifest and verify line number + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + # create manifest json file create_manifest(target_dir, manifest_path) - if not verify_file_line_number(manifest_path, num_lines): - raise RuntimeError("Manifest line number check failed. 
" - "Please remove directory and try running the script " - "again.") def main(): @@ -157,45 +137,38 @@ def main(): url=URL_TEST_CLEAN, md5sum=MD5_TEST_CLEAN, target_dir=os.path.join(args.target_dir, "test-clean"), - manifest_path=args.manifest_prefix + ".test-clean", - num_lines=NUM_LINES_TEST_CLEAN) + manifest_path=args.manifest_prefix + ".test-clean") prepare_dataset( url=URL_DEV_CLEAN, md5sum=MD5_DEV_CLEAN, target_dir=os.path.join(args.target_dir, "dev-clean"), - manifest_path=args.manifest_prefix + ".dev-clean", - num_lines=NUM_LINES_DEV_CLEAN) + manifest_path=args.manifest_prefix + ".dev-clean") prepare_dataset( url=URL_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100, target_dir=os.path.join(args.target_dir, "train-clean-100"), - manifest_path=args.manifest_prefix + ".train-clean-100", - num_lines=NUM_LINES_TRAIN_CLEAN_100) + manifest_path=args.manifest_prefix + ".train-clean-100") if args.full_download: prepare_dataset( url=URL_TEST_OTHER, md5sum=MD5_TEST_OTHER, target_dir=os.path.join(args.target_dir, "test-other"), - manifest_path=args.manifest_prefix + ".test-other", - num_lines=NUM_LINES_TEST_OTHER) + manifest_path=args.manifest_prefix + ".test-other") prepare_dataset( url=URL_DEV_OTHER, md5sum=MD5_DEV_OTHER, target_dir=os.path.join(args.target_dir, "dev-other"), - manifest_path=args.manifest_prefix + ".dev-other", - num_lines=NUM_LINES_DEV_OTHER) + manifest_path=args.manifest_prefix + ".dev-other") prepare_dataset( url=URL_TRAIN_CLEAN_360, md5sum=MD5_TRAIN_CLEAN_360, target_dir=os.path.join(args.target_dir, "train-clean-360"), - manifest_path=args.manifest_prefix + ".train-clean-360", - num_lines=NUM_LINES_TRAIN_CLEAN_360) + manifest_path=args.manifest_prefix + ".train-clean-360") prepare_dataset( url=URL_TRAIN_OTHER_500, md5sum=MD5_TRAIN_OTHER_500, target_dir=os.path.join(args.target_dir, "train-other-500"), - manifest_path=args.manifest_prefix + ".train-other-500", - num_lines=NUM_LINES_TRAIN_OTHER_500) + manifest_path=args.manifest_prefix + ".train-other-500") if __name__ == '__main__': From c25c62b8f9544e488bdc696c6e8021e09661eb42 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 12 Jun 2017 19:06:55 +0800 Subject: [PATCH 17/55] refine audio_data_utils.py --- audio_data_utils.py | 68 +++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/audio_data_utils.py b/audio_data_utils.py index abb7f1e9..692a4280 100644 --- a/audio_data_utils.py +++ b/audio_data_utils.py @@ -247,22 +247,25 @@ class DataGenerator(object): new_batch.append((padded_audio, text)) return new_batch - def __batch_shuffle__(self, manifest, batch_size): + def __batch_shuffle__(self, manifest, batch_shuffle_size): """ 1. Sort the audio clips by duration. - 2. Generate a random number `k`, k in [0, batch_size). + 2. Generate a random number `k`, k in [0, batch_shuffle_size). 3. Randomly remove `k` instances in order to make different mini-batches, - then make minibatches and each minibatch size is batch_size. + then make minibatches and each minibatch size is batch_shuffle_size. 4. Shuffle the minibatches. :param manifest: manifest file. :type manifest: list - :param batch_size: batch size. - :type batch_size: int + :param batch_shuffle_size: This size is uesed to generate a random number, + it usually equals to batch size. + :type batch_shuffle_size: int + :return: batch shuffled mainifest. 
+ :rtype: list """ manifest.sort(key=lambda x: x["duration"]) - shift_len = self.__random__.randint(0, batch_size - 1) - batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size) + shift_len = self.__random__.randint(0, batch_shuffle_size - 1) + batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_shuffle_size) self.__random__.shuffle(batch_manifest) batch_manifest = list(sum(batch_manifest, ())) res_len = len(manifest) - shift_len - len(batch_manifest) @@ -270,11 +273,7 @@ class DataGenerator(object): batch_manifest.extend(manifest[0:shift_len]) return batch_manifest - def instance_reader_creator(self, - manifest_path, - batch_size, - sortagrad=True, - shuffle=False): + def instance_reader_creator(self, manifest): """ Instance reader creator for audio data. Creat a callable function to produce instances of data. @@ -282,35 +281,19 @@ class DataGenerator(object): Instance: a tuple of a numpy ndarray of audio spectrogram and a list of tokenized and indexed transcription text. - :param manifest_path: Filepath of manifest for audio clip files. - :type manifest_path: basestring - :param sortagrad: Sort the audio clips by duration in the first epoc - if set True. - :type sortagrad: bool - :param shuffle: Shuffle the audio clips if set True. - :type shuffle: bool + :param manifest: Filepath of manifest for audio clip files. + :type manifest: basestring :return: Data reader function. :rtype: callable """ def reader(): - # read manifest - manifest = self.__read_manifest__( - manifest_path=manifest_path, - max_duration=self.__max_duration__, - min_duration=self.__min_duration__) - # sort (by duration) or shuffle manifest - if self.__epoc__ == 0 and sortagrad: - manifest.sort(key=lambda x: x["duration"]) - elif shuffle: - manifest = self.__batch_shuffle__(manifest, batch_size) # extract spectrogram feature for instance in manifest: spectrogram = self.__audio_featurize__( instance["audio_filepath"]) transcript = self.__text_featurize__(instance["text"]) yield (spectrogram, transcript) - self.__epoc__ += 1 return reader @@ -320,7 +303,7 @@ class DataGenerator(object): padding_to=-1, flatten=False, sortagrad=False, - shuffle=False): + batch_shuffle=False): """ Batch data reader creator for audio data. Creat a callable function to produce batches of data. @@ -343,18 +326,28 @@ class DataGenerator(object): :param sortagrad: Sort the audio clips by duration in the first epoc if set True. :type sortagrad: bool - :param shuffle: Shuffle the audio clips if set True. - :type shuffle: bool + :param batch_shuffle: Shuffle the audio clips if set True. It is + not a thorough instance-wise shuffle, + but a specific batch-wise shuffle. + :type batch_shuffle: bool :return: Batch reader function, producing batches of data when called. 
:rtype: callable """ def batch_reader(): - instance_reader = self.instance_reader_creator( + # read manifest + manifest = self.__read_manifest__( manifest_path=manifest_path, - batch_size=batch_size, - sortagrad=sortagrad, - shuffle=shuffle) + max_duration=self.__max_duration__, + min_duration=self.__min_duration__) + + # sort (by duration) or shuffle manifest + if self.__epoc__ == 0 and sortagrad: + manifest.sort(key=lambda x: x["duration"]) + elif batch_shuffle: + manifest = self.__batch_shuffle__(manifest, batch_size) + + instance_reader = self.instance_reader_creator(manifest) batch = [] for instance in instance_reader(): batch.append(instance) @@ -363,6 +356,7 @@ class DataGenerator(object): batch = [] if len(batch) > 0: yield self.__padding_batch__(batch, padding_to, flatten) + self.__epoc__ += 1 return batch_reader From 9c27b1d14e601ff64df6e5dacc95d77933e2b39a Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 12 Jun 2017 19:53:41 +0800 Subject: [PATCH 18/55] add more comments and update train.py --- audio_data_utils.py | 30 ++++++++++++++++++++---------- train.py | 6 +++--- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/audio_data_utils.py b/audio_data_utils.py index 692a4280..1cd29be1 100644 --- a/audio_data_utils.py +++ b/audio_data_utils.py @@ -247,25 +247,34 @@ class DataGenerator(object): new_batch.append((padded_audio, text)) return new_batch - def __batch_shuffle__(self, manifest, batch_shuffle_size): + def __batch_shuffle__(self, manifest, batch_size): """ + The instances have different lengths and they cannot be + combined into a single matrix multiplication. It usually + sorts the training examples by length and combines only + similarly-sized instances into minibatches, pads with + silence when necessary so that all instances in a batch + have the same length. This batch shuffle fuction is used + to make similarly-sized instances into minibatches and + make a batch-wise shuffle. + 1. Sort the audio clips by duration. - 2. Generate a random number `k`, k in [0, batch_shuffle_size). + 2. Generate a random number `k`, k in [0, batch_size). 3. Randomly remove `k` instances in order to make different mini-batches, - then make minibatches and each minibatch size is batch_shuffle_size. + then make minibatches and each minibatch size is batch_size. 4. Shuffle the minibatches. :param manifest: manifest file. :type manifest: list - :param batch_shuffle_size: This size is uesed to generate a random number, - it usually equals to batch size. - :type batch_shuffle_size: int + :param batch_size: Batch size. This size is also used for generate + a random number for batch shuffle. + :type batch_size: int :return: batch shuffled mainifest. :rtype: list """ manifest.sort(key=lambda x: x["duration"]) - shift_len = self.__random__.randint(0, batch_shuffle_size - 1) - batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_shuffle_size) + shift_len = self.__random__.randint(0, batch_size - 1) + batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size) self.__random__.shuffle(batch_manifest) batch_manifest = list(sum(batch_manifest, ())) res_len = len(manifest) - shift_len - len(batch_manifest) @@ -327,8 +336,9 @@ class DataGenerator(object): if set True. :type sortagrad: bool :param batch_shuffle: Shuffle the audio clips if set True. It is - not a thorough instance-wise shuffle, - but a specific batch-wise shuffle. + not a thorough instance-wise shuffle, but a + specific batch-wise shuffle. For more details, + please see `__batch_shuffle__` function. 
:type batch_shuffle: bool :return: Batch reader function, producing batches of data when called. :rtype: callable diff --git a/train.py b/train.py index eb9b56de..957c2426 100644 --- a/train.py +++ b/train.py @@ -143,12 +143,12 @@ def train(): train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest_path, batch_size=args.batch_size, - sortagrad=True, - shuffle=True) + sortagrad=True if args.init_model_path is None else False, + batch_shuffle=True) test_batch_reader = test_generator.batch_reader_creator( manifest_path=args.dev_manifest_path, batch_size=args.batch_size, - shuffle=False) + batch_shuffle=False) feeding = train_generator.data_name_feeding() # create event handler From cd3617aeb4df0dbe998060ba410c782856b2abf3 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 12 Jun 2017 23:19:40 +0800 Subject: [PATCH 19/55] Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. --- audio_data_utils.py | 411 ------------------ compute_mean_std.py | 56 +++ data_utils/__init__.py | 0 data_utils/audio.py | 68 +++ data_utils/augmentor/__init__.py | 0 data_utils/augmentor/augmentation.py | 38 ++ data_utils/augmentor/base.py | 17 + data_utils/augmentor/volumn_perturb.py | 17 + data_utils/data.py | 247 +++++++++++ data_utils/featurizer/__init__.py | 0 data_utils/featurizer/audio_featurizer.py | 86 ++++ data_utils/featurizer/speech_featurizer.py | 32 ++ data_utils/featurizer/text_featurizer.py | 39 ++ data_utils/normalizer.py | 49 +++ data_utils/utils.py | 19 + {data => datasets/librispeech}/librispeech.py | 2 +- datasets/run_all.sh | 13 + {data => datasets/vocab}/eng_vocab.txt | 0 infer.py | 61 ++- train.py | 74 ++-- 20 files changed, 750 insertions(+), 479 deletions(-) delete mode 100644 audio_data_utils.py create mode 100755 compute_mean_std.py create mode 100755 data_utils/__init__.py create mode 100755 data_utils/audio.py create mode 100755 data_utils/augmentor/__init__.py create mode 100755 data_utils/augmentor/augmentation.py create mode 100755 data_utils/augmentor/base.py create mode 100755 data_utils/augmentor/volumn_perturb.py create mode 100644 data_utils/data.py create mode 100755 data_utils/featurizer/__init__.py create mode 100755 data_utils/featurizer/audio_featurizer.py create mode 100755 data_utils/featurizer/speech_featurizer.py create mode 100755 data_utils/featurizer/text_featurizer.py create mode 100755 data_utils/normalizer.py create mode 100755 data_utils/utils.py rename {data => datasets/librispeech}/librispeech.py (99%) create mode 100755 datasets/run_all.sh rename {data => datasets/vocab}/eng_vocab.txt (100%) diff --git a/audio_data_utils.py b/audio_data_utils.py deleted file mode 100644 index 1cd29be1..00000000 --- a/audio_data_utils.py +++ /dev/null @@ -1,411 +0,0 @@ -""" - Providing basic audio data preprocessing pipeline, and offering - both instance-level and batch-level data reader interfaces. 
-""" -import paddle.v2 as paddle -import logging -import json -import random -import soundfile -import numpy as np -import itertools -import os - -RANDOM_SEED = 0 -logger = logging.getLogger(__name__) - - -class DataGenerator(object): - """ - DataGenerator provides basic audio data preprocessing pipeline, and offers - both instance-level and batch-level data reader interfaces. - Normalized FFT are used as audio features here. - - :param vocab_filepath: Vocabulary file path for indexing tokenized - transcriptions. - :type vocab_filepath: basestring - :param normalizer_manifest_path: Manifest filepath for collecting feature - normalization statistics, e.g. mean, std. - :type normalizer_manifest_path: basestring - :param normalizer_num_samples: Number of instances sampled for collecting - feature normalization statistics. - Default is 100. - :type normalizer_num_samples: int - :param max_duration: Audio clips with duration (in seconds) greater than - this will be discarded. Default is 20.0. - :type max_duration: float - :param min_duration: Audio clips with duration (in seconds) smaller than - this will be discarded. Default is 0.0. - :type min_duration: float - :param stride_ms: Striding size (in milliseconds) for generating frames. - Default is 10.0. - :type stride_ms: float - :param window_ms: Window size (in milliseconds) for frames. Default is 20.0. - :type window_ms: float - :param max_frequency: Maximun frequency for FFT features. FFT features of - frequency larger than this will be discarded. - If set None, all features will be kept. - Default is None. - :type max_frequency: float - """ - - def __init__(self, - vocab_filepath, - normalizer_manifest_path, - normalizer_num_samples=100, - max_duration=20.0, - min_duration=0.0, - stride_ms=10.0, - window_ms=20.0, - max_frequency=None): - self.__max_duration__ = max_duration - self.__min_duration__ = min_duration - self.__stride_ms__ = stride_ms - self.__window_ms__ = window_ms - self.__max_frequency__ = max_frequency - self.__epoc__ = 0 - self.__random__ = random.Random(RANDOM_SEED) - # load vocabulary (dictionary) - self.__vocab_dict__, self.__vocab_list__ = \ - self.__load_vocabulary_from_file__(vocab_filepath) - # collect normalizer statistics - self.__mean__, self.__std__ = self.__collect_normalizer_statistics__( - manifest_path=normalizer_manifest_path, - num_samples=normalizer_num_samples) - - def __audio_featurize__(self, audio_filename): - """ - Preprocess audio data, including feature extraction, normalization etc.. - """ - features = self.__audio_basic_featurize__(audio_filename) - return self.__normalize__(features) - - def __text_featurize__(self, text): - """ - Preprocess text data, including tokenizing and token indexing etc.. - """ - return self.__convert_text_to_char_index__( - text=text, vocabulary=self.__vocab_dict__) - - def __audio_basic_featurize__(self, audio_filename): - """ - Compute basic (without normalization etc.) features for audio data. - """ - return self.__spectrogram_from_file__( - filename=audio_filename, - stride_ms=self.__stride_ms__, - window_ms=self.__window_ms__, - max_freq=self.__max_frequency__) - - def __collect_normalizer_statistics__(self, manifest_path, num_samples=100): - """ - Compute feature normalization statistics, i.e. mean and stddev. 
- """ - # read manifest - manifest = self.__read_manifest__( - manifest_path=manifest_path, - max_duration=self.__max_duration__, - min_duration=self.__min_duration__) - # sample for statistics - sampled_manifest = self.__random__.sample(manifest, num_samples) - # extract spectrogram feature - features = [] - for instance in sampled_manifest: - spectrogram = self.__audio_basic_featurize__( - instance["audio_filepath"]) - features.append(spectrogram) - features = np.hstack(features) - mean = np.mean(features, axis=1).reshape([-1, 1]) - std = np.std(features, axis=1).reshape([-1, 1]) - return mean, std - - def __normalize__(self, features, eps=1e-14): - """ - Normalize features to be of zero mean and unit stddev. - """ - return (features - self.__mean__) / (self.__std__ + eps) - - def __spectrogram_from_file__(self, - filename, - stride_ms=10.0, - window_ms=20.0, - max_freq=None, - eps=1e-14): - """ - Laod audio data and calculate the log of spectrogram by FFT. - Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech - """ - audio, sample_rate = soundfile.read(filename) - if audio.ndim >= 2: - audio = np.mean(audio, 1) - if max_freq is None: - max_freq = sample_rate / 2 - if max_freq > sample_rate / 2: - raise ValueError("max_freq must be greater than half of " - "sample rate.") - if stride_ms > window_ms: - raise ValueError("Stride size must not be greater than " - "window size.") - stride_size = int(0.001 * sample_rate * stride_ms) - window_size = int(0.001 * sample_rate * window_ms) - spectrogram, freqs = self.__extract_spectrogram__( - audio, - window_size=window_size, - stride_size=stride_size, - sample_rate=sample_rate) - ind = np.where(freqs <= max_freq)[0][-1] + 1 - return np.log(spectrogram[:ind, :] + eps) - - def __extract_spectrogram__(self, samples, window_size, stride_size, - sample_rate): - """ - Compute the spectrogram by FFT for a discrete real signal. - Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech - """ - # extract strided windows - truncate_size = (len(samples) - window_size) % stride_size - samples = samples[:len(samples) - truncate_size] - nshape = (window_size, (len(samples) - window_size) // stride_size + 1) - nstrides = (samples.strides[0], samples.strides[0] * stride_size) - windows = np.lib.stride_tricks.as_strided( - samples, shape=nshape, strides=nstrides) - assert np.all( - windows[:, 1] == samples[stride_size:(stride_size + window_size)]) - # window weighting, squared Fast Fourier Transform (fft), scaling - weighting = np.hanning(window_size)[:, None] - fft = np.fft.rfft(windows * weighting, axis=0) - fft = np.absolute(fft)**2 - scale = np.sum(weighting**2) * sample_rate - fft[1:-1, :] *= (2.0 / scale) - fft[(0, -1), :] /= scale - # prepare fft frequency list - freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) - return fft, freqs - - def __load_vocabulary_from_file__(self, vocabulary_path): - """ - Load vocabulary from file. - """ - if not os.path.exists(vocabulary_path): - raise ValueError("Vocabulary file %s not found.", vocabulary_path) - vocab_lines = [] - with open(vocabulary_path, 'r') as file: - vocab_lines.extend(file.readlines()) - vocab_list = [line[:-1] for line in vocab_lines] - vocab_dict = dict( - [(token, id) for (id, token) in enumerate(vocab_list)]) - return vocab_dict, vocab_list - - def __convert_text_to_char_index__(self, text, vocabulary): - """ - Convert text string to a list of character index integers. 
- """ - return [vocabulary[w] for w in text] - - def __read_manifest__(self, manifest_path, max_duration, min_duration): - """ - Load and parse manifest file. - """ - manifest = [] - for json_line in open(manifest_path): - try: - json_data = json.loads(json_line) - except Exception as e: - raise ValueError("Error reading manifest: %s" % str(e)) - if (json_data["duration"] <= max_duration and - json_data["duration"] >= min_duration): - manifest.append(json_data) - return manifest - - def __padding_batch__(self, batch, padding_to=-1, flatten=False): - """ - Padding audio part of features (only in the time axis -- column axis) - with zeros, to make each instance in the batch share the same - audio feature shape. - - If `padding_to` is set -1, the maximun column numbers in the batch will - be used as the target size. Otherwise, `padding_to` will be the target - size. Default is -1. - - If `flatten` is set True, audio data will be flatten to be a 1-dim - ndarray. Default is False. - """ - new_batch = [] - # get target shape - max_length = max([audio.shape[1] for audio, text in batch]) - if padding_to != -1: - if padding_to < max_length: - raise ValueError("If padding_to is not -1, it should be greater" - " or equal to the original instance length.") - max_length = padding_to - # padding - for audio, text in batch: - padded_audio = np.zeros([audio.shape[0], max_length]) - padded_audio[:, :audio.shape[1]] = audio - if flatten: - padded_audio = padded_audio.flatten() - new_batch.append((padded_audio, text)) - return new_batch - - def __batch_shuffle__(self, manifest, batch_size): - """ - The instances have different lengths and they cannot be - combined into a single matrix multiplication. It usually - sorts the training examples by length and combines only - similarly-sized instances into minibatches, pads with - silence when necessary so that all instances in a batch - have the same length. This batch shuffle fuction is used - to make similarly-sized instances into minibatches and - make a batch-wise shuffle. - - 1. Sort the audio clips by duration. - 2. Generate a random number `k`, k in [0, batch_size). - 3. Randomly remove `k` instances in order to make different mini-batches, - then make minibatches and each minibatch size is batch_size. - 4. Shuffle the minibatches. - - :param manifest: manifest file. - :type manifest: list - :param batch_size: Batch size. This size is also used for generate - a random number for batch shuffle. - :type batch_size: int - :return: batch shuffled mainifest. - :rtype: list - """ - manifest.sort(key=lambda x: x["duration"]) - shift_len = self.__random__.randint(0, batch_size - 1) - batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size) - self.__random__.shuffle(batch_manifest) - batch_manifest = list(sum(batch_manifest, ())) - res_len = len(manifest) - shift_len - len(batch_manifest) - batch_manifest.extend(manifest[-res_len:]) - batch_manifest.extend(manifest[0:shift_len]) - return batch_manifest - - def instance_reader_creator(self, manifest): - """ - Instance reader creator for audio data. Creat a callable function to - produce instances of data. - - Instance: a tuple of a numpy ndarray of audio spectrogram and a list of - tokenized and indexed transcription text. - - :param manifest: Filepath of manifest for audio clip files. - :type manifest: basestring - :return: Data reader function. 
- :rtype: callable - """ - - def reader(): - # extract spectrogram feature - for instance in manifest: - spectrogram = self.__audio_featurize__( - instance["audio_filepath"]) - transcript = self.__text_featurize__(instance["text"]) - yield (spectrogram, transcript) - - return reader - - def batch_reader_creator(self, - manifest_path, - batch_size, - padding_to=-1, - flatten=False, - sortagrad=False, - batch_shuffle=False): - """ - Batch data reader creator for audio data. Creat a callable function to - produce batches of data. - - Audio features will be padded with zeros to make each instance in the - batch to share the same audio feature shape. - - :param manifest_path: Filepath of manifest for audio clip files. - :type manifest_path: basestring - :param batch_size: Instance number in a batch. - :type batch_size: int - :param padding_to: If set -1, the maximun column numbers in the batch - will be used as the target size for padding. - Otherwise, `padding_to` will be the target size. - Default is -1. - :type padding_to: int - :param flatten: If set True, audio data will be flatten to be a 1-dim - ndarray. Otherwise, 2-dim ndarray. Default is False. - :type flatten: bool - :param sortagrad: Sort the audio clips by duration in the first epoc - if set True. - :type sortagrad: bool - :param batch_shuffle: Shuffle the audio clips if set True. It is - not a thorough instance-wise shuffle, but a - specific batch-wise shuffle. For more details, - please see `__batch_shuffle__` function. - :type batch_shuffle: bool - :return: Batch reader function, producing batches of data when called. - :rtype: callable - """ - - def batch_reader(): - # read manifest - manifest = self.__read_manifest__( - manifest_path=manifest_path, - max_duration=self.__max_duration__, - min_duration=self.__min_duration__) - - # sort (by duration) or shuffle manifest - if self.__epoc__ == 0 and sortagrad: - manifest.sort(key=lambda x: x["duration"]) - elif batch_shuffle: - manifest = self.__batch_shuffle__(manifest, batch_size) - - instance_reader = self.instance_reader_creator(manifest) - batch = [] - for instance in instance_reader(): - batch.append(instance) - if len(batch) == batch_size: - yield self.__padding_batch__(batch, padding_to, flatten) - batch = [] - if len(batch) > 0: - yield self.__padding_batch__(batch, padding_to, flatten) - self.__epoc__ += 1 - - return batch_reader - - def vocabulary_size(self): - """ - Get vocabulary size. - - :return: Vocabulary size. - :rtype: int - """ - return len(self.__vocab_list__) - - def vocabulary_dict(self): - """ - Get vocabulary in dict. - - :return: Vocabulary in dict. - :rtype: dict - """ - return self.__vocab_dict__ - - def vocabulary_list(self): - """ - Get vocabulary in list. - - :return: Vocabulary in list - :rtype: list - """ - return self.__vocab_list__ - - def data_name_feeding(self): - """ - Get feeddings (data field name and corresponding field id). - - :return: Feeding dict. 
-        :rtype: dict
-        """
-        feeding = {
-            "audio_spectrogram": 0,
-            "transcript_text": 1,
-        }
-        return feeding
diff --git a/compute_mean_std.py b/compute_mean_std.py
new file mode 100755
index 00000000..b3015df7
--- /dev/null
+++ b/compute_mean_std.py
@@ -0,0 +1,56 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+from data_utils.normalizer import FeatureNormalizer
+from data_utils.augmentor.augmentation import AugmentationPipeline
+from data_utils.featurizer.audio_featurizer import AudioFeaturizer
+
+parser = argparse.ArgumentParser(
+    description='Computing mean and stddev for feature normalizer.')
+parser.add_argument(
+    "--manifest_path",
+    default='datasets/manifest.train',
+    type=str,
+    help="Manifest path for computing normalizer's mean and stddev."
+    "(default: %(default)s)")
+parser.add_argument(
+    "--num_samples",
+    default=500,
+    type=int,
+    help="Number of samples for computing mean and stddev. "
+    "(default: %(default)s)")
+parser.add_argument(
+    "--augmentation_config",
+    default='{}',
+    type=str,
+    help="Augmentation configuration in json-format. "
+    "(default: %(default)s)")
+parser.add_argument(
+    "--output_file",
+    default='mean_std.npz',
+    type=str,
+    help="Filepath to write mean and std to (.npz)."
+    "(default: %(default)s)")
+args = parser.parse_args()
+
+
+def main():
+    augmentation_pipeline = AugmentationPipeline(args.augmentation_config)
+    audio_featurizer = AudioFeaturizer()
+
+    def augment_and_featurize(audio_segment):
+        augmentation_pipeline.transform_audio(audio_segment)
+        return audio_featurizer.featurize(audio_segment)
+
+    normalizer = FeatureNormalizer(
+        mean_std_filepath=None,
+        manifest_path=args.manifest_path,
+        featurize_func=augment_and_featurize,
+        num_samples=args.num_samples)
+    normalizer.write_to_file(args.output_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/data_utils/__init__.py b/data_utils/__init__.py
new file mode 100755
index 00000000..e69de29b
diff --git a/data_utils/audio.py b/data_utils/audio.py
new file mode 100755
index 00000000..46b24120
--- /dev/null
+++ b/data_utils/audio.py
@@ -0,0 +1,68 @@
+import numpy as np
+import io
+import soundfile
+
+
+class AudioSegment(object):
+    """Monaural audio segment abstraction.
+    """
+
+    def __init__(self, samples, sample_rate):
+        if not samples.dtype == np.float32:
+            raise ValueError("Sample data type of [%s] is not supported." % samples.dtype)
+        self._samples = samples
+        self._sample_rate = sample_rate
+        if self._samples.ndim >= 2:
+            self._samples = np.mean(self._samples, 1)
+
+    @classmethod
+    def from_file(cls, filepath):
+        samples, sample_rate = soundfile.read(filepath, dtype='float32')
+        return cls(samples, sample_rate)
+
+    @classmethod
+    def from_bytes(cls, bytes):
+        samples, sample_rate = soundfile.read(
+            io.BytesIO(bytes), dtype='float32')
+        return cls(samples, sample_rate)
+
+    def apply_gain(self, gain):
+        self._samples *= 10.**(gain / 20.)
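+        # e.g. gain = -6 (dB) multiplies the samples by 10 ** (-6 / 20.),
+        # roughly 0.5, i.e. about halves the amplitude; positive gains amplify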
+ + def resample(self, target_sample_rate): + raise NotImplementedError() + + def change_speed(self, rate): + raise NotImplementedError() + + @property + def samples(self): + return self._samples.copy() + + @property + def sample_rate(self): + return self._sample_rate + + @property + def duration(self): + return self._samples.shape[0] / float(self._sample_rate) + + +class SpeechSegment(AudioSegment): + def __init__(self, samples, sample_rate, transcript): + AudioSegment.__init__(self, samples, sample_rate) + self._transcript = transcript + + @classmethod + def from_file(cls, filepath, transcript): + audio = AudioSegment.from_file(filepath) + return cls(audio.samples, audio.sample_rate, transcript) + + @classmethod + def from_bytes(cls, bytes, transcript): + audio = AudioSegment.from_bytes(bytes) + return cls(audio.samples, audio.sample_rate, transcript) + + @property + def transcript(self): + return self._transcript diff --git a/data_utils/augmentor/__init__.py b/data_utils/augmentor/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py new file mode 100755 index 00000000..3a1426a1 --- /dev/null +++ b/data_utils/augmentor/augmentation.py @@ -0,0 +1,38 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import random +from data_utils.augmentor.volumn_perturb import VolumnPerturbAugmentor + + +class AugmentationPipeline(object): + def __init__(self, augmentation_config, random_seed=0): + self._rng = random.Random(random_seed) + self._augmentors, self._rates = self._parse_pipeline_from( + augmentation_config) + + def transform_audio(self, audio_segment): + for augmentor, rate in zip(self._augmentors, self._rates): + if self._rng.uniform(0., 1.) <= rate: + augmentor.transform_audio(audio_segment) + + def _parse_pipeline_from(self, config_json): + try: + configs = json.loads(config_json) + except Exception as e: + raise ValueError("Augmentation config json format error: " + "%s" % str(e)) + augmentors = [ + self._get_augmentor(config["type"], config["params"]) + for config in configs + ] + rates = [config["rate"] for config in configs] + return augmentors, rates + + def _get_augmentor(self, augmentor_type, params): + if augmentor_type == "volumn": + return VolumnPerturbAugmentor(self._rng, **params) + else: + raise ValueError("Unknown augmentor type [%s]." 
% augmentor_type)
diff --git a/data_utils/augmentor/base.py b/data_utils/augmentor/base.py
new file mode 100755
index 00000000..e801b9b1
--- /dev/null
+++ b/data_utils/augmentor/base.py
@@ -0,0 +1,17 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from abc import ABCMeta, abstractmethod
+
+
+class AugmentorBase(object):
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def transform_audio(self, audio_segment):
+        pass
diff --git a/data_utils/augmentor/volumn_perturb.py b/data_utils/augmentor/volumn_perturb.py
new file mode 100755
index 00000000..dd1ba53a
--- /dev/null
+++ b/data_utils/augmentor/volumn_perturb.py
@@ -0,0 +1,17 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+from data_utils.augmentor.base import AugmentorBase
+
+
+class VolumnPerturbAugmentor(AugmentorBase):
+    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
+        self._min_gain_dBFS = min_gain_dBFS
+        self._max_gain_dBFS = max_gain_dBFS
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
+        audio_segment.apply_gain(gain)
diff --git a/data_utils/data.py b/data_utils/data.py
new file mode 100644
index 00000000..63000793
--- /dev/null
+++ b/data_utils/data.py
@@ -0,0 +1,247 @@
+"""
+    Providing basic audio data preprocessing pipeline, and offering
+    both instance-level and batch-level data reader interfaces.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import numpy as np
+import paddle.v2 as paddle
+from data_utils import utils
+from data_utils.augmentor.augmentation import AugmentationPipeline
+from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
+from data_utils.audio import SpeechSegment
+from data_utils.normalizer import FeatureNormalizer
+
+
+class DataGenerator(object):
+    """
+    DataGenerator provides basic audio data preprocessing pipeline, and offers
+    both instance-level and batch-level data reader interfaces.
+    Normalized FFT features are used as audio features here.
+
+    :param vocab_filepath: Vocabulary file path for indexing tokenized
+                           transcriptions.
+    :type vocab_filepath: basestring
+    :param mean_std_filepath: Filepath of the pre-computed mean and stddev
+                              statistics (see compute_mean_std.py) used for
+                              feature normalization.
+    :type mean_std_filepath: basestring
+    :param augmentation_config: Augmentation configuration in json string.
+                                '{}' disables augmentation.
+    :type augmentation_config: str
+    :param max_duration: Audio clips with duration (in seconds) greater than
+                         this will be discarded. Default is float('inf').
+    :type max_duration: float
+    :param min_duration: Audio clips with duration (in seconds) smaller than
+                         this will be discarded. Default is 0.0.
+    :type min_duration: float
+    :param stride_ms: Striding size (in milliseconds) for generating frames.
+                      Default is 10.0.
+    :type stride_ms: float
+    :param window_ms: Window size (in milliseconds) for frames. Default is 20.0.
+    :type window_ms: float
+    :param max_frequency: Maximum frequency for FFT features. FFT features of
+                          frequency larger than this will be discarded.
+                          If set None, all features will be kept.
+                          Default is None. 
+ :type max_frequency: float + """ + + def __init__(self, + vocab_filepath, + mean_std_filepath, + augmentation_config='{}', + max_duration=float('inf'), + min_duration=0.0, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + random_seed=0): + self._max_duration = max_duration + self._min_duration = min_duration + self._normalizer = FeatureNormalizer(mean_std_filepath) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=augmentation_config, random_seed=random_seed) + self._speech_featurizer = SpeechFeaturizer( + vocab_filepath=vocab_filepath, + stride_ms=stride_ms, + window_ms=window_ms, + max_freq=max_freq, + random_seed=random_seed) + self._rng = random.Random(random_seed) + self._epoch = 0 + + def batch_reader_creator(self, + manifest_path, + batch_size, + padding_to=-1, + flatten=False, + sortagrad=False, + batch_shuffle=False): + """ + Batch data reader creator for audio data. Creat a callable function to + produce batches of data. + + Audio features will be padded with zeros to make each instance in the + batch to share the same audio feature shape. + + :param manifest_path: Filepath of manifest for audio clip files. + :type manifest_path: basestring + :param batch_size: Instance number in a batch. + :type batch_size: int + :param padding_to: If set -1, the maximun column numbers in the batch + will be used as the target size for padding. + Otherwise, `padding_to` will be the target size. + Default is -1. + :type padding_to: int + :param flatten: If set True, audio data will be flatten to be a 1-dim + ndarray. Otherwise, 2-dim ndarray. Default is False. + :type flatten: bool + :param sortagrad: Sort the audio clips by duration in the first epoc + if set True. + :type sortagrad: bool + :param batch_shuffle: Shuffle the audio clips if set True. It is + not a thorough instance-wise shuffle, but a + specific batch-wise shuffle. For more details, + please see `_batch_shuffle` function. + :type batch_shuffle: bool + :return: Batch reader function, producing batches of data when called. 
+ :rtype: callable + """ + + def batch_reader(): + # read manifest + manifest = utils.read_manifest( + manifest_path=manifest_path, + max_duration=self._max_duration, + min_duration=self._min_duration) + # sort (by duration) or batch-wise shuffle the manifest + if self._epoch == 0 and sortagrad: + manifest.sort(key=lambda x: x["duration"]) + elif batch_shuffle: + manifest = self._batch_shuffle(manifest, batch_size) + # prepare batches + instance_reader = self._instance_reader_creator(manifest) + batch = [] + for instance in instance_reader(): + batch.append(instance) + if len(batch) == batch_size: + yield self._padding_batch(batch, padding_to, flatten) + batch = [] + if len(batch) > 0: + yield self._padding_batch(batch, padding_to, flatten) + self._epoch += 1 + + return batch_reader + + @property + def feeding(self): + """Returns data_reader's feeding dict.""" + return {"audio_spectrogram": 0, "transcript_text": 1} + + @property + def vocab_size(self): + """Returns vocabulary size.""" + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + """Returns vocabulary list.""" + return self._speech_featurizer.vocab_list + + def _process_utterance(self, filename, transcript): + speech_segment = SpeechSegment.from_file(filename, transcript) + self._augmentation_pipeline.transform_audio(speech_segment) + specgram, text_ids = self._speech_featurizer.featurize(speech_segment) + specgram = self._normalizer.apply(specgram) + return specgram, text_ids + + def _instance_reader_creator(self, manifest): + """ + Instance reader creator for audio data. Creat a callable function to + produce instances of data. + + Instance: a tuple of a numpy ndarray of audio spectrogram and a list of + tokenized and indexed transcription text. + + :param manifest: Filepath of manifest for audio clip files. + :type manifest: basestring + :return: Data reader function. + :rtype: callable + """ + + def reader(): + for instance in manifest: + yield self._process_utterance(instance["audio_filepath"], + instance["text"]) + + return reader + + def _padding_batch(self, batch, padding_to=-1, flatten=False): + """ + Padding audio part of features (only in the time axis -- column axis) + with zeros, to make each instance in the batch share the same + audio feature shape. + + If `padding_to` is set -1, the maximun column numbers in the batch will + be used as the target size. Otherwise, `padding_to` will be the target + size. Default is -1. + + If `flatten` is set True, audio data will be flatten to be a 1-dim + ndarray. Default is False. + """ + new_batch = [] + # get target shape + max_length = max([audio.shape[1] for audio, text in batch]) + if padding_to != -1: + if padding_to < max_length: + raise ValueError("If padding_to is not -1, it should be greater" + " or equal to the original instance length.") + max_length = padding_to + # padding + for audio, text in batch: + padded_audio = np.zeros([audio.shape[0], max_length]) + padded_audio[:, :audio.shape[1]] = audio + if flatten: + padded_audio = padded_audio.flatten() + new_batch.append((padded_audio, text)) + return new_batch + + def _batch_shuffle(self, manifest, batch_size): + """ + The instances have different lengths and they cannot be + combined into a single matrix multiplication. It usually + sorts the training examples by length and combines only + similarly-sized instances into minibatches, pads with + silence when necessary so that all instances in a batch + have the same length. 
This batch shuffle fuction is used + to make similarly-sized instances into minibatches and + make a batch-wise shuffle. + + 1. Sort the audio clips by duration. + 2. Generate a random number `k`, k in [0, batch_size). + 3. Randomly remove `k` instances in order to make different mini-batches, + then make minibatches and each minibatch size is batch_size. + 4. Shuffle the minibatches. + + :param manifest: manifest file. + :type manifest: list + :param batch_size: Batch size. This size is also used for generate + a random number for batch shuffle. + :type batch_size: int + :return: batch shuffled mainifest. + :rtype: list + """ + manifest.sort(key=lambda x: x["duration"]) + shift_len = self._rng.randint(0, batch_size - 1) + batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size) + self._rng.shuffle(batch_manifest) + batch_manifest = list(sum(batch_manifest, ())) + res_len = len(manifest) - shift_len - len(batch_manifest) + batch_manifest.extend(manifest[-res_len:]) + batch_manifest.extend(manifest[0:shift_len]) + return batch_manifest diff --git a/data_utils/featurizer/__init__.py b/data_utils/featurizer/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py new file mode 100755 index 00000000..5d9c6883 --- /dev/null +++ b/data_utils/featurizer/audio_featurizer.py @@ -0,0 +1,86 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import random +from data_utils import utils +from data_utils.audio import AudioSegment + + +class AudioFeaturizer(object): + def __init__(self, + specgram_type='linear', + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + random_seed=0): + self._specgram_type = specgram_type + self._stride_ms = stride_ms + self._window_ms = window_ms + self._max_freq = max_freq + + def featurize(self, audio_segment): + return self._compute_specgram(audio_segment.samples, + audio_segment.sample_rate) + + def _compute_specgram(self, samples, sample_rate): + if self._specgram_type == 'linear': + return self._compute_linear_specgram( + samples, sample_rate, self._stride_ms, self._window_ms, + self._max_freq) + else: + raise ValueError("Unknown specgram_type %s. " + "Supported values: linear." % self._specgram_type) + + def _compute_linear_specgram(self, + samples, + sample_rate, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + eps=1e-14): + """Laod audio data and calculate the log of spectrogram by FFT. + Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech + """ + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + stride_size = int(0.001 * sample_rate * stride_ms) + window_size = int(0.001 * sample_rate * window_ms) + specgram, freqs = self._specgram_real( + samples, + window_size=window_size, + stride_size=stride_size, + sample_rate=sample_rate) + ind = np.where(freqs <= max_freq)[0][-1] + 1 + return np.log(specgram[:ind, :] + eps) + + def _specgram_real(self, samples, window_size, stride_size, sample_rate): + """Compute the spectrogram by FFT for a discrete real signal. 
+ Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech + """ + # extract strided windows + truncate_size = (len(samples) - window_size) % stride_size + samples = samples[:len(samples) - truncate_size] + nshape = (window_size, (len(samples) - window_size) // stride_size + 1) + nstrides = (samples.strides[0], samples.strides[0] * stride_size) + windows = np.lib.stride_tricks.as_strided( + samples, shape=nshape, strides=nstrides) + assert np.all( + windows[:, 1] == samples[stride_size:(stride_size + window_size)]) + # window weighting, squared Fast Fourier Transform (fft), scaling + weighting = np.hanning(window_size)[:, None] + fft = np.fft.rfft(windows * weighting, axis=0) + fft = np.absolute(fft)**2 + scale = np.sum(weighting**2) * sample_rate + fft[1:-1, :] *= (2.0 / scale) + fft[(0, -1), :] /= scale + # prepare fft frequency list + freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) + return fft, freqs diff --git a/data_utils/featurizer/speech_featurizer.py b/data_utils/featurizer/speech_featurizer.py new file mode 100755 index 00000000..06af7a02 --- /dev/null +++ b/data_utils/featurizer/speech_featurizer.py @@ -0,0 +1,32 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from data_utils.featurizer.audio_featurizer import AudioFeaturizer +from data_utils.featurizer.text_featurizer import TextFeaturizer + + +class SpeechFeaturizer(object): + def __init__(self, + vocab_filepath, + specgram_type='linear', + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + random_seed=0): + self._audio_featurizer = AudioFeaturizer( + specgram_type, stride_ms, window_ms, max_freq, random_seed) + self._text_featurizer = TextFeaturizer(vocab_filepath) + + def featurize(self, speech_segment): + audio_feature = self._audio_featurizer.featurize(speech_segment) + text_ids = self._text_featurizer.text2ids(speech_segment.transcript) + return audio_feature, text_ids + + @property + def vocab_size(self): + return self._text_featurizer.vocab_size + + @property + def vocab_list(self): + return self._text_featurizer.vocab_list diff --git a/data_utils/featurizer/text_featurizer.py b/data_utils/featurizer/text_featurizer.py new file mode 100755 index 00000000..7e4b69d7 --- /dev/null +++ b/data_utils/featurizer/text_featurizer.py @@ -0,0 +1,39 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + + +class TextFeaturizer(object): + def __init__(self, vocab_filepath): + self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file( + vocab_filepath) + + def text2ids(self, text): + tokens = self._char_tokenize(text) + return [self._vocab_dict[token] for token in tokens] + + def ids2text(self, ids): + return ''.join([self._vocab_list[id] for id in ids]) + + @property + def vocab_size(self): + return len(self._vocab_list) + + @property + def vocab_list(self): + return self._vocab_list + + def _char_tokenize(self, text): + return list(text.strip()) + + def _load_vocabulary_from_file(self, vocab_filepath): + """Load vocabulary from file.""" + vocab_lines = [] + with open(vocab_filepath, 'r') as file: + vocab_lines.extend(file.readlines()) + vocab_list = [line[:-1] for line in vocab_lines] + vocab_dict = dict( + [(token, id) for (id, token) in enumerate(vocab_list)]) + return vocab_dict, vocab_list diff --git a/data_utils/normalizer.py b/data_utils/normalizer.py new file mode 100755 index 00000000..364600af --- /dev/null +++ 
b/data_utils/normalizer.py
@@ -0,0 +1,49 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import random
+import data_utils.utils as utils
+from data_utils.audio import AudioSegment
+
+
+class FeatureNormalizer(object):
+    def __init__(self,
+                 mean_std_filepath,
+                 manifest_path=None,
+                 featurize_func=None,
+                 num_samples=500,
+                 random_seed=0):
+        if not mean_std_filepath:
+            if not (manifest_path and featurize_func):
+                raise ValueError("If mean_std_filepath is None, manifest_path "
+                                 "and featurize_func should not be None.")
+            self._rng = random.Random(random_seed)
+            self._compute_mean_std(manifest_path, featurize_func, num_samples)
+        else:
+            self._read_mean_std_from_file(mean_std_filepath)
+
+    def apply(self, features, eps=1e-14):
+        """Normalize features to be of zero mean and unit stddev."""
+        return (features - self._mean) / (self._std + eps)
+
+    def write_to_file(self, filepath):
+        np.savez(filepath, mean=self._mean, std=self._std)
+
+    def _read_mean_std_from_file(self, filepath):
+        npzfile = np.load(filepath)
+        self._mean = npzfile["mean"]
+        self._std = npzfile["std"]
+
+    def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
+        manifest = utils.read_manifest(manifest_path)
+        sampled_manifest = self._rng.sample(manifest, num_samples)
+        features = []
+        for instance in sampled_manifest:
+            features.append(
+                featurize_func(
+                    AudioSegment.from_file(instance["audio_filepath"])))
+        features = np.hstack(features)
+        self._mean = np.mean(features, axis=1).reshape([-1, 1])
+        self._std = np.std(features, axis=1).reshape([-1, 1])
diff --git a/data_utils/utils.py b/data_utils/utils.py
new file mode 100755
index 00000000..2a916b54
--- /dev/null
+++ b/data_utils/utils.py
@@ -0,0 +1,19 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+
+def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
+    """Load and parse manifest file."""
+    manifest = []
+    for json_line in open(manifest_path):
+        try:
+            json_data = json.loads(json_line)
+        except Exception as e:
+            raise IOError("Error reading manifest: %s" % str(e))
+        if (json_data["duration"] <= max_duration and
+                json_data["duration"] >= min_duration):
+            manifest.append(json_data)
+    return manifest
diff --git a/data/librispeech.py b/datasets/librispeech/librispeech.py
similarity index 99%
rename from data/librispeech.py
rename to datasets/librispeech/librispeech.py
index 653caa92..1ba2a442 100644
--- a/data/librispeech.py
+++ b/datasets/librispeech/librispeech.py
@@ -44,7 +44,7 @@ parser.add_argument(
     help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
     "--manifest_prefix",
-    default="manifest.libri",
+    default="manifest",
     type=str,
     help="Filepath prefix for output manifests. (default: %(default)s)")
 parser.add_argument(
diff --git a/datasets/run_all.sh b/datasets/run_all.sh
new file mode 100755
index 00000000..ef2b721f
--- /dev/null
+++ b/datasets/run_all.sh
@@ -0,0 +1,13 @@
+cd librispeech
+python librispeech.py
+if [ $? -ne 0 ]; then
+    echo "Prepare LibriSpeech failed. Terminated."
+    exit 1
+fi
+cd -
+
+cat librispeech/manifest.train* | shuf > manifest.train
+cat librispeech/manifest.dev-clean > manifest.dev
+cat librispeech/manifest.test-clean > manifest.test
+
+echo "All done."
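For reference, each line of the merged manifests above is one self-contained JSON object. The sketch below shows an illustrative record and the duration filter that `read_manifest` applies; the filepath and values are made up, only the keys (`audio_filepath`, `duration`, `text`) come from the code above.

```
import json

# Illustrative record only -- the filepath and values are hypothetical;
# the keys match what read_manifest and the data pipeline expect.
line = json.dumps({
    "audio_filepath": "/path/to/LibriSpeech/sample-0001.flac",
    "duration": 3.52,
    "text": "example transcript in lowercase"
})

# read_manifest parses one JSON object per line and keeps only records
# whose duration lies within [min_duration, max_duration].
record = json.loads(line)
if 0.0 <= record["duration"] <= 10.0:
    print(record["audio_filepath"])
```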
diff --git a/data/eng_vocab.txt b/datasets/vocab/eng_vocab.txt similarity index 100% rename from data/eng_vocab.txt rename to datasets/vocab/eng_vocab.txt diff --git a/infer.py b/infer.py index 598c348b..eb31254c 100644 --- a/infer.py +++ b/infer.py @@ -2,11 +2,15 @@ Inference for a simplifed version of Baidu DeepSpeech2 model. """ -import paddle.v2 as paddle -import distutils.util +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + import argparse import gzip -from audio_data_utils import DataGenerator +import distutils.util +import paddle.v2 as paddle +from data_utils.data import DataGenerator from model import deep_speech2 from decoder import ctc_decode @@ -38,13 +42,13 @@ parser.add_argument( type=distutils.util.strtobool, help="Use gpu or not. (default: %(default)s)") parser.add_argument( - "--normalizer_manifest_path", - default='data/manifest.libri.train-clean-100', + "--mean_std_filepath", + default='mean_std.npz', type=str, help="Manifest path for normalizer. (default: %(default)s)") parser.add_argument( "--decode_manifest_path", - default='data/manifest.libri.test-clean', + default='datasets/manifest.test', type=str, help="Manifest path for decoding. (default: %(default)s)") parser.add_argument( @@ -54,7 +58,7 @@ parser.add_argument( help="Model filepath. (default: %(default)s)") parser.add_argument( "--vocab_filepath", - default='data/eng_vocab.txt', + default='datasets/vocab/eng_vocab.txt', type=str, help="Vocabulary filepath. (default: %(default)s)") args = parser.parse_args() @@ -67,28 +71,22 @@ def infer(): # initialize data generator data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, - normalizer_manifest_path=args.normalizer_manifest_path, - normalizer_num_samples=200, - max_duration=20.0, - min_duration=0.0, - stride_ms=10, - window_ms=20) + mean_std_filepath=args.mean_std_filepath, + augmentation_config='{}') # create network config - dict_size = data_generator.vocabulary_size() - vocab_list = data_generator.vocabulary_list() + # paddle.data_type.dense_array is used for variable batch input. + # The size 161 * 161 is only an placeholder value and the real shape + # of input batch data will be induced during training. 
audio_data = paddle.layer.data( - name="audio_spectrogram", - height=161, - width=2000, - type=paddle.data_type.dense_vector(322000)) + name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) text_data = paddle.layer.data( name="transcript_text", - type=paddle.data_type.integer_value_sequence(dict_size)) + type=paddle.data_type.integer_value_sequence(data_generator.vocab_size)) output_probs = deep_speech2( audio_data=audio_data, text_data=text_data, - dict_size=dict_size, + dict_size=data_generator.vocab_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_size=args.rnn_layer_size, @@ -99,31 +97,30 @@ def infer(): gzip.open(args.model_filepath)) # prepare infer data - feeding = data_generator.data_name_feeding() - test_batch_reader = data_generator.batch_reader_creator( + batch_reader = data_generator.batch_reader_creator( manifest_path=args.decode_manifest_path, batch_size=args.num_samples, - padding_to=2000, - flatten=True, - sort_by_duration=False, - shuffle=False) - infer_data = test_batch_reader().next() + sortagrad=False, + batch_shuffle=False) + infer_data = batch_reader().next() # run inference infer_results = paddle.infer( output_layer=output_probs, parameters=parameters, input=infer_data) - num_steps = len(infer_results) / len(infer_data) + num_steps = len(infer_results) // len(infer_data) probs_split = [ infer_results[i * num_steps:(i + 1) * num_steps] - for i in xrange(0, len(infer_data)) + for i in xrange(len(infer_data)) ] # decode and print for i, probs in enumerate(probs_split): output_transcription = ctc_decode( - probs_seq=probs, vocabulary=vocab_list, method="best_path") + probs_seq=probs, + vocabulary=data_generator.vocab_list, + method="best_path") target_transcription = ''.join( - [vocab_list[index] for index in infer_data[i][1]]) + [data_generator.vocab_list[index] for index in infer_data[i][1]]) print("Target Transcription: %s \nOutput Transcription: %s \n" % (target_transcription, output_transcription)) diff --git a/train.py b/train.py index 957c2426..c6aa9752 100644 --- a/train.py +++ b/train.py @@ -2,21 +2,21 @@ Trainer for a simplifed version of Baidu DeepSpeech2 model. """ -import paddle.v2 as paddle -import distutils.util +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import os import argparse import gzip import time -import sys +import distutils.util +import paddle.v2 as paddle from model import deep_speech2 -from audio_data_utils import DataGenerator -import numpy as np -import os +from data_utils.data import DataGenerator -#TODO: add WER metric - -parser = argparse.ArgumentParser( - description='Simplified version of DeepSpeech2 trainer.') +parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--batch_size", default=32, type=int, help="Minibatch size.") parser.add_argument( @@ -51,7 +51,7 @@ parser.add_argument( help="Use gpu or not. (default: %(default)s)") parser.add_argument( "--use_sortagrad", - default=False, + default=True, type=distutils.util.strtobool, help="Use sortagrad or not. (default: %(default)s)") parser.add_argument( @@ -60,23 +60,23 @@ parser.add_argument( type=int, help="Trainer number. (default: %(default)s)") parser.add_argument( - "--normalizer_manifest_path", - default='data/manifest.libri.train-clean-100', + "--mean_std_filepath", + default='mean_std.npz', type=str, help="Manifest path for normalizer. 
(default: %(default)s)") parser.add_argument( "--train_manifest_path", - default='data/manifest.libri.train-clean-100', + default='datasets/manifest.train', type=str, help="Manifest path for training. (default: %(default)s)") parser.add_argument( "--dev_manifest_path", - default='data/manifest.libri.dev-clean', + default='datasets/manifest.dev', type=str, help="Manifest path for validation. (default: %(default)s)") parser.add_argument( "--vocab_filepath", - default='data/eng_vocab.txt', + default='datasets/vocab/eng_vocab.txt', type=str, help="Vocabulary filepath. (default: %(default)s)") parser.add_argument( @@ -86,6 +86,12 @@ parser.add_argument( help="If set None, the training will start from scratch. " "Otherwise, the training will resume from " "the existing model of this path. (default: %(default)s)") +parser.add_argument( + "--augmentation_config", + default='{}', + type=str, + help="Augmentation configuration in json-format. " + "(default: %(default)s)") args = parser.parse_args() @@ -98,29 +104,26 @@ def train(): def data_generator(): return DataGenerator( vocab_filepath=args.vocab_filepath, - normalizer_manifest_path=args.normalizer_manifest_path, - normalizer_num_samples=200, - max_duration=20.0, - min_duration=0.0, - stride_ms=10, - window_ms=20) + mean_std_filepath=args.mean_std_filepath, + augmentation_config=args.augmentation_config) train_generator = data_generator() test_generator = data_generator() + # create network config - dict_size = train_generator.vocabulary_size() # paddle.data_type.dense_array is used for variable batch input. - # the size 161 * 161 is only an placeholder value and the real shape - # of input batch data will be set at each batch. + # The size 161 * 161 is only an placeholder value and the real shape + # of input batch data will be induced during training. 
audio_data = paddle.layer.data( name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) text_data = paddle.layer.data( name="transcript_text", - type=paddle.data_type.integer_value_sequence(dict_size)) + type=paddle.data_type.integer_value_sequence( + train_generator.vocab_size)) cost = deep_speech2( audio_data=audio_data, text_data=text_data, - dict_size=dict_size, + dict_size=train_generator.vocab_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_size=args.rnn_layer_size, @@ -143,13 +146,13 @@ def train(): train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest_path, batch_size=args.batch_size, - sortagrad=True if args.init_model_path is None else False, + sortagrad=args.use_sortagrad if args.init_model_path is None else False, batch_shuffle=True) test_batch_reader = test_generator.batch_reader_creator( manifest_path=args.dev_manifest_path, batch_size=args.batch_size, + sortagrad=False, batch_shuffle=False) - feeding = train_generator.data_name_feeding() # create event handler def event_handler(event): @@ -158,8 +161,8 @@ def train(): cost_sum += event.cost cost_counter += 1 if event.batch_id % 50 == 0: - print "\nPass: %d, Batch: %d, TrainCost: %f" % ( - event.pass_id, event.batch_id, cost_sum / cost_counter) + print("\nPass: %d, Batch: %d, TrainCost: %f" % + (event.pass_id, event.batch_id, cost_sum / cost_counter)) cost_sum, cost_counter = 0.0, 0 with gzip.open("params.tar.gz", 'w') as f: parameters.to_tar(f) @@ -170,16 +173,17 @@ def train(): start_time = time.time() cost_sum, cost_counter = 0.0, 0 if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_batch_reader, feeding=feeding) - print "\n------- Time: %d sec, Pass: %d, ValidationCost: %s" % ( - time.time() - start_time, event.pass_id, result.cost) + result = trainer.test( + reader=test_batch_reader, feeding=test_generator.feeding) + print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" % + (time.time() - start_time, event.pass_id, result.cost)) # run train trainer.train( reader=train_batch_reader, event_handler=event_handler, num_passes=args.num_passes, - feeding=feeding) + feeding=train_generator.feeding) def main(): From bc3224eb140082d6ba286accebe85c1d019e6e8f Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 12 Jun 2017 12:51:01 +0800 Subject: [PATCH 20/55] Follow comments. --- error_rate.py | 94 ++++++++++++++++++++++++--------------------------- 1 file changed, 45 insertions(+), 49 deletions(-) diff --git a/error_rate.py b/error_rate.py index f216177e..2bb63711 100644 --- a/error_rate.py +++ b/error_rate.py @@ -1,4 +1,9 @@ -# -- * -- coding: utf-8 -- * -- +# -*- coding: utf-8 -*- +""" + This module provides functions to calculate error rate in different level. + e.g. wer for word-level, cer for char-level. +""" + import numpy as np @@ -14,9 +19,9 @@ def levenshtein_distance(ref, hyp): if hyp_len == 0: return ref_len - distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int64) + distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32) - # initialization distance matrix + # initialize distance matrix for j in xrange(hyp_len + 1): distance[0][j] = j for i in xrange(ref_len + 1): @@ -36,11 +41,10 @@ def levenshtein_distance(ref, hyp): return distance[ref_len][hyp_len] -def wer(reference, hypophysis, delimiter=' ', filter_none=True): +def wer(reference, hypothesis, ignore_case=False, delimiter=' '): """ - Calculate word error rate (WER). 
WER is a popular evaluation metric used
-    in speech recognition. It compares a reference with an hypophysis and
-    is defined like this:
+    Calculate word error rate (WER). WER compares reference text and
+    hypothesis text at the word level. WER is defined as:
 
     .. math::
         WER = (Sw + Dw + Iw) / Nw
@@ -54,41 +58,39 @@
         Iw is the number of words inserted,
         Nw is the number of words in the reference
 
-    We can use levenshtein distance to calculate WER. Please draw an attention
-    that this function will truncate the beginning and ending delimiter for
-    reference and hypophysis sentences before calculating WER.
+    We can use levenshtein distance to calculate WER. Note that empty items
+    will be removed when splitting sentences by the delimiter.
 
     :param reference: The reference sentence.
-    :type reference: str
-    :param hypophysis: The hypophysis sentence.
-    :type reference: str
+    :type reference: basestring
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: basestring
+    :param ignore_case: Whether case-sensitive or not.
+    :type ignore_case: bool
     :param delimiter: Delimiter of input sentences.
     :type delimiter: char
-    :param filter_none: Whether to remove None value when splitting sentence.
-    :type filter_none: bool
-    :return: WER
+    :return: Word error rate.
     :rtype: float
     """
+    if ignore_case == True:
+        reference = reference.lower()
+        hypothesis = hypothesis.lower()
 
-    if len(reference.strip(delimiter)) == 0:
-        raise ValueError("Reference's word number should be greater than 0.")
+    ref_words = filter(None, reference.split(delimiter))
+    hyp_words = filter(None, hypothesis.split(delimiter))
 
-    if filter_none == True:
-        ref_words = filter(None, reference.strip(delimiter).split(delimiter))
-        hyp_words = filter(None, hypophysis.strip(delimiter).split(delimiter))
-    else:
-        ref_words = reference.strip(delimiter).split(delimiter)
-        hyp_words = reference.strip(delimiter).split(delimiter)
+    if len(ref_words) == 0:
+        raise ValueError("Reference's word number should be greater than 0.")
 
     edit_distance = levenshtein_distance(ref_words, hyp_words)
     wer = float(edit_distance) / len(ref_words)
     return wer
 
 
-def cer(reference, hypophysis, squeeze=True, ignore_case=False, strip_char=''):
+def cer(reference, hypothesis, ignore_case=False):
     """
-    Calculate charactor error rate (CER). CER will compare reference text and
-    hypophysis text in char-level. CER is defined as:
+    Calculate character error rate (CER). CER compares reference text and
+    hypothesis text at the character level. CER is defined as:
 
     .. math::
         CER = (Sc + Dc + Ic) / Nc
@@ -97,41 +99,35 @@
 
     .. code-block:: text
 
-        Sc is the number of character substituted,
-        Dc is the number of deleted,
-        Ic is the number of inserted
+        Sc is the number of characters substituted,
+        Dc is the number of characters deleted,
+        Ic is the number of characters inserted
        Nc is the number of characters in the reference
 
     We can use levenshtein distance to calculate CER. Chinese input should be
-    encoded to unicode.
+    encoded to unicode. Note that leading and trailing whitespace characters
+    will be truncated and multiple consecutive whitespace characters in a
+    sentence will be replaced by a single space character.
 
     :param reference: The reference sentence.
-    :type reference: str
-    :param hypophysis: The hypophysis sentence.
- :type reference: str - :param squeeze: If set true, consecutive space character - will be squeezed to one - :type squeeze: bool + :type reference: basestring + :param hypothesis: The hypothesis sentence. + :type hypothesis: basestring :param ignore_case: Whether case-sensitive or not. :type ignore_case: bool - :param strip_char: If not set to '', strip_char in beginning and ending of - sentence will be truncated. - :type strip_char: char - :return: CER + :return: Character error rate. :rtype: float """ if ignore_case == True: reference = reference.lower() - hypophysis = hypophysis.lower() - if strip_char != '': - reference = reference.strip(strip_char) - hypophysis = hypophysis.strip(strip_char) - if squeeze == True: - reference = ' '.join(filter(None, reference.split(' '))) - hypophysis = ' '.join(filter(None, hypophysis.split(' '))) + hypothesis = hypothesis.lower() + + reference = ' '.join(filter(None, reference.split(' '))) + hypothesis = ' '.join(filter(None, hypothesis.split(' '))) if len(reference) == 0: raise ValueError("Length of reference should be greater than 0.") - edit_distance = levenshtein_distance(reference, hypophysis) + + edit_distance = levenshtein_distance(reference, hypothesis) cer = float(edit_distance) / len(reference) return cer From b07ee84a1d613511193a486363937750880ea6fa Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 13 Jun 2017 23:16:07 +0800 Subject: [PATCH 21/55] Add function, class and module docs for data parts in DS2. --- compute_mean_std.py | 3 +- data_utils/audio.py | 232 ++++++++++++++++++--- data_utils/augmentor/augmentation.py | 60 +++++- data_utils/augmentor/base.py | 16 ++ data_utils/augmentor/volume_perturb.py | 40 ++++ data_utils/augmentor/volumn_perturb.py | 17 -- data_utils/data.py | 166 +++++++-------- data_utils/featurizer/audio_featurizer.py | 38 +++- data_utils/featurizer/speech_featurizer.py | 55 ++++- data_utils/featurizer/text_featurizer.py | 36 +++- data_utils/normalizer.py | 40 +++- data_utils/speech.py | 75 +++++++ data_utils/utils.py | 17 +- datasets/librispeech/librispeech.py | 16 +- decoder.py | 9 +- infer.py | 5 +- model.py | 9 +- train.py | 7 +- 18 files changed, 662 insertions(+), 179 deletions(-) create mode 100755 data_utils/augmentor/volume_perturb.py delete mode 100755 data_utils/augmentor/volumn_perturb.py create mode 100755 data_utils/speech.py diff --git a/compute_mean_std.py b/compute_mean_std.py index b3015df7..9c301c93 100755 --- a/compute_mean_std.py +++ b/compute_mean_std.py @@ -1,3 +1,4 @@ +"""Compute mean and std for feature normalizer, and save to file.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -17,7 +18,7 @@ parser.add_argument( "(default: %(default)s)") parser.add_argument( "--num_samples", - default=500, + default=2000, type=int, help="Number of samples for computing mean and stddev. " "(default: %(default)s)") diff --git a/data_utils/audio.py b/data_utils/audio.py index 46b24120..916c8ac1 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -1,3 +1,8 @@ +"""Contains the audio segment class.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + import numpy as np import io import soundfile @@ -5,64 +10,243 @@ import soundfile class AudioSegment(object): """Monaural audio segment abstraction. + + :param samples: Audio samples [num_samples x num_channels]. + :type samples: ndarray.float32 + :param sample_rate: Audio sample rate. 
+ :type sample_rate: int + :raises TypeError: If the sample data type is not float or int. """ def __init__(self, samples, sample_rate): - if not samples.dtype == np.float32: - raise ValueError("Sample data type of [%s] is not supported.") - self._samples = samples + """Create audio segment from samples. + + Samples are convert float32 internally, with int scaled to [-1, 1]. + """ + self._samples = self._convert_samples_to_float32(samples) self._sample_rate = sample_rate if self._samples.ndim >= 2: self._samples = np.mean(self._samples, 1) + def __eq__(self, other): + """Return whether two objects are equal.""" + if type(other) is not type(self): + return False + if self._sample_rate != other._sample_rate: + return False + if self._samples.shape != other._samples.shape: + return False + if np.any(self.samples != other._samples): + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + def __str__(self): + """Return human-readable representation of segment.""" + return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " + "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, + self.duration, self.rms_db)) + @classmethod - def from_file(cls, filepath): - samples, sample_rate = soundfile.read(filepath, dtype='float32') + def from_file(cls, file): + """Create audio segment from audio file. + + :param filepath: Filepath or file object to audio file. + :type filepath: basestring|file + :return: Audio segment instance. + :rtype: AudioSegment + """ + samples, sample_rate = soundfile.read(file, dtype='float32') return cls(samples, sample_rate) @classmethod def from_bytes(cls, bytes): + """Create audio segment from a byte string containing audio samples. + + :param bytes: Byte string containing audio samples. + :type bytes: str + :return: Audio segment instance. + :rtype: AudioSegment + """ samples, sample_rate = soundfile.read( io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) + def to_wav_file(self, filepath, dtype='float32'): + """Save audio segment to disk as wav file. + + :param filepath: WAV filepath or file object to save the + audio segment. + :type filepath: basestring|file + :param dtype: Subtype for audio file. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :raises TypeError: If dtype is not supported. + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + subtype_map = { + 'int16': 'PCM_16', + 'int32': 'PCM_32', + 'float32': 'FLOAT', + 'float64': 'DOUBLE' + } + soundfile.write( + filepath, + samples, + self._sample_rate, + format='WAV', + subtype=subtype_map[dtype]) + + def to_bytes(self, dtype='float32'): + """Create a byte string containing the audio content. + + :param dtype: Data type for export samples. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :return: Byte string containing audio content. + :rtype: str + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + return samples.tostring() + def apply_gain(self, gain): - self.samples *= 10.**(gain / 20.) + """Apply gain in decibels to samples. + + Note that this is an in-place transformation. + + :param gain: Gain in decibels to apply to samples. + :type gain: float + """ + self._samples *= 10.**(gain / 20.) + + def change_speed(self, speed_rate): + """Change the audio speed by linear interpolation. + + Note that this is an in-place transformation. 
+
+        :param speed_rate: Rate of speed change:
+                           speed_rate > 1.0, speed up the audio;
+                           speed_rate = 1.0, unchanged;
+                           speed_rate < 1.0, slow down the audio;
+                           speed_rate <= 0.0, not allowed, raise ValueError.
+        :type speed_rate: float
+        :raises ValueError: If speed_rate <= 0.0.
+        """
+        if speed_rate <= 0:
+            raise ValueError("speed_rate should be greater than zero.")
+        old_length = self._samples.shape[0]
+        new_length = int(old_length / speed_rate)
+        old_indices = np.arange(old_length)
+        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
+        self._samples = np.interp(new_indices, old_indices, self._samples)
+
+    def normalize(self, target_sample_rate):
+        raise NotImplementedError()
 
     def resample(self, target_sample_rate):
         raise NotImplementedError()
 
-    def change_speed(self, rate):
+    def pad_silence(self, duration, sides='both'):
+        raise NotImplementedError()
+
+    def subsegment(self, start_sec=None, end_sec=None):
+        raise NotImplementedError()
+
+    def convolve(self, filter, allow_resample=False):
+        raise NotImplementedError()
+
+    def convolve_and_normalize(self, filter, allow_resample=False):
         raise NotImplementedError()
 
     @property
     def samples(self):
+        """Return audio samples.
+
+        :return: Audio samples.
+        :rtype: ndarray
+        """
         return self._samples.copy()
 
     @property
     def sample_rate(self):
+        """Return audio sample rate.
+
+        :return: Audio sample rate.
+        :rtype: int
+        """
         return self._sample_rate
 
     @property
-    def duration(self):
-        return self._samples.shape[0] / float(self._sample_rate)
-
+    def num_samples(self):
+        """Return number of samples.
 
-class SpeechSegment(AudioSegment):
-    def __init__(self, samples, sample_rate, transcript):
-        AudioSegment.__init__(self, samples, sample_rate)
-        self._transcript = transcript
+        :return: Number of samples.
+        :rtype: int
+        """
+        return self._samples.shape[0]
 
-    @classmethod
-    def from_file(cls, filepath, transcript):
-        audio = AudioSegment.from_file(filepath)
-        return cls(audio.samples, audio.sample_rate, transcript)
+    @property
+    def duration(self):
+        """Return audio duration.
 
-    @classmethod
-    def from_bytes(cls, bytes, transcript):
-        audio = AudioSegment.from_bytes(bytes)
-        return cls(audio.samples, audio.sample_rate, transcript)
+        :return: Audio duration in seconds.
+        :rtype: float
+        """
+        return self._samples.shape[0] / float(self._sample_rate)
 
     @property
-    def transcript(self):
-        return self._transcript
+    def rms_db(self):
+        """Return root mean square energy of the audio in decibels.
+
+        :return: Root mean square energy in decibels.
+        :rtype: float
+        """
+        # square root => multiply by 10 instead of 20 for dBs
+        mean_square = np.mean(self._samples**2)
+        return 10 * np.log10(mean_square)
+
+    def _convert_samples_to_float32(self, samples):
+        """Convert sample type to float32.
+
+        Audio sample type is usually integer or floating-point.
+        Integers will be scaled to [-1, 1] in float32.
+        """
+        float32_samples = samples.astype('float32')
+        if samples.dtype in np.sctypes['int']:
+            bits = np.iinfo(samples.dtype).bits
+            float32_samples *= (1. / 2**(bits - 1))
+        elif samples.dtype in np.sctypes['float']:
+            pass
+        else:
+            raise TypeError("Unsupported sample type: %s." % samples.dtype)
+        return float32_samples
+
+    def _convert_samples_from_float32(self, samples, dtype):
+        """Convert sample type from float32 to dtype.
+
+        Audio sample type is usually integer or floating-point. For integer
+        type, float32 will be rescaled from [-1, 1] to the maximum range
+        supported by the integer type.
+
+        This is for writing an audio file.
+        """
+        dtype = np.dtype(dtype)
+        output_samples = samples.copy()
+        if dtype in np.sctypes['int']:
+            bits = np.iinfo(dtype).bits
+            output_samples *= (2**(bits - 1) / 1.)
+            min_val = np.iinfo(dtype).min
+            max_val = np.iinfo(dtype).max
+            output_samples[output_samples > max_val] = max_val
+            output_samples[output_samples < min_val] = min_val
+        elif samples.dtype in np.sctypes['float']:
+            min_val = np.finfo(dtype).min
+            max_val = np.finfo(dtype).max
+            output_samples[output_samples > max_val] = max_val
+            output_samples[output_samples < min_val] = min_val
+        else:
+            raise TypeError("Unsupported sample type: %s." % samples.dtype)
+        return output_samples.astype(dtype)
diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py
index 3a1426a1..abe1a0ec 100755
--- a/data_utils/augmentor/augmentation.py
+++ b/data_utils/augmentor/augmentation.py
@@ -1,38 +1,80 @@
+"""Contains the data augmentation pipeline."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import json
 import random
-from data_utils.augmentor.volumn_perturb import VolumnPerturbAugmentor
+from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
 
 
 class AugmentationPipeline(object):
+    """Build a pre-processing pipeline with various augmentation models. Such a
+    data augmentation pipeline is often leveraged to augment the training
+    samples to make the model invariant to certain types of perturbations in the
+    real world, improving the model's generalization ability.
+
+    The pipeline is built according to the augmentation configuration in json
+    string, e.g.
+
+    .. code-block::
+
+        '[{"type": "volume",
+           "params": {"min_gain_dBFS": -15,
+                      "max_gain_dBFS": 15},
+           "prob": 0.5},
+          {"type": "speed",
+           "params": {"min_speed_rate": 0.8,
+                      "max_speed_rate": 1.2},
+           "prob": 0.5}
+         ]'
+
+    This augmentation configuration inserts two augmentation models
+    into the pipeline, one being VolumePerturbAugmentor and the other
+    SpeedPerturbAugmentor. "prob" indicates the probability of the current
+    augmentor to take effect.
+
+    :param augmentation_config: Augmentation configuration in json string.
+    :type augmentation_config: str
+    :param random_seed: Random seed.
+    :type random_seed: int
+    :raises ValueError: If the augmentation json config is in an incorrect format.
+    """
+
    def __init__(self, augmentation_config, random_seed=0):
         self._rng = random.Random(random_seed)
         self._augmentors, self._rates = self._parse_pipeline_from(
             augmentation_config)
 
     def transform_audio(self, audio_segment):
+        """Run the pre-processing pipeline for data augmentation.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to process.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
         for augmentor, rate in zip(self._augmentors, self._rates):
             if self._rng.uniform(0., 1.)
<= rate:
                 augmentor.transform_audio(audio_segment)
 
     def _parse_pipeline_from(self, config_json):
+        """Parse the config json to build an augmentation pipeline."""
         try:
             configs = json.loads(config_json)
+            augmentors = [
+                self._get_augmentor(config["type"], config["params"])
+                for config in configs
+            ]
+            rates = [config["prob"] for config in configs]
         except Exception as e:
-            raise ValueError("Augmentation config json format error: "
+            raise ValueError("Failed to parse the augmentation config json: "
                              "%s" % str(e))
-        augmentors = [
-            self._get_augmentor(config["type"], config["params"])
-            for config in configs
-        ]
-        rates = [config["rate"] for config in configs]
         return augmentors, rates
 
     def _get_augmentor(self, augmentor_type, params):
-        if augmentor_type == "volumn":
-            return VolumnPerturbAugmentor(self._rng, **params)
+        """Return an augmentation model by the type name, and pass in params."""
+        if augmentor_type == "volume":
+            return VolumePerturbAugmentor(self._rng, **params)
         else:
             raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
diff --git a/data_utils/augmentor/base.py b/data_utils/augmentor/base.py
index e801b9b1..a323165a 100755
--- a/data_utils/augmentor/base.py
+++ b/data_utils/augmentor/base.py
@@ -1,3 +1,4 @@
+"""Contains the abstract base class for augmentation models."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -6,6 +7,11 @@ from abc import ABCMeta, abstractmethod
 
 
 class AugmentorBase(object):
+    """Abstract base class for augmentation model (augmentor) class.
+    All augmentor classes should inherit from this class, and implement the
+    following abstract methods.
+    """
+
     __metaclass__ = ABCMeta
 
     @abstractmethod
     def __init__(self):
@@ -14,4 +20,14 @@
 
     @abstractmethod
     def transform_audio(self, audio_segment):
+        """Adds various effects to the input audio segment. Such effects
+        will augment the training data to make the model invariant to certain
+        types of perturbations in the real world, improving the model's
+        generalization ability.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
         pass
diff --git a/data_utils/augmentor/volume_perturb.py b/data_utils/augmentor/volume_perturb.py
new file mode 100755
index 00000000..a5a9f6ca
--- /dev/null
+++ b/data_utils/augmentor/volume_perturb.py
@@ -0,0 +1,40 @@
+"""Contains the volume perturb augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+
+
+class VolumePerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding random volume perturbation.
+
+    This is used for multi-loudness training of PCEN. See
+
+    https://arxiv.org/pdf/1607.05666v1.pdf
+
+    for more details.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_gain_dBFS: Minimal gain in dBFS.
+    :type min_gain_dBFS: float
+    :param max_gain_dBFS: Maximal gain in dBFS.
+    :type max_gain_dBFS: float
+    """
+
+    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
+        self._min_gain_dBFS = min_gain_dBFS
+        self._max_gain_dBFS = max_gain_dBFS
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        """Change audio loudness.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
+        audio_segment.apply_gain(gain)
diff --git a/data_utils/augmentor/volumn_perturb.py b/data_utils/augmentor/volumn_perturb.py
deleted file mode 100755
index dd1ba53a..00000000
--- a/data_utils/augmentor/volumn_perturb.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-from data_utils.augmentor.base import AugmentorBase
-
-
-class VolumnPerturbAugmentor(AugmentorBase):
-    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
-        self._min_gain_dBFS = min_gain_dBFS
-        self._max_gain_dBFS = max_gain_dBFS
-        self._rng = rng
-
-    def transform_audio(self, audio_segment):
-        gain = self._rng.uniform(min_gain_dBFS, max_gain_dBFS)
-        audio_segment.apply_gain(gain)
diff --git a/data_utils/data.py b/data_utils/data.py
index 63000793..48e03fe8 100644
--- a/data_utils/data.py
+++ b/data_utils/data.py
@@ -1,8 +1,6 @@
+"""Contains data generator for organizing various audio data preprocessing
+pipelines and offering data reader interfaces required by PaddlePaddle.
 """
- Providing basic audio data preprocessing pipeline, and offering
- both instance-level and batch-level data reader interfaces.
-"""
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -13,42 +11,41 @@ import paddle.v2 as paddle
 from data_utils import utils
 from data_utils.augmentor.augmentation import AugmentationPipeline
 from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
-from data_utils.audio import SpeechSegment
+from data_utils.speech import SpeechSegment
 from data_utils.normalizer import FeatureNormalizer
 
 
 class DataGenerator(object):
     """
     DataGenerator provides basic audio data preprocessing pipeline, and offers
-    both instance-level and batch-level data reader interfaces.
-    Normalized FFT are used as audio features here.
+    data reader interfaces required by PaddlePaddle.
 
-    :param vocab_filepath: Vocabulary file path for indexing tokenized
-                           transcriptions.
+    :param vocab_filepath: Vocabulary filepath for indexing tokenized
+                           transcripts.
     :type vocab_filepath: basestring
-    :param normalizer_manifest_path: Manifest filepath for collecting feature
-                                     normalization statistics, e.g. mean, std.
-    :type normalizer_manifest_path: basestring
-    :param normalizer_num_samples: Number of instances sampled for collecting
-                                   feature normalization statistics.
-                                   Default is 100.
-    :type normalizer_num_samples: int
-    :param max_duration: Audio clips with duration (in seconds) greater than
-                         this will be discarded. Default is 20.0.
+    :param mean_std_filepath: File containing the pre-computed mean and stddev.
+    :type mean_std_filepath: None|basestring
+    :param augmentation_config: Augmentation configuration in json string.
+                                See AugmentationPipeline.__doc__ for details.
+    :type augmentation_config: str
+    :param max_duration: Audio with duration (in seconds) greater than
+                         this will be discarded.
     :type max_duration: float
-    :param min_duration: Audio clips with duration (in seconds) smaller than
-                         this will be discarded. Default is 0.0.
+    :param min_duration: Audio with duration (in seconds) smaller than
+                         this will be discarded.
     :type min_duration: float
     :param stride_ms: Striding size (in milliseconds) for generating frames.
-                      Default is 10.0.
     :type stride_ms: float
-    :param window_ms: Window size (in milliseconds) for frames. Default is 20.0.
+ :param window_ms: Window size (in milliseconds) for generating frames. :type window_ms: float - :param max_frequency: Maximun frequency for FFT features. FFT features of - frequency larger than this will be discarded. - If set None, all features will be kept. - Default is None. - :type max_frequency: float + :param max_freq: Used when specgram_type is 'linear', only FFT bins + corresponding to frequencies between [0, max_freq] are + returned. + :types max_freq: None|float + :param specgram_type: Specgram feature type. Options: 'linear'. + :type specgram_type: str + :param random_seed: Random seed. + :type random_seed: int """ def __init__(self, @@ -60,6 +57,7 @@ class DataGenerator(object): stride_ms=10.0, window_ms=20.0, max_freq=None, + specgram_type='linear', random_seed=0): self._max_duration = max_duration self._min_duration = min_duration @@ -68,46 +66,49 @@ class DataGenerator(object): augmentation_config=augmentation_config, random_seed=random_seed) self._speech_featurizer = SpeechFeaturizer( vocab_filepath=vocab_filepath, + specgram_type=specgram_type, stride_ms=stride_ms, window_ms=window_ms, - max_freq=max_freq, - random_seed=random_seed) + max_freq=max_freq) self._rng = random.Random(random_seed) self._epoch = 0 def batch_reader_creator(self, manifest_path, batch_size, + min_batch_size=1, padding_to=-1, flatten=False, sortagrad=False, batch_shuffle=False): """ - Batch data reader creator for audio data. Creat a callable function to - produce batches of data. + Batch data reader creator for audio data. Return a callable generator + function to produce batches of data. - Audio features will be padded with zeros to make each instance in the - batch to share the same audio feature shape. + Audio features within one batch will be padded with zeros to have the + same shape, or a user-defined shape. - :param manifest_path: Filepath of manifest for audio clip files. + :param manifest_path: Filepath of manifest for audio files. :type manifest_path: basestring - :param batch_size: Instance number in a batch. + :param batch_size: Number of instances in a batch. :type batch_size: int - :param padding_to: If set -1, the maximun column numbers in the batch - will be used as the target size for padding. - Otherwise, `padding_to` will be the target size. - Default is -1. + :param min_batch_size: Any batch with batch size smaller than this will + be discarded. (To be deprecated in the future.) + :type min_batch_size: int + :param padding_to: If set -1, the maximun shape in the batch + will be used as the target shape for padding. + Otherwise, `padding_to` will be the target shape. :type padding_to: int - :param flatten: If set True, audio data will be flatten to be a 1-dim - ndarray. Otherwise, 2-dim ndarray. Default is False. + :param flatten: If set True, audio features will be flatten to 1darray. :type flatten: bool - :param sortagrad: Sort the audio clips by duration in the first epoc - if set True. + :param sortagrad: If set True, sort the instances by audio duration + in the first epoch for speed up training. :type sortagrad: bool - :param batch_shuffle: Shuffle the audio clips if set True. It is - not a thorough instance-wise shuffle, but a - specific batch-wise shuffle. For more details, - please see `_batch_shuffle` function. + :param batch_shuffle: If set True, instances are batch-wise shuffled. + For more details, please see + ``_batch_shuffle.__doc__``. + If sortagrad is True, batch_shuffle is disabled + for the first epoch. 
:type batch_shuffle: bool :return: Batch reader function, producing batches of data when called. :rtype: callable @@ -132,7 +133,7 @@ class DataGenerator(object): if len(batch) == batch_size: yield self._padding_batch(batch, padding_to, flatten) batch = [] - if len(batch) > 0: + if len(batch) >= min_batch_size: yield self._padding_batch(batch, padding_to, flatten) self._epoch += 1 @@ -140,20 +141,33 @@ class DataGenerator(object): @property def feeding(self): - """Returns data_reader's feeding dict.""" + """Returns data reader's feeding dict. + + :return: Data feeding dict. + :rtype: dict + """ return {"audio_spectrogram": 0, "transcript_text": 1} @property def vocab_size(self): - """Returns vocabulary size.""" + """Return the vocabulary size. + + :return: Vocabulary size. + :rtype: int + """ return self._speech_featurizer.vocab_size @property def vocab_list(self): - """Returns vocabulary list.""" + """Return the vocabulary in list. + + :return: Vocabulary in list. + :rtype: list + """ return self._speech_featurizer.vocab_list def _process_utterance(self, filename, transcript): + """Load, augment, featurize and normalize for speech data.""" speech_segment = SpeechSegment.from_file(filename, transcript) self._augmentation_pipeline.transform_audio(speech_segment) specgram, text_ids = self._speech_featurizer.featurize(speech_segment) @@ -162,16 +176,11 @@ class DataGenerator(object): def _instance_reader_creator(self, manifest): """ - Instance reader creator for audio data. Creat a callable function to - produce instances of data. + Instance reader creator. Create a callable function to produce + instances of data. - Instance: a tuple of a numpy ndarray of audio spectrogram and a list of - tokenized and indexed transcription text. - - :param manifest: Filepath of manifest for audio clip files. - :type manifest: basestring - :return: Data reader function. - :rtype: callable + Instance: a tuple of ndarray of audio spectrogram and a list of + token indices for transcript. """ def reader(): @@ -183,24 +192,22 @@ class DataGenerator(object): def _padding_batch(self, batch, padding_to=-1, flatten=False): """ - Padding audio part of features (only in the time axis -- column axis) - with zeros, to make each instance in the batch share the same - audio feature shape. + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one bach. - If `padding_to` is set -1, the maximun column numbers in the batch will - be used as the target size. Otherwise, `padding_to` will be the target - size. Default is -1. + If ``padding_to`` is -1, the maximun shape in the batch will be used + as the target shape for padding. Otherwise, `padding_to` will be the + target shape (only refers to the second axis). - If `flatten` is set True, audio data will be flatten to be a 1-dim - ndarray. Default is False. + If `flatten` is True, features will be flatten to 1darray. 
""" new_batch = [] # get target shape max_length = max([audio.shape[1] for audio, text in batch]) if padding_to != -1: if padding_to < max_length: - raise ValueError("If padding_to is not -1, it should be greater" - " or equal to the original instance length.") + raise ValueError("If padding_to is not -1, it should be larger " + "than any instance's shape in the batch") max_length = padding_to # padding for audio, text in batch: @@ -212,28 +219,21 @@ class DataGenerator(object): return new_batch def _batch_shuffle(self, manifest, batch_size): - """ - The instances have different lengths and they cannot be - combined into a single matrix multiplication. It usually - sorts the training examples by length and combines only - similarly-sized instances into minibatches, pads with - silence when necessary so that all instances in a batch - have the same length. This batch shuffle fuction is used - to make similarly-sized instances into minibatches and - make a batch-wise shuffle. + """Put similarly-sized instances into minibatches for better efficiency + and make a batch-wise shuffle. 1. Sort the audio clips by duration. 2. Generate a random number `k`, k in [0, batch_size). - 3. Randomly remove `k` instances in order to make different mini-batches, - then make minibatches and each minibatch size is batch_size. + 3. Randomly shift `k` instances in order to create different batches + for different epochs. Create minibatches. 4. Shuffle the minibatches. - :param manifest: manifest file. + :param manifest: Manifest contents. List of dict. :type manifest: list :param batch_size: Batch size. This size is also used for generate a random number for batch shuffle. :type batch_size: int - :return: batch shuffled mainifest. + :return: Batch shuffled mainifest. :rtype: list """ manifest.sort(key=lambda x: x["duration"]) diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py index 5d9c6883..9f9d4e50 100755 --- a/data_utils/featurizer/audio_featurizer.py +++ b/data_utils/featurizer/audio_featurizer.py @@ -1,30 +1,54 @@ +"""Contains the audio featurizer class.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np -import random from data_utils import utils from data_utils.audio import AudioSegment class AudioFeaturizer(object): + """Audio featurizer, for extracting features from audio contents of + AudioSegment or SpeechSegment. + + Currently, it only supports feature type of linear spectrogram. + + :param specgram_type: Specgram feature type. Options: 'linear'. + :type specgram_type: str + :param stride_ms: Striding size (in milliseconds) for generating frames. + :type stride_ms: float + :param window_ms: Window size (in milliseconds) for generating frames. + :type window_ms: float + :param max_freq: Used when specgram_type is 'linear', only FFT bins + corresponding to frequencies between [0, max_freq] are + returned. + :types max_freq: None|float + """ + def __init__(self, specgram_type='linear', stride_ms=10.0, window_ms=20.0, - max_freq=None, - random_seed=0): + max_freq=None): self._specgram_type = specgram_type self._stride_ms = stride_ms self._window_ms = window_ms self._max_freq = max_freq def featurize(self, audio_segment): + """Extract audio features from AudioSegment or SpeechSegment. + + :param audio_segment: Audio/speech segment to extract features from. + :type audio_segment: AudioSegment|SpeechSegment + :return: Spectrogram audio feature in 2darray. 
+ :rtype: ndarray + """ return self._compute_specgram(audio_segment.samples, audio_segment.sample_rate) def _compute_specgram(self, samples, sample_rate): + """Extract various audio features.""" if self._specgram_type == 'linear': return self._compute_linear_specgram( samples, sample_rate, self._stride_ms, self._window_ms, @@ -40,9 +64,7 @@ class AudioFeaturizer(object): window_ms=20.0, max_freq=None, eps=1e-14): - """Laod audio data and calculate the log of spectrogram by FFT. - Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech - """ + """Compute the linear spectrogram from FFT energy.""" if max_freq is None: max_freq = sample_rate / 2 if max_freq > sample_rate / 2: @@ -62,9 +84,7 @@ class AudioFeaturizer(object): return np.log(specgram[:ind, :] + eps) def _specgram_real(self, samples, window_size, stride_size, sample_rate): - """Compute the spectrogram by FFT for a discrete real signal. - Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech - """ + """Compute the spectrogram for samples from a real signal.""" # extract strided windows truncate_size = (len(samples) - window_size) % stride_size samples = samples[:len(samples) - truncate_size] diff --git a/data_utils/featurizer/speech_featurizer.py b/data_utils/featurizer/speech_featurizer.py index 06af7a02..77020455 100755 --- a/data_utils/featurizer/speech_featurizer.py +++ b/data_utils/featurizer/speech_featurizer.py @@ -1,3 +1,4 @@ +"""Contains the speech featurizer class.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -7,26 +8,70 @@ from data_utils.featurizer.text_featurizer import TextFeaturizer class SpeechFeaturizer(object): + """Speech featurizer, for extracting features from both audio and transcript + contents of SpeechSegment. + + Currently, for audio parts, it only supports feature type of linear + spectrogram; for transcript parts, it only supports char-level tokenizing + and conversion into a list of token indices. Note that the token indexing + order follows the given vocabulary file. + + :param vocab_filepath: Filepath to load vocabulary for token indices + conversion. + :type specgram_type: basestring + :param specgram_type: Specgram feature type. Options: 'linear'. + :type specgram_type: str + :param stride_ms: Striding size (in milliseconds) for generating frames. + :type stride_ms: float + :param window_ms: Window size (in milliseconds) for generating frames. + :type window_ms: float + :param max_freq: Used when specgram_type is 'linear', only FFT bins + corresponding to frequencies between [0, max_freq] are + returned. + :types max_freq: None|float + """ + def __init__(self, vocab_filepath, specgram_type='linear', stride_ms=10.0, window_ms=20.0, - max_freq=None, - random_seed=0): - self._audio_featurizer = AudioFeaturizer( - specgram_type, stride_ms, window_ms, max_freq, random_seed) + max_freq=None): + self._audio_featurizer = AudioFeaturizer(specgram_type, stride_ms, + window_ms, max_freq) self._text_featurizer = TextFeaturizer(vocab_filepath) def featurize(self, speech_segment): + """Extract features for speech segment. + + 1. For audio parts, extract the audio features. + 2. For transcript parts, convert text string to a list of token indices + in char-level. + + :param audio_segment: Speech segment to extract features from. + :type audio_segment: SpeechSegment + :return: A tuple of 1) spectrogram audio feature in 2darray, 2) list of + char-level token indices. 
+ :rtype: tuple + """ audio_feature = self._audio_featurizer.featurize(speech_segment) - text_ids = self._text_featurizer.text2ids(speech_segment.transcript) + text_ids = self._text_featurizer.featurize(speech_segment.transcript) return audio_feature, text_ids @property def vocab_size(self): + """Return the vocabulary size. + + :return: Vocabulary size. + :rtype: int + """ return self._text_featurizer.vocab_size @property def vocab_list(self): + """Return the vocabulary in list. + + :return: Vocabulary in list. + :rtype: list + """ return self._text_featurizer.vocab_list diff --git a/data_utils/featurizer/text_featurizer.py b/data_utils/featurizer/text_featurizer.py index 7e4b69d7..4f9a49b5 100755 --- a/data_utils/featurizer/text_featurizer.py +++ b/data_utils/featurizer/text_featurizer.py @@ -1,3 +1,4 @@ +"""Contains the text featurizer class.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -6,26 +7,53 @@ import os class TextFeaturizer(object): + """Text featurizer, for processing or extracting features from text. + + Currently, it only supports char-level tokenizing and conversion into + a list of token indices. Note that the token indexing order follows the + given vocabulary file. + + :param vocab_filepath: Filepath to load vocabulary for token indices + conversion. + :type specgram_type: basestring + """ + def __init__(self, vocab_filepath): self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file( vocab_filepath) - def text2ids(self, text): + def featurize(self, text): + """Convert text string to a list of token indices in char-level.Note + that the token indexing order follows the given vocabulary file. + + :param text: Text to process. + :type text: basestring + :return: List of char-level token indices. + :rtype: list + """ tokens = self._char_tokenize(text) return [self._vocab_dict[token] for token in tokens] - def ids2text(self, ids): - return ''.join([self._vocab_list[id] for id in ids]) - @property def vocab_size(self): + """Return the vocabulary size. + + :return: Vocabulary size. + :rtype: int + """ return len(self._vocab_list) @property def vocab_list(self): + """Return the vocabulary in list. + + :return: Vocabulary in list. + :rtype: list + """ return self._vocab_list def _char_tokenize(self, text): + """Character tokenizer.""" return list(text.strip()) def _load_vocabulary_from_file(self, vocab_filepath): diff --git a/data_utils/normalizer.py b/data_utils/normalizer.py index 364600af..c123d25d 100755 --- a/data_utils/normalizer.py +++ b/data_utils/normalizer.py @@ -1,3 +1,4 @@ +"""Contains feature normalizers.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -9,6 +10,28 @@ from data_utils.audio import AudioSegment class FeatureNormalizer(object): + """Feature normalizer. Normalize features to be of zero mean and unit + stddev. + + if mean_std_filepath is provided (not None), the normalizer will directly + initilize from the file. Otherwise, both manifest_path and featurize_func + should be given for on-the-fly mean and stddev computing. + + :param mean_std_filepath: File containing the pre-computed mean and stddev. + :type mean_std_filepath: None|basestring + :param manifest_path: Manifest of instances for computing mean and stddev. + :type meanifest_path: None|basestring + :param featurize_func: Function to extract features. It should be callable + with ``featurize_func(audio_segment)``. 
+ :type featurize_func: None|callable + :param num_samples: Number of random samples for computing mean and stddev. + :type num_samples: int + :param random_seed: Random seed for sampling instances. + :type random_seed: int + :raises ValueError: If both mean_std_filepath and manifest_path + (or both mean_std_filepath and featurize_func) are None. + """ + def __init__(self, mean_std_filepath, manifest_path=None, @@ -25,18 +48,33 @@ class FeatureNormalizer(object): self._read_mean_std_from_file(mean_std_filepath) def apply(self, features, eps=1e-14): - """Normalize features to be of zero mean and unit stddev.""" + """Normalize features to be of zero mean and unit stddev. + + :param features: Input features to be normalized. + :type features: ndarray + :param eps: added to stddev to provide numerical stablibity. + :type eps: float + :return: Normalized features. + :rtype: ndarray + """ return (features - self._mean) / (self._std + eps) def write_to_file(self, filepath): + """Write the mean and stddev to the file. + + :param filepath: File to write mean and stddev. + :type filepath: basestring + """ np.savez(filepath, mean=self._mean, std=self._std) def _read_mean_std_from_file(self, filepath): + """Load mean and std from file.""" npzfile = np.load(filepath) self._mean = npzfile["mean"] self._std = npzfile["std"] def _compute_mean_std(self, manifest_path, featurize_func, num_samples): + """Compute mean and std from randomly sampled instances.""" manifest = utils.read_manifest(manifest_path) sampled_manifest = self._rng.sample(manifest, num_samples) features = [] diff --git a/data_utils/speech.py b/data_utils/speech.py new file mode 100755 index 00000000..48db595b --- /dev/null +++ b/data_utils/speech.py @@ -0,0 +1,75 @@ +"""Contains the speech segment class.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from data_utils.audio import AudioSegment + + +class SpeechSegment(AudioSegment): + """Speech segment abstraction, a subclass of AudioSegment, + with an additional transcript. + + :param samples: Audio samples [num_samples x num_channels]. + :type samples: ndarray.float32 + :param sample_rate: Audio sample rate. + :type sample_rate: int + :param transcript: Transcript text for the speech. + :type transript: basestring + :raises TypeError: If the sample data type is not float or int. + """ + + def __init__(self, samples, sample_rate, transcript): + AudioSegment.__init__(self, samples, sample_rate) + self._transcript = transcript + + def __eq__(self, other): + """Return whether two objects are equal. + """ + if not AudioSegment.__eq__(self, other): + return False + if self._transcript != other._transcript: + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + @classmethod + def from_file(cls, filepath, transcript): + """Create speech segment from audio file and corresponding transcript. + + :param filepath: Filepath or file object to audio file. + :type filepath: basestring|file + :param transcript: Transcript text for the speech. + :type transript: basestring + :return: Audio segment instance. + :rtype: AudioSegment + """ + audio = AudioSegment.from_file(filepath) + return cls(audio.samples, audio.sample_rate, transcript) + + @classmethod + def from_bytes(cls, bytes, transcript): + """Create speech segment from a byte string and corresponding + transcript. + + :param bytes: Byte string containing audio samples. 
+ :type bytes: str + :param transcript: Transcript text for the speech. + :type transript: basestring + :return: Audio segment instance. + :rtype: AudioSegment + """ + audio = AudioSegment.from_bytes(bytes) + return cls(audio.samples, audio.sample_rate, transcript) + + @property + def transcript(self): + """Return the transcript text. + + :return: Transcript text for the speech. + :rtype: basestring + """ + return self._transcript diff --git a/data_utils/utils.py b/data_utils/utils.py index 2a916b54..3f116571 100755 --- a/data_utils/utils.py +++ b/data_utils/utils.py @@ -1,3 +1,4 @@ +"""Contains data helper functions.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -6,7 +7,21 @@ import json def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): - """Load and parse manifest file.""" + """Load and parse manifest file. + + Instances with durations outside [min_duration, max_duration] will be + filtered out. + + :param manifest_path: Manifest file to load and parse. + :type manifest_path: basestring + :param max_duration: Maximal duration in seconds for instance filter. + :type max_duration: float + :param min_duration: Minimal duration in seconds for instance filter. + :type min_duration: float + :return: Manifest parsing results. List of dict. + :rtype: list + :raises IOError: If failed to parse the manifest. + """ manifest = [] for json_line in open(manifest_path): try: diff --git a/datasets/librispeech/librispeech.py b/datasets/librispeech/librispeech.py index 1ba2a442..faf038cc 100644 --- a/datasets/librispeech/librispeech.py +++ b/datasets/librispeech/librispeech.py @@ -1,13 +1,14 @@ -""" - Download, unpack and create manifest json files for the Librespeech dataset. +"""Prepare Librispeech ASR datasets. - A manifest is a json file summarizing filelist in a data set, with each line - containing the meta data (i.e. audio filepath, transcription text, audio - duration) of each audio file in the data set. +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. """ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function -import paddle.v2 as paddle -from paddle.v2.dataset.common import md5file import distutils.util import os import wget @@ -15,6 +16,7 @@ import tarfile import argparse import soundfile import json +from paddle.v2.dataset.common import md5file DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') diff --git a/decoder.py b/decoder.py index 7c4b9526..8314885c 100755 --- a/decoder.py +++ b/decoder.py @@ -1,9 +1,10 @@ -""" - CTC-like decoder utilitis. -""" +"""Contains various CTC decoder.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function -from itertools import groupby import numpy as np +from itertools import groupby def ctc_best_path_decode(probs_seq, vocabulary): diff --git a/infer.py b/infer.py index eb31254c..f7c99df1 100644 --- a/infer.py +++ b/infer.py @@ -1,7 +1,4 @@ -""" - Inference for a simplifed version of Baidu DeepSpeech2 model. 
-"""
-
+"""Inferer for DeepSpeech2 model."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/model.py b/model.py
index 13ff829b..cb0b4ecb 100644
--- a/model.py
+++ b/model.py
@@ -1,11 +1,10 @@
-"""
-    A simplifed version of Baidu DeepSpeech2 model.
-"""
+"""Contains DeepSpeech2 model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 
 import paddle.v2 as paddle
 
-#TODO: add bidirectional rnn.
-
 
 def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out,
                   stride, padding, act):
diff --git a/train.py b/train.py
index c6aa9752..7ac4626f 100644
--- a/train.py
+++ b/train.py
@@ -1,7 +1,4 @@
-"""
-    Trainer for a simplifed version of Baidu DeepSpeech2 model.
-"""
-
+"""Trainer for DeepSpeech2 model."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -164,7 +161,7 @@ def train():
                 print("\nPass: %d, Batch: %d, TrainCost: %f" %
                       (event.pass_id, event.batch_id, cost_sum / cost_counter))
                 cost_sum, cost_counter = 0.0, 0
-                with gzip.open("params.tar.gz", 'w') as f:
+                with gzip.open("params_tmp.tar.gz", 'w') as f:
                     parameters.to_tar(f)
             else:
                 sys.stdout.write('.')

From 1cef98f2101b37c9ff63a02ed6955c99f5edb09e Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Tue, 13 Jun 2017 23:33:38 +0800
Subject: [PATCH 22/55] Update README.md for DS2.

---
 README.md | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 7a372e9b..23e0b412 100644
--- a/README.md
+++ b/README.md
@@ -16,34 +16,48 @@ For some machines, we also need to install libsndfile1. Details to be added.
 ### Preparing Data
 
 ```
-cd data
-python librispeech.py
-cat manifest.libri.train-* > manifest.libri.train-all
+cd datasets
+sh run_all.sh
 cd ..
 ```
 
-After running librispeech.py, we have several "manifest" json files named with a prefix `manifest.libri.`. A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcription text, audio duration) of each audio file within the data set, in json format.
+`sh run_all.sh` prepares all ASR datasets (currently, only LibriSpeech is available). After it finishes, we have several manifest files in JSON format, each summarizing a data set.
 
-By `cat manifest.libri.train-* > manifest.libri.train-all`, we simply merge the three seperate sample sets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way for merging different data sets.
+A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcript text, audio duration) of each audio file within the data set, in JSON format. A manifest file serves as an interface that tells our system where to find each speech sample and what to read from it.
+
+More help for arguments:
+
+```
+python datasets/librispeech/librispeech.py --help
+```
+
+### Preparing for Training
+
+```
+python compute_mean_std.py
+```
+
+`python compute_mean_std.py` computes the mean and standard deviation of the audio features, and saves them to a file named `./mean_std.npz` by default. This file will be used in both training and inference.
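For reference, the saved archive can be inspected with NumPy (a quick sketch; the `mean` and `std` keys follow `FeatureNormalizer.write_to_file` in `data_utils/normalizer.py`):

```
import numpy as np

npz = np.load('mean_std.npz')          # produced by compute_mean_std.py
mean, std = npz['mean'], npz['std']    # per-feature statistics
print(mean.shape, std.shape)
```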
More help for arguments: ``` -python librispeech.py --help +python compute_mean_std.py --help ``` -### Traininig +### Training For GPU Training: ``` -CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all +CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 ``` For CPU Training: ``` -python train.py --trainer_count 8 --use_gpu False -- train_manifest_path ./data/manifest.libri.train-all +python train.py --trainer_count 8 --use_gpu False ``` More help for arguments: @@ -55,7 +69,7 @@ python train.py --help ### Inferencing ``` -python infer.py +CUDA_VISIBLE_DEVICES=0 python infer.py ``` More help for arguments: From f85f8558cf8fb6b3037f6d0c4b4be8dd30afdc0d Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 14 Jun 2017 15:00:10 +0800 Subject: [PATCH 23/55] Add unittest. --- tests/test_error_rate.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 tests/test_error_rate.py diff --git a/tests/test_error_rate.py b/tests/test_error_rate.py new file mode 100644 index 00000000..bb6dca30 --- /dev/null +++ b/tests/test_error_rate.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +import unittest +import sys +sys.path.append('..') +import error_rate + + +class TestParse(unittest.TestCase): + def test_wer(self): + ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night' + hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last night' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.769230769231) < 1e-6) + + def test_cer_en(self): + ref = 'werewolf' + hyp = 'weae wolf' + char_error_rate = error_rate.cer(ref, hyp) + self.assertTrue(abs(char_error_rate - 0.25) < 1e-6) + + def test_cer_zh(self): + ref = u'我是中国人' + hyp = u'我是 美洲人' + char_error_rate = error_rate.cer(ref, hyp) + self.assertTrue(abs(char_error_rate - 0.6) < 1e-6) + + +if __name__ == '__main__': + unittest.main() From 04a225ae4f8f7f4af068207627bb65b93bdd5fe6 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 14 Jun 2017 18:14:50 +0800 Subject: [PATCH 24/55] Enable min_batch_num in train.py and update train info print. 
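An aside on the unit tests added in PATCH 23 above: they assume the standard error-rate definitions, where WER is the word-level edit distance divided by the number of reference words, and CER is the analogous character-level ratio. A minimal reference implementation of that WER definition, as an editor's sketch independent of the `error_rate` module under test:

```
def wer_reference(ref, hyp):
    """Word error rate: word-level Levenshtein distance / #reference words."""
    r, h = ref.split(), hyp.split()
    # d[i][j] is the edit distance between r[:i] and h[:j].
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            cost = 0 if r[i - 1] == h[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return float(d[len(r)][len(h)]) / len(r)
```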
--- compute_mean_std.py | 0 data_utils/__init__.py | 0 data_utils/audio.py | 0 data_utils/augmentor/__init__.py | 0 data_utils/augmentor/augmentation.py | 0 data_utils/augmentor/base.py | 0 data_utils/augmentor/volume_perturb.py | 0 data_utils/featurizer/__init__.py | 0 data_utils/featurizer/audio_featurizer.py | 0 data_utils/featurizer/speech_featurizer.py | 0 data_utils/featurizer/text_featurizer.py | 0 data_utils/normalizer.py | 0 data_utils/speech.py | 0 data_utils/utils.py | 0 datasets/run_all.sh | 0 decoder.py | 0 train.py | 10 ++++++---- 17 files changed, 6 insertions(+), 4 deletions(-) mode change 100755 => 100644 compute_mean_std.py mode change 100755 => 100644 data_utils/__init__.py mode change 100755 => 100644 data_utils/audio.py mode change 100755 => 100644 data_utils/augmentor/__init__.py mode change 100755 => 100644 data_utils/augmentor/augmentation.py mode change 100755 => 100644 data_utils/augmentor/base.py mode change 100755 => 100644 data_utils/augmentor/volume_perturb.py mode change 100755 => 100644 data_utils/featurizer/__init__.py mode change 100755 => 100644 data_utils/featurizer/audio_featurizer.py mode change 100755 => 100644 data_utils/featurizer/speech_featurizer.py mode change 100755 => 100644 data_utils/featurizer/text_featurizer.py mode change 100755 => 100644 data_utils/normalizer.py mode change 100755 => 100644 data_utils/speech.py mode change 100755 => 100644 data_utils/utils.py mode change 100755 => 100644 datasets/run_all.sh mode change 100755 => 100644 decoder.py diff --git a/compute_mean_std.py b/compute_mean_std.py old mode 100755 new mode 100644 diff --git a/data_utils/__init__.py b/data_utils/__init__.py old mode 100755 new mode 100644 diff --git a/data_utils/audio.py b/data_utils/audio.py old mode 100755 new mode 100644 diff --git a/data_utils/augmentor/__init__.py b/data_utils/augmentor/__init__.py old mode 100755 new mode 100644 diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py old mode 100755 new mode 100644 diff --git a/data_utils/augmentor/base.py b/data_utils/augmentor/base.py old mode 100755 new mode 100644 diff --git a/data_utils/augmentor/volume_perturb.py b/data_utils/augmentor/volume_perturb.py old mode 100755 new mode 100644 diff --git a/data_utils/featurizer/__init__.py b/data_utils/featurizer/__init__.py old mode 100755 new mode 100644 diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py old mode 100755 new mode 100644 diff --git a/data_utils/featurizer/speech_featurizer.py b/data_utils/featurizer/speech_featurizer.py old mode 100755 new mode 100644 diff --git a/data_utils/featurizer/text_featurizer.py b/data_utils/featurizer/text_featurizer.py old mode 100755 new mode 100644 diff --git a/data_utils/normalizer.py b/data_utils/normalizer.py old mode 100755 new mode 100644 diff --git a/data_utils/speech.py b/data_utils/speech.py old mode 100755 new mode 100644 diff --git a/data_utils/utils.py b/data_utils/utils.py old mode 100755 new mode 100644 diff --git a/datasets/run_all.sh b/datasets/run_all.sh old mode 100755 new mode 100644 diff --git a/decoder.py b/decoder.py old mode 100755 new mode 100644 diff --git a/train.py b/train.py index 7ac4626f..6074aa35 100644 --- a/train.py +++ b/train.py @@ -143,11 +143,13 @@ def train(): train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest_path, batch_size=args.batch_size, + min_batch_size=args.trainer_count, sortagrad=args.use_sortagrad if args.init_model_path is None else 
+                               False,
         batch_shuffle=True)
     test_batch_reader = test_generator.batch_reader_creator(
         manifest_path=args.dev_manifest_path,
         batch_size=args.batch_size,
+        min_batch_size=1,  # must be 1; other values currently cause errors.
         sortagrad=False,
         batch_shuffle=False)
@@ -157,11 +159,11 @@ def train():
         if isinstance(event, paddle.event.EndIteration):
             cost_sum += event.cost
             cost_counter += 1
-            if event.batch_id % 50 == 0:
-                print("\nPass: %d, Batch: %d, TrainCost: %f" %
-                      (event.pass_id, event.batch_id, cost_sum / cost_counter))
+            if (event.batch_id + 1) % 100 == 0:
+                print("\nPass: %d, Batch: %d, TrainCost: %f" % (
+                    event.pass_id, event.batch_id + 1, cost_sum / cost_counter))
                 cost_sum, cost_counter = 0.0, 0
-                with gzip.open("params_tmp.tar.gz", 'w') as f:
+                with gzip.open("params.tar.gz", 'w') as f:
                     parameters.to_tar(f)
             else:
                 sys.stdout.write('.')

From 65e34c535b4444c42c28f14b16a2617a73d296d1 Mon Sep 17 00:00:00 2001
From: chrisxu2016 <823254351@qq.com>
Date: Thu, 15 Jun 2017 03:08:30 +0800
Subject: [PATCH 25/55] add augmentation

---
 data_utils/audio.py                                | 396 ++++++++++++++++-
 data_utils/augmentor/audio_database.py             | 401 ++++++++++++++++++
 data_utils/augmentor/augmentation.py               |  15 +
 data_utils/augmentor/implus_response.py            |  76 ++++
 data_utils/augmentor/noise_speech.py               | 318 ++++++++++++++
 .../online_bayesian_normalization.py               |  57 +++
 data_utils/augmentor/resampler.py                  |  30 ++
 data_utils/augmentor/speed_perturb.py              |  53 +++
 data_utils/augmentor/volume_perturb.py             |   4 +-
 9 files changed, 1337 insertions(+), 13 deletions(-)
 create mode 100755 data_utils/augmentor/audio_database.py
 create mode 100755 data_utils/augmentor/implus_response.py
 create mode 100755 data_utils/augmentor/noise_speech.py
 create mode 100755 data_utils/augmentor/online_bayesian_normalization.py
 create mode 100755 data_utils/augmentor/resampler.py
 create mode 100755 data_utils/augmentor/speed_perturb.py

diff --git a/data_utils/audio.py b/data_utils/audio.py
index 916c8ac1..aef13c30 100755
--- a/data_utils/audio.py
+++ b/data_utils/audio.py
@@ -6,6 +6,9 @@ from __future__ import print_function
 import numpy as np
 import io
+import random  # used by random_subsegment() and add_noise() below
 import soundfile
+import scikits.samplerate
+from scipy import signal
 
 
 class AudioSegment(object):
@@ -62,6 +64,69 @@ class AudioSegment(object):
         samples, sample_rate = soundfile.read(file, dtype='float32')
         return cls(samples, sample_rate)
 
+    @classmethod
+    def slice_from_file(cls, fname, start=None, end=None):
+        """
+        Loads a small section of an audio file without having to load
+        the entire file into memory, which can be incredibly wasteful.
+
+        :param fname: input audio file name
+        :type fname: basestring
+        :param start: start time in seconds (supported granularity is ms)
+            If start is negative, it wraps around from the end. If not
+            provided, this function reads from the very beginning.
+        :type start: float
+        :param end: end time in seconds (supported granularity is ms)
+            If end is negative, it wraps around from the end. If not
+            provided, the default behavior is to read to the end of the
+            file.
+        :type end: float
+
+        :return: the specified slice of input audio in the audio.AudioSegment
+            format.
+ """ + sndfile = soundfile.SoundFile(fname) + + sample_rate = sndfile.samplerate + if sndfile.channels != 1: + raise TypeError("{} has more than 1 channel.".format(fname)) + + duration = float(len(sndfile)) / sample_rate + + if start is None: + start = 0.0 + if end is None: + end = duration + + if start < 0.0: + start += duration + if end < 0.0: + end += duration + + if start < 0.0: + raise IndexError("The slice start position ({} s) is out of " + "bounds. Filename: {}".format(start, fname)) + if end < 0.0: + raise IndexError("The slice end position ({} s) is out of bounds " + "Filename: {}".format(end, fname)) + + if start > end: + raise IndexError("The slice start position ({} s) is later than " + "the slice end position ({} s)." + .format(start, end)) + + if end > duration: + raise ValueError("The slice end time ({} s) is out of " + "bounds (> {} s) Filename: {}" + .format(end, duration, fname)) + + start_frame = int(start * sample_rate) + end_frame = int(end * sample_rate) + sndfile.seek(start_frame) + data = sndfile.read(frames=end_frame - start_frame, dtype='float32') + + return cls(data, sample_rate) + @classmethod def from_bytes(cls, bytes): """Create audio segment from a byte string containing audio samples. @@ -75,6 +140,44 @@ class AudioSegment(object): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent audio segment of the given duration and + sample rate. + + :param duration: length of silence in seconds + :type duration: scalar + :param sample_rate: sample rate + :type sample_rate: scalar + :returns: silence of the given duration + :rtype: AudioSegment + """ + samples = np.zeros(int(float(duration) * sample_rate)) + return cls(samples, sample_rate) + + @classmethod + def concatenate(cls, *segments): + """Concatenate an arbitrary number of audio segments together. + + :param *segments: input audio segments + :type *segments: [AudioSegment] + """ + # Perform basic sanity-checks. + N = len(segments) + if N == 0: + raise ValueError("No audio segments are given to concatenate.") + sample_rate = segments[0]._sample_rate + for segment in segments: + if sample_rate != segment._sample_rate: + raise ValueError("Can't concatenate segments with " + "different sample rates") + if type(segment) is not cls: + raise TypeError("Only audio segments of the same type " + "instance can be concatenated.") + + samples = np.concatenate([seg.samples for seg in segments]) + return cls(samples, sample_rate) + def to_wav_file(self, filepath, dtype='float32'): """Save audio segment to disk as wav file. @@ -143,23 +246,288 @@ class AudioSegment(object): new_indices = np.linspace(start=0, stop=old_length, num=new_length) self._samples = np.interp(new_indices, old_indices, self._samples) - def normalize(self, target_sample_rate): - raise NotImplementedError() + def normalize(self, target_db=-20, max_gain_db=300.0): + """Normalize audio to desired RMS value in decibels. + + Note that this is an in-place transformation. + + :param target_db: Target RMS value in decibels.This value + should be less than 0.0 as 0.0 is full-scale audio. + :type target_db: float, optional + :param max_gain_db: Max amount of gain in dB that can be applied + for normalization. This is to prevent nans when attempting + to normalize a signal consisting of all zeros. 
+ :type max_gain_db: float, optional - def resample(self, target_sample_rate): - raise NotImplementedError() + :raises NormalizationWarning: if the required gain to normalize the + segment to the target_db value exceeds max_gain_db. + """ + gain = target_db - self.rms_db + if gain > max_gain_db: + raise ValueError( + "Unable to normalize segment to {} dB because it has an RMS " + "value of {} dB and the difference exceeds max_gain_db ({} dB)" + .format(target_db, self.rms_db, max_gain_db)) + gain = min(max_gain_db, target_db - self.rms_db) + self.apply_gain(gain) + + def normalize_online_bayesian(self, + target_db, + prior_db, + prior_samples, + startup_delay=0.0): + """ + Normalize audio using a production-compatible online/causal algorithm. + This uses an exponential likelihood and gamma prior to make + online estimates of the RMS even when there are very few samples. + + Note that this is an in-place transformation. + + :param target_db: Target RMS value in decibels + :type target_bd: scalar + :param prior_db: Prior RMS estimate in decibels + :type prior_db: scalar + :param prior_samples: Prior strength in number of samples + :type prior_samples: scalar + :param startup_delay: Default: 0.0 s. If provided, this + function will accrue statistics for the first startup_delay + seconds before applying online normalization. + :type startup_delay: scalar + """ + # Estimate total RMS online + startup_sample_idx = min(self.num_samples - 1, + int(self.sample_rate * startup_delay)) + prior_mean_squared = 10.**(prior_db / 10.) + prior_sum_of_squares = prior_mean_squared * prior_samples + cumsum_of_squares = np.cumsum(self.samples**2) + sample_count = np.arange(len(self)) + 1 + if startup_sample_idx > 0: + cumsum_of_squares[:startup_sample_idx] = \ + cumsum_of_squares[startup_sample_idx] + sample_count[:startup_sample_idx] = \ + sample_count[startup_sample_idx] + mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) / + (sample_count + prior_samples)) + rms_estimate_db = 10 * np.log10(mean_squared_estimate) + + # Compute required time-varying gain + gain_db = target_db - rms_estimate_db + + # Apply gain to new segment + self.apply_gain(gain_db) + + def normalize_ewma(self, + target_db, + decay_rate, + startup_delay, + rms_eps=1e-6, + max_gain_db=300.0): + startup_sample_idx = min(self.num_samples - 1, + int(self.sample_rate * startup_delay)) + mean_sq = self.samples**2 + if startup_sample_idx > 0: + mean_sq[:startup_sample_idx] = \ + np.sum(mean_sq[:startup_sample_idx]) / startup_sample_idx + idx_start = max(0, startup_sample_idx - 1) + initial_condition = mean_sq[idx_start] * decay_rate + mean_sq[idx_start:] = lfilter( + [1.0 - decay_rate], [1.0, -decay_rate], + mean_sq[idx_start:], + axis=0, + zi=[initial_condition])[0] + rms_estimate_db = 10.0 * np.log10(mean_sq + rms_eps) + gain_db = target_db - rms_estimate_db + if np.any(gain_db > max_gain_db): + warnings.warn( + "Unable to normalize segment to {} dB because it has an RMS " + "value of {} dB and the difference exceeds max_gain_db ({} dB)" + .format(target_db, self.rms_db, max_gain_db), + NormalizationWarning) + gain_db = np.minimum(gain_db, max_gain_db) + self.apply_gain(gain_db) + + def resample(self, target_sample_rate, quality='sinc_medium'): + """Resample audio and return new AudioSegment. + This resamples the audio to a new sample rate and returns a brand + new AudioSegment. The existing AudioSegment is unchanged. + + Note that this is an in-place transformation. 
+
+        :param target_sample_rate: target sample rate
+        :type target_sample_rate: scalar
+        :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
+            Sets resampling speed/quality tradeoff.
+            See http://www.mega-nerd.com/SRC/api_misc.html#Converters
+        :type quality: basestring
+        """
+        resample_ratio = float(target_sample_rate) / self._sample_rate
+        new_samples = scikits.samplerate.resample(
+            self._samples, r=resample_ratio, type=quality)
+        self._samples = new_samples
+        self._sample_rate = target_sample_rate
 
     def pad_silence(self, duration, sides='both'):
-        raise NotImplementedError()
+        """Pads this audio sample with a period of silence.
+
+        Note that this is an in-place transformation.
+
+        :param duration: length of silence in seconds to pad
+        :type duration: float
+        :param sides:
+            'beginning' - adds silence in the beginning
+            'end' - adds silence in the end
+            'both' - adds silence in both the beginning and the end.
+        :type sides: basestring
+        """
+        if duration == 0.0:
+            return self
+        cls = type(self)
+        silence = cls.make_silence(duration, self._sample_rate)
+        if sides == "beginning":
+            padded = cls.concatenate(silence, self)
+        elif sides == "end":
+            padded = cls.concatenate(self, silence)
+        elif sides == "both":
+            padded = cls.concatenate(silence, self, silence)
+        else:
+            raise ValueError("Unknown value for the kwarg 'sides'")
+        self._samples = padded._samples
+        self._sample_rate = padded._sample_rate
 
     def subsegment(self, start_sec=None, end_sec=None):
-        raise NotImplementedError()
+        """Return new AudioSegment containing audio between given boundaries.
+
+        :param start_sec: Beginning of subsegment in seconds,
+            (beginning of segment if None).
+        :type start_sec: scalar
+        :param end_sec: End of subsegment in seconds,
+            (end of segment if None).
+        :type end_sec: scalar
+
+        :return: New AudioSegment containing specified subsegment.
+        :rtype: AudioSegment
+        """
+        # Default boundaries
+        if start_sec is None:
+            start_sec = 0.0
+        if end_sec is None:
+            end_sec = self.duration
+
+        # negative boundaries are relative to end of segment
+        if start_sec < 0.0:
+            start_sec = self.duration + start_sec
+        if end_sec < 0.0:
+            end_sec = self.duration + end_sec
 
-    def convolve(self, filter, allow_resample=False):
-        raise NotImplementedError()
+        start_sample = int(round(start_sec * self._sample_rate))
+        end_sample = int(round(end_sec * self._sample_rate))
+        samples = self._samples[start_sample:end_sample]
 
-    def convolve_and_normalize(self, filter, allow_resample=False):
-        raise NotImplementedError()
+        return type(self)(samples, sample_rate=self._sample_rate)
+
+    def random_subsegment(self, subsegment_length, rng=None):
+        """
+        Return a random subsegment of a specified length in seconds.
+
+        :param subsegment_length: Subsegment length in seconds.
+        :type subsegment_length: scalar
+        :param rng: Random number generator state.
+        :type rng: random.Random [optional]
+
+        :return: New AudioSegment containing a random subsegment of the
+            original segment.
+        :rtype: AudioSegment
+        """
+        if rng is None:
+            rng = random.Random()
+
+        if subsegment_length > self.duration:
+            raise ValueError("Length of subsegment must not be greater "
+                             "than original segment.")
+        start_time = rng.uniform(0.0, self.duration - subsegment_length)
+        return self.subsegment(start_time, start_time + subsegment_length)
+
+    def convolve(self, ir, allow_resampling=False):
+        """Convolve this audio segment with the given filter.
+
+        :param ir: impulse response
+        :type ir: AudioSegment
+        :param allow_resampling: indicates whether resampling is allowed
+            when the ir has a different sample rate from this signal.
+        :type allow_resampling: boolean
+        """
+        if allow_resampling and self.sample_rate != ir.sample_rate:
+            # resample() works in place, so no reassignment is needed.
+            ir.resample(self.sample_rate)
+
+        if self.sample_rate != ir.sample_rate:
+            raise ValueError("Impulse response sample rate ({}Hz) is not "
+                             "equal to base signal sample rate ({}Hz)."
+                             .format(ir.sample_rate, self.sample_rate))
+
+        samples = signal.fftconvolve(self.samples, ir.samples, "full")
+        self._samples = samples
+
+    def convolve_and_normalize(self, ir, allow_resampling=False):
+        """Convolve and normalize the resulting audio segment so that it
+        has the same average power as the input signal.
+
+        :param ir: impulse response
+        :type ir: AudioSegment
+        :param allow_resampling: indicates whether resampling is allowed
+            when the ir has a different sample rate from this signal.
+        :type allow_resampling: boolean
+        """
+        self.convolve(ir, allow_resampling=allow_resampling)
+        self.normalize(target_db=self.rms_db)
+
+    def add_noise(self,
+                  noise,
+                  snr_dB,
+                  allow_downsampling=False,
+                  max_gain_db=300.0,
+                  rng=None):
+        """Adds the given noise segment at a specific signal-to-noise ratio.
+        If the noise segment is longer than this segment, a random subsegment
+        of matching length is sampled from it and used instead.
+
+        Note that this is an in-place transformation.
+
+        :param noise: Noise signal to add.
+        :type noise: AudioSegment
+        :param snr_dB: Signal-to-Noise Ratio, in decibels.
+        :type snr_dB: scalar
+        :param allow_downsampling: whether to allow the noise signal
+            to be downsampled to match the base signal sample rate.
+        :type allow_downsampling: boolean
+        :param max_gain_db: Maximum amount of gain to apply to noise
+            signal before adding it in. This is to prevent attempting
+            to apply infinite gain to a zero signal.
+        :type max_gain_db: scalar
+        :param rng: Random number generator state.
+        :type rng: random.Random
+        """
+        if rng is None:
+            rng = random.Random()
+
+        if allow_downsampling and noise.sample_rate > self.sample_rate:
+            # resample() works in place, so no reassignment is needed.
+            noise.resample(self.sample_rate)
+
+        if noise.sample_rate != self.sample_rate:
+            raise ValueError("Noise sample rate ({}Hz) is not equal to "
+                             "base signal sample rate ({}Hz)."
+                             .format(noise.sample_rate, self.sample_rate))
+        if noise.duration < self.duration:
+            raise ValueError("Noise signal ({} sec) must be at "
+                             "least as long as base signal ({} sec)."
% samples.dtype) return output_samples.astype(dtype) + + def tranform_noise(self, noise_subsegment, noise_gain_db): + """ tranform noise file + """ + return type(self)(noise_subsegment._samples * (10.**( + noise_gain_db / 20.)), noise_subsegment._sample_rate) diff --git a/data_utils/augmentor/audio_database.py b/data_utils/augmentor/audio_database.py new file mode 100755 index 00000000..e41c6dd7 --- /dev/null +++ b/data_utils/augmentor/audio_database.py @@ -0,0 +1,401 @@ +from __future__ import print_function +from collections import defaultdict +import bisect +import logging +import numpy as np +import os +import random +import sys + +UNK_TAG = "" + + +def stream_audio_index(fname, UNK=UNK_TAG): + """Reads an audio index file and emits one record in the index at a time. + + :param fname: audio index path + :type fname: basestring + :param UNK: UNK token to denote that certain audios are not tagged. + :type UNK: basesring + + Yields: + idx, duration, size, relpath, tags (int, float, int, str, list(str)): + audio file id, length of the audio in seconds, size in byte, + relative path w.r.t. to the root noise directory, list of tags + """ + with open(fname) as audio_index_file: + for i, line in enumerate(audio_index_file): + tok = line.strip().split("\t") + assert len(tok) >= 4, \ + "Invalid line at line {} in file {}".format( + i + 1, audio_index_file) + idx = int(tok[0]) + duration = float(tok[1]) + # Sometimes, the duration can round down to 0.0 + assert duration >= 0.0, \ + "Invalid duration at line {} in file {}".format( + i + 1, audio_index_file) + size = int(tok[2]) + assert size > 0, \ + "Invalid size at line {} in file {}".format( + i + 1, audio_index_file) + relpath = tok[3] + if len(tok) == 4: + tags = [UNK_TAG] + else: + tags = tok[4:] + yield idx, duration, size, relpath, tags + + +def truncate_float(val, ndigits=6): + """ Truncates a floating-point value to have the desired number of + digits after the decimal point. + + :param val: input value. + :type val: float + :parma ndigits: desired number of digits. + :type ndigits: int + + :return: truncated value + :rtype: float + """ + p = 10.0**ndigits + return float(int(val * p)) / p + + +def print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout): + """Prints an audio record to the index file. + + :param idx: Audio file id. + :type idx: int + :param duration: length of the audio in seconds + :type duration: float + :param size: size of the file in bytes + :type size: int + :param relpath: relative path w.r.t. to the root noise directory. + :type relpath: basestring + :parma tags: list of tags + :parma tags: list(str) + :parma file: file to which we want to write an audio record. + :type file: sys.stdout + """ + file.write("{}\t{:.6f}\t{}\t{}" + .format(idx, truncate_float(duration, ndigits=6), size, relpath)) + for tag in tags: + file.write("\t{}".format(tag)) + file.write("\n") + + +class AudioIndex(object): + """ In-memory index of audio files that do not have annotations. + This supports duration-based sampling and sampling from a target + distribution. 
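# Editor's aside -- a sketch of the tab-separated record format the helpers
# above read and write (the index path and all field values are made up):
#
#     idx   duration   size    relpath       tags...
#     7     3.250000   104044  noise/x.wav   crowd
#
import sys
for record in stream_audio_index('audio_index.txt'):  # hypothetical path
    idx, duration, size, relpath, tags = record
    print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout)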
+ + Each line in the index file consists of the following fields: + (id (int), duration (float), size (int), relative path (str), + list of tags ([str])) + """ + + def __init__(self): + self.audio_dir = None + self.index_fname = None + self.tags = None + self.bin_size = 2.0 + self.clear() + + def clear(self): + """ Clears the index + + Returns: + None + """ + self.idx_to_record = {} + # The list of indices correspond to audio files whose duration is + # greater than or equal to the key. + self.duration_to_id_set = {} + self.duration_to_id_set_per_tag = defaultdict(lambda: {}) + self.duration_to_list = defaultdict(lambda: []) + self.duration_to_list_per_tag = defaultdict( + lambda: defaultdict(lambda: [])) + self.tag_to_id_set = defaultdict(lambda: set()) + self.shared_duration_bins = [] + self.id_set_complete = set() + self.id_set = set() + self.duration_bins = [] + + def has_audio(self, distr=None): + """ + :param distr: The target distribution of audio tags that we want to + match. If this is not supplied, the function simply checks that + there are some audio files. + :parma distr: dict + :return: True if there are audio files. + :rtype: boolean + """ + if distr is None: + return len(self.id_set) > 0 + else: + for tag in distr: + if tag not in self.duration_to_list_per_tag: + return False + return True + + def _load_all_records_from_disk(self, audio_dir, idx_fname, bin_size): + """Loads all audio records from the disk into memory and groups them + into chunks based on their duration and the bin_size granalarity. + + Once all the records are read, indices are built from these records + by another function so that the audio samples can be drawn efficiently. + + Updates: + self.audio_dir (path): audio root directory + self.idx_fname (path): audio database index filename + self.bin_size (float): granularity of bins + self.idx_to_record (dict): maps from the audio id to + (duration, file_size, relative_path, tags) + self.tag_to_id_set (dict): maps from the tag to + the set of id's of audios that have this tag. + self.id_set_complete (set): set of all audio id's in the index file + self.min_duration (float): minimum audio duration observed in the + index file + self.duration_bins (list): the lower bounds on the duration of + audio files falling in each bin + self.duration_to_id_set (dict): contains (k, v) where v is the set + of id's of audios whose lengths are longer than or equal to k. + (e.g. k is the duration lower bound of this bin). + self.duration_to_id_set_per_tag (dict): Something like above but + has a finer granularity mapping from the tag to + duration_to_id_set. + self.shared_duration_bins (list): list of sets where each set + contains duration lower bounds whose audio id sets are the + same. The rationale for having this is that there are a few + but extremely long audio files which lead to a lot of bins. + When the id sets do not change across various minimum duration + boundaries, we + cluster these together and make them point to the same id set + reference. + + :return: whether the records were read from the disk. The assumption is + that the audio index file on disk and the actual audio files + are constructed once and never change during training. We only + re-read when either the directory or the index file path change. + """ + if self.audio_dir == audio_dir and self.idx_fname == idx_fname and \ + self.bin_size == bin_size: + # The audio directory and/or the list of audio files + # haven't changed. No need to load the list again. 
+ return False + + # Remember where the audio index is most recently read from. + self.audio_dir = audio_dir + self.idx_fname = idx_fname + self.bin_size = bin_size + + # Read in the idx and compute the number of bins necessary + self.clear() + rank = [] + min_duration = float('inf') + max_duration = float('-inf') + for idx, duration, file_size, relpath, tags in \ + stream_audio_index(idx_fname): + self.idx_to_record[idx] = (duration, file_size, relpath, tags) + max_duration = max(max_duration, duration) + min_duration = min(min_duration, duration) + rank.append((duration, idx)) + for tag in tags: + self.tag_to_id_set[tag].add(idx) + if len(rank) == 0: + # file is empty + raise IOError("Index file {} is empty".format(idx_fname)) + for tag in self.tag_to_id_set: + self.id_set_complete |= self.tag_to_id_set[tag] + dur = min_duration + self.min_duration = min_duration + while dur < max_duration + bin_size: + self.duration_bins.append(dur) + dur += bin_size + + # Sort in decreasing order of duration and populate + # the cumulative indices lists. + rank.sort(reverse=True) + + # These are indices for `rank` and used to keep track of whether + # there are new records to add in the current bin. + last = 0 + cur = 0 + + # The set of audios falling in the previous bin; in the case, + # where we don't find new audios for the current bin, we store + # the reference to the last set so as to conserve memory. + # This is not such a big problem if the audio duration is + # bounded by a small number like 30 seconds and the + # bin size is big enough. But, for raw freesound audios, + # some audios can be as long as a few hours! + last_audio_set = set() + + # The same but for each tag so that we can pick audios based on + # tags and also some user-specified tag distribution. + last_audio_set_per_tag = defaultdict(lambda: set()) + + # Set of lists of bins sharing the same audio sets. + shared = set() + + for i in range(len(self.duration_bins) - 1, -1, -1): + lower_bound = self.duration_bins[i] + new_audio_idxs = set() + new_audio_idxs_per_tag = defaultdict(lambda: set()) + while cur < len(rank) and rank[cur][0] >= lower_bound: + idx = rank[cur][1] + tags = self.idx_to_record[idx][3] + new_audio_idxs.add(idx) + for tag in tags: + new_audio_idxs_per_tag[tag].add(idx) + cur += 1 + # This makes certain that the same list is shared across + # different bins if no new indices are added. + if cur == last: + shared.add(lower_bound) + else: + last_audio_set = last_audio_set | new_audio_idxs + for tag in new_audio_idxs_per_tag: + last_audio_set_per_tag[tag] = \ + last_audio_set_per_tag[tag] | \ + new_audio_idxs_per_tag[tag] + if len(shared) > 0: + self.shared_duration_bins.append(shared) + shared = set([lower_bound]) + ### last_audio_set = set() should set blank + last = cur + self.duration_to_id_set[lower_bound] = last_audio_set + for tag in last_audio_set_per_tag: + self.duration_to_id_set_per_tag[lower_bound][tag] = \ + last_audio_set_per_tag[tag] + + # The last `shared` record isn't added to the `shared_duration_bins`. + self.shared_duration_bins.append(shared) + + # We make sure that the while loop above has exhausted through the + # `rank` list by checking if the `cur`rent index in `rank` equals + # the length of the array, which is the halting condition. + assert cur == len(rank) + + return True + + def _build_index_from_records(self, tag_list): + """ Uses the in-memory records read from the index file to build + an in-memory index restricted to the given tag list. 
+ + :param tag_list: List of tags we are interested in sampling from. + :type tag_list: list(str) + + Updates: + self.id_set (set): the set of all audio id's that can be sampled. + self.duration_to_list (dict): maps from the duration lower bound + to the id's of audios longer than this duration. + self.duration_to_list_per_tag (dict): maps from the tag to + the same structure as self.duration_to_list. This is to support + sampling from a target noise distribution. + + :return: whether the index was built from scratch + """ + if self.tags == tag_list: + return False + + self.tags = tag_list + if len(tag_list) == 0: + self.id_set = self.id_set_complete + else: + self.id_set = set() + for tag in tag_list: + self.id_set |= self.tag_to_id_set[tag] + + # Next, we need to take a subset of the audio files + for shared in self.shared_duration_bins: + # All bins in `shared' have the same index lists + # so we can intersect once and set all of them to this list. + lb = list(shared)[0] + intersected = list(self.id_set & self.duration_to_id_set[lb]) + duration_to_id_set = self.duration_to_id_set_per_tag[lb] + intersected_per_tag = { + tag: self.tag_to_id_set[tag] & duration_to_id_set[tag] + for tag in duration_to_id_set + } + for bin_key in shared: + self.duration_to_list[bin_key] = intersected + for tag in intersected_per_tag: + self.duration_to_list_per_tag[tag][bin_key] = \ + intersected_per_tag[tag] + assert len(self.duration_to_list) == len(self.duration_to_id_set) + return True + + def refresh_records_from_index_file(self, + audio_dir, + idx_fname, + tag_list, + bin_size=2.0): + """ Loads the index file and populates the records + for building the internal index. + + If the audio directory or index file name has changed, the whole index + is reloaded from scratch. If only the tag_list is changed, then the + desired index is built from the complete, in-memory record. + + :param audio_dir: audio directory + :type audio_dir: basestring + :param idx_fname: audio index file name + :type idex_fname: basestring + :param tag_list: list of tags we are interested in loading; + if empty, we load all. + :type tag_list: list + :param bin_size: optional argument for controlling the granularity + of duration bins + :type bin_size: float + """ + if tag_list is None: + tag_list = [] + reloaded_records = self._load_all_records_from_disk(audio_dir, + idx_fname, bin_size) + if reloaded_records or self.tags != tag_list: + self._build_index_from_records(tag_list) + logger.info('loaded {} audio files from {}' + .format(len(self.id_set), idx_fname)) + + def sample_audio(self, duration, rng=None, distr=None): + """ Uniformly draws an audio record of at least the desired duration + + :param duration: minimum desired audio duration + :type duration: float + :param rng: random number generator + :type rng: random.Random + :param distr: target distribution of audio tags. If not provided, + :type distr: dict + all audio files are sampled uniformly at random. + + :returns: success, (duration, file_size, path) + """ + if duration < 0.0: + duration = self.min_duration + i = bisect.bisect_left(self.duration_bins, duration) + if i == len(self.duration_bins): + return False, None + bin_key = self.duration_bins[i] + if distr is None: + indices = self.duration_to_list[bin_key] + else: + # If a desired audio distribution is given, we sample from it. 
+                if rng is None:
+                    rng = random.Random()
+                nprng = np.random.RandomState(rng.getrandbits(32))
+                # dict.values() returns a list; convert to an ndarray so the
+                # in-place division below is valid.
+                prob_masses = np.array(distr.values(), dtype=float)
+                prob_masses /= np.sum(prob_masses)
+                tag = nprng.choice(distr.keys(), p=prob_masses)
+                indices = self.duration_to_list_per_tag[tag][bin_key]
+        if len(indices) == 0:
+            return False, None
+        else:
+            if rng is None:
+                rng = random.Random()
+            # duration, file size and relative path from root
+            s = self.idx_to_record[rng.sample(indices, 1)[0]]
+            s = (s[0], s[1], os.path.join(self.audio_dir, s[2]))
+            return True, s
diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py
index abe1a0ec..c0a70ad1 100755
--- a/data_utils/augmentor/augmentation.py
+++ b/data_utils/augmentor/augmentation.py
@@ -6,6 +6,11 @@ from __future__ import print_function
 import json
 import random
 from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
+from data_utils.augmentor.resampler import ResamplerAugmentor
+from data_utils.augmentor.speed_perturb import SpeedPerturbationAugmentor
+from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor
+from data_utils.augmentor.implus_response import ImpulseResponseAugmentor
+from data_utils.augmentor.noise_speech import NoiseSpeechAugmentor
 
 
 class AugmentationPipeline(object):
@@ -76,5 +81,15 @@ class AugmentationPipeline(object):
         """Return an augmentation model by the type name, and pass in params."""
         if augmentor_type == "volume":
             return VolumePerturbAugmentor(self._rng, **params)
+        if augmentor_type == "resample":
+            return ResamplerAugmentor(self._rng, **params)
+        if augmentor_type == "speed":
+            return SpeedPerturbationAugmentor(self._rng, **params)
+        if augmentor_type == "online_bayesian_normalization":
+            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
+        if augmentor_type == "impulse_response":
+            return ImpulseResponseAugmentor(self._rng, **params)
+        if augmentor_type == "noise_speech":
+            return NoiseSpeechAugmentor(self._rng, **params)
         else:
             raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
diff --git a/data_utils/augmentor/implus_response.py b/data_utils/augmentor/implus_response.py
new file mode 100755
index 00000000..cc205342
--- /dev/null
+++ b/data_utils/augmentor/implus_response.py
@@ -0,0 +1,76 @@
+""" Impulse response"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from . import base
+from . import audio_database
+from data_utils.speech import SpeechSegment
+
+
+class ImpulseResponseAugmentor(base.AugmentorBase):
+    """ Instantiates an impulse response model.
+
+    :param ir_dir: directory containing impulse responses
+    :type ir_dir: basestring
+    :param index_file: index file of the impulse responses
+    :type index_file: basestring
+    :param tags: optional parameter for specifying what
+        particular impulse responses to apply.
+    :type tags: list
+    :param tag_distr: optional noise distribution
+    :type tag_distr: dict
+    """
+
+    def __init__(self, rng, ir_dir, index_file, tags=None, tag_distr=None):
+        # Define all required parameter maps here.
+        self.ir_dir = ir_dir
+        self.index_file = index_file
+
+        self.tags = tags
+        self.tag_distr = tag_distr
+
+        self.audio_index = audio_database.AudioIndex()
+        self.rng = rng
+
+    def _init_data(self):
+        """ Preloads data from disk (e.g. the list of files) in an attempt
+        to make later loading faster. If the data configuration remains the
+        same, this function does nothing.
+ + """ + self.audio_index.refresh_records_from_index_file( + self.ir_dir, self.index_file, self.tags) + + def transform_audio(self, audio_segment): + """ Convolves the input audio with an impulse response. + + :param audio_segment: input audio + :type audio_segment: AudioSegemnt + """ + # This handles the cases where the data source or directories change. + self._init_data() + + read_size = 0 + tag_distr = self.tag_distr + if not self.audio_index.has_audio(tag_distr): + if tag_distr is None: + if not self.tags: + raise RuntimeError("The ir index does not have audio " + "files to sample from.") + else: + raise RuntimeError("The ir index does not have audio " + "files of the given tags to sample " + "from.") + else: + raise RuntimeError("The ir index does not have audio " + "files to match the target ir " + "distribution.") + else: + # Querying with a negative duration triggers the index to search + # from all impulse responses. + success, record = self.audio_index.sample_audio( + -1.0, rng=self.rng, distr=tag_distr) + if success is True: + _, read_size, ir_fname = record + ir_wav = SpeechSegment.from_file(ir_fname) + audio_segment.convolve(ir_wav, allow_resampling=True) diff --git a/data_utils/augmentor/noise_speech.py b/data_utils/augmentor/noise_speech.py new file mode 100755 index 00000000..8cf7c27b --- /dev/null +++ b/data_utils/augmentor/noise_speech.py @@ -0,0 +1,318 @@ +""" noise speech +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import logging +import numpy as np +import os +from collections import defaultdict + +from . import base +from . import audio_database +from data_utils.speech import SpeechSegment + +TURK = "turk" +USE_AUDIO_DATABASE_SOURCES = frozenset(["freesound", "chime"]) +HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0 +FIND_NOISE_MAX_ATTEMPTS = 20 + +logger = logging.getLogger(__name__) + + +def get_first_smaller(items, value): + index = bisect.bisect_left(items, value) - 1 + assert items[index] < value, \ + 'get_first_smaller failed! %d %d' % (items[index], value) + return items[index] + + +def get_first_larger(items, value): + 'Find leftmost value greater than value' + index = bisect.bisect_right(items, value) + assert index < len(items), \ + "no noise bin exists for this audio length (%f)" % value + assert items[index] > value, \ + 'get_first_larger failed! %d %d' % (items[index], value) + return items[index] + + +def _get_turk_noise_files(noise_dir, index_file): + """ Creates a map from duration => a list of noise filenames + + :param noise_dir: Directory of noise files which contains + "noise-samples-list" + :type noise_dir: basestring + :param index_file: Noise list + :type index_file: basestring + + returns:noise_files (defaultdict): A map of bins to noise files. + Each key is the duration, and the value is a list of noise + files binned to this duration. Each bin is 2 secs. 
+ + Note: noise-samples-list should contain one line per noise (wav) file + along with its duration in milliseconds + """ + noise_files = defaultdict(list) + if not os.path.exists(index_file): + logger.error('No noise files were found at {}'.format(index_file)) + return noise_files + num_noise_files = 0 + rounded_durations = list(range(0, 65, 2)) + with open(index_file, 'r') as fl: + for line in fl: + fname = os.path.join(noise_dir, line.strip().split()[0]) + duration = float(line.strip().split()[1]) / 1000 + # bin the noise files into length bins rounded by 2 sec + bin_id = get_first_smaller(rounded_durations, duration) + noise_files[bin_id].append(fname) + num_noise_files += 1 + logger.info('Loaded {} turk noise files'.format(num_noise_files)) + return noise_files + + +class NoiseSpeechAugmentor(base.AugmentorBase): + """ Noise addition block + + :param snr_min: minimum signal-to-noise ratio + :type snr_min: float + :param snr_max: maximum signal-to-noise ratio + :type snr_max: float + :param noise_dir: root of where noise files are stored + :type noise_fir: basestring + :param index_file: index of noises of interest in noise_dir + :type index_file: basestring + :param source: select one from + - turk + - freesound + - chime + Note that this field is no longer required for the freesound + and chime + :type source: string + :param tags: optional parameter for specifying what + particular noises we want to add. See above for the available tags. + :type tags: list + :param tag_distr: optional noise distribution + :type tag_distr: dict + """ + + def __init__(self, + rng, + snr_min, + snr_max, + noise_dir, + source, + allow_downsampling=None, + index_file=None, + tags=None, + tag_distr=None): + # Define all required parameter maps here. + self.rng = rng + self.snr_min = snr_min + self.snr_max = snr_max + self.noise_dir = noise_dir + self.source = source + + self.allow_downsampling = allow_downsampling + self.index_file = index_file + self.tags = tags + self.tag_distr = tag_distr + + # When new noise sources are added, make sure to define the + # associated bookkeeping variables here. + self.turk_noise_files = [] + self.turk_noise_dir = None + self.audio_index = audio_database.AudioIndex() + + def _init_data(self): + """ Preloads stuff from disk in an attempt (e.g. list of files, etc) + to make later loading faster. If the data configuration remains the + same, this function does nothing. 
+
+        """
+        noise_dir = self.noise_dir
+        index_file = self.index_file
+        source = self.source
+        if not index_file:
+            if source == TURK:
+                index_file = os.path.join(noise_dir, 'noise-samples-list')
+                logger.debug("index_file not provided; "
+                             "defaulting to " + index_file)
+            else:
+                if source != "":
+                    assert source in USE_AUDIO_DATABASE_SOURCES, \
+                        "{} not supported by audio_database".format(source)
+                index_file = os.path.join(noise_dir,
+                                          "audio_index_commercial.txt")
+                logger.debug("index_file not provided; "
+                             "defaulting to " + index_file)
+
+        if source == TURK:
+            if self.turk_noise_dir != noise_dir:
+                self.turk_noise_dir = noise_dir
+                self.turk_noise_files = _get_turk_noise_files(noise_dir,
+                                                              index_file)
+        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
+        else:
+            if source != "":
+                assert source in USE_AUDIO_DATABASE_SOURCES, \
+                    "{} not supported by audio_database".format(source)
+            self.audio_index.refresh_records_from_index_file(
+                self.noise_dir, index_file, self.tags)
+
+    def transform_audio(self, audio_segment):
+        """Adds walla noise.
+
+        :param audio_segment: Input audio
+        :type audio_segment: SpeechSegment
+        """
+        # This handles the cases where the data source or directories change.
+        self._init_data()
+        source = self.source
+        allow_downsampling = self.allow_downsampling
+        if source == TURK:
+            self._add_turk_noise(audio_segment, allow_downsampling)
+        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
+        else:
+            self._add_noise(audio_segment, allow_downsampling)
+
+    def _sample_snr(self):
+        """ Returns a float sampled in [`self.snr_min`, `self.snr_max`]
+        if both `self.snr_min` and `self.snr_max` are non-zero.
+        """
+        snr_min = self.snr_min
+        snr_max = self.snr_max
+        sampled_snr = self.rng.uniform(snr_min, snr_max)
+        return sampled_snr
+
+    def _add_turk_noise(self, audio_segment, allow_downsampling):
+        """ Adds a turk noise to the input audio.
+
+        :param audio_segment: input audio
+        :type audio_segment: SpeechSegment
+        :param allow_downsampling: indicates whether downsampling
+            is allowed
+        :type allow_downsampling: boolean
+        """
+        read_size = 0
+        if len(self.turk_noise_files) > 0:
+            snr = self._sample_snr()
+            # Draw the noise file randomly from noise files that are
+            # slightly longer than the utterance
+            noise_bins = sorted(self.turk_noise_files.keys())
+            # note some bins can be empty, so we can't just round up
+            # to the nearest 2-sec interval
+            rounded_duration = get_first_larger(noise_bins,
+                                                audio_segment.duration)
+            noise_fname = \
+                self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0]
+            # SpeechSegment requires a transcript; noise has none.
+            noise = SpeechSegment.from_file(noise_fname, "")
+            logger.debug('noise_fname {}'.format(noise_fname))
+            logger.debug('snr {}'.format(snr))
+            read_size = len(noise) * 2
+            # May throw exceptions, but this is caught by
+            # AudioFeaturizer.get_audio_files.
+            audio_segment.add_noise(
+                noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
+
+    def _add_noise(self, audio_segment, allow_downsampling):
+        """ Adds a noise indexed in audio_database.AudioIndex.
+
+        :param audio_segment: input audio
+        :type audio_segment: SpeechSegment
+        :param allow_downsampling: indicates whether downsampling
+            is allowed
+        :type allow_downsampling: boolean
+
+        Returns:
+            None. The noise is added to ``audio_segment`` in place.
+        """
+        read_size = 0
+        tag_distr = self.tag_distr
+        if not self.audio_index.has_audio(tag_distr):
+            if tag_distr is None:
+                if not self.tags:
+                    raise RuntimeError("The noise index does not have audio "
+                                       "files to sample from.")
+                else:
+                    raise RuntimeError("The noise index does not have audio "
+                                       "files of the given tags to sample "
+                                       "from.")
+            else:
+                raise RuntimeError("The noise index does not have audio "
+                                   "files to match the target noise "
+                                   "distribution.")
+        else:
+            # Compute audio segment related statistics
+            audio_duration = audio_segment.duration
+
+            # Sample relevant augmentation parameters.
+            snr = self._sample_snr()
+
+            # Perhaps, we may not have a sufficiently long noise, so we need
+            # to search iteratively.
+            min_duration = audio_duration + 0.25
+            for _ in range(FIND_NOISE_MAX_ATTEMPTS):
+                logger.debug("attempting to find noise of length "
+                             "at least {}".format(min_duration))
+
+                success, record = \
+                    self.audio_index.sample_audio(min_duration,
+                                                  rng=self.rng,
+                                                  distr=tag_distr)
+
+                if success is True:
+                    noise_duration, read_size, noise_fname = record
+
+                    # Assert after logging so we know
+                    # what caused augmentation to fail.
+                    logger.debug("noise_fname {}".format(noise_fname))
+                    logger.debug("snr {}".format(snr))
+                    assert noise_duration >= min_duration
+                    break
+
+                # Decrease the desired minimum duration linearly.
+                # If the value becomes smaller than some threshold,
+                # we half the value instead.
+                if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD:
+                    min_duration -= 2.0
+                else:
+                    min_duration *= 0.5
+
+            if success is False:
+                logger.info("Failed to find a noise file")
+                return
+
+            diff_duration = audio_duration + 0.25 - noise_duration
+            if diff_duration >= 0.0:
+                # Here, the noise is shorter than the audio file, so
+                # we pad with zeros to make sure the noise sound is applied
+                # with a uniformly random shift.
+                # SpeechSegment requires a transcript; noise has none.
+                noise = SpeechSegment.from_file(noise_fname, "")
+                # pad_silence() works in place, so no reassignment is needed.
+                noise.pad_silence(diff_duration, sides="both")
+            else:
+                # The noise clip is at least ~25 ms longer than the audio
+                # segment here.
+                diff_duration = int(noise_duration * audio_segment.sample_rate) - \
+                    int(audio_duration * audio_segment.sample_rate) - \
+                    int(0.02 * audio_segment.sample_rate)
+                start = float(self.rng.randint(0, diff_duration)) / \
+                    audio_segment.sample_rate
+                finish = min(start + audio_duration + 0.2, noise_duration)
+                noise = SpeechSegment.slice_from_file(noise_fname, start,
+                                                      finish)
+
+            if len(noise) < len(audio_segment):
+                # This is to ensure that the noise clip is at least as
+                # long as the audio segment.
+                num_samples_to_pad = len(audio_segment) - len(noise)
+                # Padding this amount of silence on both ends ensures that
+                # the placement of the noise clip is uniformly random.
diff --git a/data_utils/augmentor/online_bayesian_normalization.py b/data_utils/augmentor/online_bayesian_normalization.py
new file mode 100755
index 00000000..bc2d6c1b
--- /dev/null
+++ b/data_utils/augmentor/online_bayesian_normalization.py
@@ -0,0 +1,57 @@
+""" Online Bayesian normalization
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from . import base
+
+
+class OnlineBayesianNormalizationAugmentor(base.AugmentorBase):
+    """
+    Instantiates an online Bayesian normalization module.
+
+    :param target_db: Target RMS value in decibels
+    :type target_db: func[int->scalar]
+    :param prior_db: Prior RMS estimate in decibels
+    :type prior_db: func[int->scalar]
+    :param prior_samples: Prior strength in number of samples
+    :type prior_samples: func[int->scalar]
+    :param startup_delay: Start-up delay in seconds during
+                          which normalization statistics are accrued.
+    :type startup_delay: func[int->scalar]
+    """
+
+    def __init__(self,
+                 rng,
+                 target_db,
+                 prior_db,
+                 prior_samples,
+                 startup_delay=base.parse_parameter_from(0.0)):
+
+        self.target_db = target_db
+        self.prior_db = prior_db
+        self.prior_samples = prior_samples
+        self.startup_delay = startup_delay
+        self.rng = rng
+
+    def transform_audio(self, audio_segment, iteration):
+        """
+        Normalizes the input audio using the online Bayesian approach.
+
+        :param audio_segment: input audio
+        :type audio_segment: SpeechSegment
+        :param iteration: current iteration
+        :type iteration: int
+        """
+        target_db = self.target_db(iteration)
+        prior_db = self.prior_db(iteration)
+        prior_samples = self.prior_samples(iteration)
+        startup_delay = self.startup_delay(iteration)
+        audio_segment.normalize_online_bayesian(
+            target_db, prior_db, prior_samples, startup_delay=startup_delay)
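The heavy lifting happens in `AudioSegment.normalize_online_bayesian` (see the audio.py diff later in this series): a running mean-square estimate is smoothed by a gamma prior worth `prior_samples` pseudo-observations at `prior_db`. A rough numpy restatement of the per-sample gain, with names of my choosing and the startup-delay handling omitted:

```
import numpy as np


def online_bayesian_gain_db(samples, target_db, prior_db, prior_samples):
    """Per-sample gain (dB) from a prior-smoothed running RMS estimate."""
    # The prior acts like prior_samples pseudo-observations whose mean
    # square corresponds to an RMS of prior_db.
    prior_mean_sq = 10.0 ** (prior_db / 10.0)
    sample_count = np.arange(1, len(samples) + 1)
    cumsum_sq = np.cumsum(samples ** 2)
    mean_sq = ((cumsum_sq + prior_mean_sq * prior_samples) /
               (sample_count + prior_samples))
    rms_estimate_db = 10.0 * np.log10(mean_sq)
    return target_db - rms_estimate_db


x = 0.1 * np.random.RandomState(1).randn(8000)
gain_db = online_bayesian_gain_db(x, target_db=-20.0, prior_db=-25.0,
                                  prior_samples=100.0)
normalized = x * 10.0 ** (gain_db / 20.0)
```

Because the estimate is causal, the gain at each sample depends only on past samples plus the prior, which is what makes the scheme production-compatible.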
diff --git a/data_utils/augmentor/resampler.py b/data_utils/augmentor/resampler.py
new file mode 100755
index 00000000..1b959be5
--- /dev/null
+++ b/data_utils/augmentor/resampler.py
@@ -0,0 +1,30 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from . import base
+
+
+class ResamplerAugmentor(base.AugmentorBase):
+    """ Instantiates a resampler module.
+
+    :param new_sample_rate: New sample rate in Hz
+    :type new_sample_rate: func[int->scalar]
+    :param rng: Random generator object.
+    :type rng: random.Random
+    """
+
+    def __init__(self, rng, new_sample_rate):
+        self.new_sample_rate = new_sample_rate
+        self._rng = rng
+
+    def transform_audio(self, audio_segment, iteration):
+        """ Resamples the input audio to the target sample rate.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: input audio
+        :type audio_segment: AudioSegment
+        :param iteration: current iteration
+        :type iteration: int
+        """
+        new_sample_rate = self.new_sample_rate(iteration)
+        audio_segment.resample(new_sample_rate)
\ No newline at end of file
diff --git a/data_utils/augmentor/speed_perturb.py b/data_utils/augmentor/speed_perturb.py
new file mode 100755
index 00000000..e09be5f7
--- /dev/null
+++ b/data_utils/augmentor/speed_perturb.py
@@ -0,0 +1,53 @@
+"""Speed perturbation module for making ASR robust to different voice
+types (high pitched, low pitched, etc).
+Samples uniformly between speed_min and speed_max.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from . import base
+
+
+class SpeedPerturbatioAugmentor(base.AugmentorBase):
+    """
+    Instantiates a speed perturbation module.
+
+    See reference paper here:
+
+    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
+
+    :param speed_min: Lower bound on new rate to sample
+    :type speed_min: func[int->scalar]
+    :param speed_max: Upper bound on new rate to sample
+    :type speed_max: func[int->scalar]
+    """
+
+    def __init__(self, rng, speed_min, speed_max):
+        self.speed_min = speed_min
+        self.speed_max = speed_max
+        self.rng = rng
+
+    def transform_audio(self, audio_segment, iteration):
+        """
+        Samples a new speed rate from the given range and
+        changes the speed of the given audio clip.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: input audio
+        :type audio_segment: AudioSegment
+        :param iteration: current iteration
+        :type iteration: int
+        :raises ValueError: If the sampled range allows speeds below 0.9
+                            or above 1.1, which can cause unnatural effects
+        """
+        speed_min = self.speed_min(iteration)
+        speed_max = self.speed_max(iteration)
+        if speed_min < 0.9:
+            raise ValueError(
+                "Sampling speed below 0.9 can cause unnatural effects")
+        if speed_max > 1.1:
+            raise ValueError(
+                "Sampling speed above 1.1 can cause unnatural effects")
+        sampled_speed = self.rng.uniform(speed_min, speed_max)
+        audio_segment.change_speed(sampled_speed)
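`change_speed` itself is not defined anywhere in this patch series. One crude but common stand-in simply re-interpolates the samples onto a shorter or longer index grid, which changes duration (and, unlike the Povey et al. resampling approach, also shifts pitch). A hedged sketch, not the project's implementation:

```
import numpy as np


def change_speed(samples, speed_rate):
    """Naive speed change by linear interpolation over a new index grid.
    Rates above 1.0 shorten the clip; rates below 1.0 lengthen it.
    """
    if speed_rate <= 0:
        raise ValueError("speed_rate should be positive.")
    old_length = len(samples)
    new_length = int(old_length / speed_rate)
    old_indices = np.arange(old_length)
    new_indices = np.linspace(0, old_length - 1, new_length)
    return np.interp(new_indices, old_indices, samples)


rng = np.random.RandomState(2)
audio = rng.randn(16000)
perturbed = change_speed(audio, rng.uniform(0.9, 1.1))
```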
diff --git a/data_utils/augmentor/volume_perturb.py b/data_utils/augmentor/volume_perturb.py
index a5a9f6ca..15055b91 100755
--- a/data_utils/augmentor/volume_perturb.py
+++ b/data_utils/augmentor/volume_perturb.py
@@ -3,10 +3,10 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from data_utils.augmentor.base import AugmentorBase
+from . import base
 
 
-class VolumePerturbAugmentor(AugmentorBase):
+class VolumePerturbAugmentor(base.AugmentorBase):
     """Augmentation model for adding random volume perturbation.
 
     This is used for multi-loudness training of PCEN. See

From ed5f04afb86e7285cdd2d9d36dbf4b63431b5968 Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Thu, 15 Jun 2017 17:05:00 +0800
Subject: [PATCH 26/55] Add shuffle type of instance_shuffle and batch_shuffle_clipped.

---
 data_utils/data.py                  | 50 ++++++++++++++++++++++-------
 datasets/librispeech/librispeech.py |  3 +-
 decoder.py                          |  6 ++--
 infer.py                            | 11 +++----
 train.py                            | 16 ++++++---
 utils.py                            | 25 +++++++++++++++
 6 files changed, 82 insertions(+), 29 deletions(-)
 create mode 100644 utils.py

diff --git a/data_utils/data.py b/data_utils/data.py
index 48e03fe8..424343a4 100644
--- a/data_utils/data.py
+++ b/data_utils/data.py
@@ -80,7 +80,7 @@
                              padding_to=-1,
                              flatten=False,
                              sortagrad=False,
-                             batch_shuffle=False):
+                             shuffle_method="batch_shuffle"):
         """
         Batch data reader creator for audio data. Return a callable generator
         function to produce batches of data.
@@ -104,12 +104,22 @@
         :param sortagrad: If set True, sort the instances by audio duration
                           in the first epoch to speed up training.
         :type sortagrad: bool
-        :param batch_shuffle: If set True, instances are batch-wise shuffled.
-                              For more details, please see
-                              ``_batch_shuffle.__doc__``.
-                              If sortagrad is True, batch_shuffle is disabled
+        :param shuffle_method: Shuffle method. Options:
+                                 '' or None: no shuffle.
+                                 'instance_shuffle': instance-wise shuffle.
+                                 'batch_shuffle': similarly-sized instances are
+                                                  put into batches, and then
+                                                  batch-wise shuffle the batches.
+                                                  For more details, please see
+                                                  ``_batch_shuffle.__doc__``.
+                                 'batch_shuffle_clipped': 'batch_shuffle' with
+                                                          head shift and tail
+                                                          clipping. For more
+                                                          details, please see
+                                                          ``_batch_shuffle``.
+                               If sortagrad is True, shuffle is disabled
                                for the first epoch.
-        :type batch_shuffle: bool
+        :type shuffle_method: None|str
         :return: Batch reader function, producing batches of data when called.
         :rtype: callable
         """
@@ -123,8 +133,20 @@
             # sort (by duration) or batch-wise shuffle the manifest
             if self._epoch == 0 and sortagrad:
                 manifest.sort(key=lambda x: x["duration"])
-            elif batch_shuffle:
-                manifest = self._batch_shuffle(manifest, batch_size)
+            else:
+                if shuffle_method == "batch_shuffle":
+                    manifest = self._batch_shuffle(
+                        manifest, batch_size, clipped=False)
+                elif shuffle_method == "batch_shuffle_clipped":
+                    manifest = self._batch_shuffle(
+                        manifest, batch_size, clipped=True)
+                elif shuffle_method == "instance_shuffle":
+                    self._rng.shuffle(manifest)
+                elif not shuffle_method:
+                    pass
+                else:
+                    raise ValueError("Unknown shuffle method %s." %
+                                     shuffle_method)
             # prepare batches
             instance_reader = self._instance_reader_creator(manifest)
             batch = []
@@ -218,7 +240,7 @@
             new_batch.append((padded_audio, text))
         return new_batch
 
-    def _batch_shuffle(self, manifest, batch_size):
+    def _batch_shuffle(self, manifest, batch_size, clipped=False):
         """Put similarly-sized instances into minibatches for better efficiency
         and make a batch-wise shuffle.
 
@@ -233,6 +255,9 @@
         :param batch_size: Batch size. This size is also used for generating
                            a random number for batch shuffle.
         :type batch_size: int
+        :param clipped: Whether to clip the heading (small shift) and trailing
+                        (incomplete batch) instances.
+        :type clipped: bool
         :return: Batch shuffled manifest.
         :rtype: list
         """
@@ -241,7 +266,8 @@
         batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
         self._rng.shuffle(batch_manifest)
         batch_manifest = list(sum(batch_manifest, ()))
-        res_len = len(manifest) - shift_len - len(batch_manifest)
-        batch_manifest.extend(manifest[-res_len:])
-        batch_manifest.extend(manifest[0:shift_len])
+        if not clipped:
+            res_len = len(manifest) - shift_len - len(batch_manifest)
+            batch_manifest.extend(manifest[-res_len:])
+            batch_manifest.extend(manifest[0:shift_len])
         return batch_manifest
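Re-expressed without the `zip(*[iter(...)])` trick, `_batch_shuffle` behaves as follows. This is a behavioral sketch of the method above, not the class method itself:

```
import random


def batch_shuffle(manifest, batch_size, clipped=False, rng=None):
    """Sort by duration, shift by a random offset, batch, shuffle batches."""
    rng = rng or random.Random()
    manifest = sorted(manifest, key=lambda x: x["duration"])
    # A random head shift makes batch boundaries differ between epochs.
    shift_len = rng.randint(0, batch_size - 1)
    body = manifest[shift_len:]
    num_full = len(body) // batch_size
    batches = [body[i * batch_size:(i + 1) * batch_size]
               for i in range(num_full)]
    rng.shuffle(batches)
    result = [instance for batch in batches for instance in batch]
    if not clipped:
        # Keep the incomplete tail batch and the shifted head instances.
        result.extend(body[num_full * batch_size:])
        result.extend(manifest[:shift_len])
    return result


manifest = [{"duration": d} for d in [1.2, 3.4, 0.8, 2.5, 5.0, 1.9, 2.2]]
print(batch_shuffle(manifest, batch_size=2, rng=random.Random(0)))
```

Sorting first keeps each minibatch roughly uniform in length (less padding waste), while shuffling at the batch level still randomizes training order.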
diff --git a/datasets/librispeech/librispeech.py b/datasets/librispeech/librispeech.py
index faf038cc..87e52ae4 100644
--- a/datasets/librispeech/librispeech.py
+++ b/datasets/librispeech/librispeech.py
@@ -37,8 +37,7 @@
 MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
 MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
 MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
 
-parser = argparse.ArgumentParser(
-    description='Downloads and prepare LibriSpeech dataset.')
+parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--target_dir",
     default=DATA_HOME + "/Libri",
diff --git a/decoder.py b/decoder.py
index 8314885c..77d950b8 100644
--- a/decoder.py
+++ b/decoder.py
@@ -8,8 +8,7 @@
 from itertools import groupby
 
 
 def ctc_best_path_decode(probs_seq, vocabulary):
-    """
-    Best path decoding, also called argmax decoding or greedy decoding.
+    """Best path decoding, also called argmax decoding or greedy decoding.
     The path consisting of the most probable tokens is further post-processed
     to remove consecutive repetitions and all blanks.
 
@@ -38,8 +37,7 @@
 
 
 def ctc_decode(probs_seq, vocabulary, method):
-    """
-    CTC-like sequence decoding from a sequence of likelihood probablilites.
+    """CTC-like sequence decoding from a sequence of likelihood probabilities.
 
     :param probs_seq: 2-D list of probabilities over the vocabulary for each
                       character. Each element is a list of float probabilities
diff --git a/infer.py b/infer.py
index f7c99df1..06449ab0 100644
--- a/infer.py
+++ b/infer.py
@@ -10,9 +10,9 @@
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
 from model import deep_speech2
 from decoder import ctc_decode
+import utils
 
-parser = argparse.ArgumentParser(
-    description='Simplified version of DeepSpeech2 inference.')
+parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--num_samples",
     default=10,
@@ -62,9 +62,7 @@
 args = parser.parse_args()
 
 
 def infer():
-    """
-    Max-ctc-decoding for DeepSpeech2.
-    """
+    """Max-ctc-decoding for DeepSpeech2."""
     # initialize data generator
     data_generator = DataGenerator(
         vocab_filepath=args.vocab_filepath,
@@ -98,7 +96,7 @@
         manifest_path=args.decode_manifest_path,
         batch_size=args.num_samples,
         sortagrad=False,
-        batch_shuffle=False)
+        shuffle_method=None)
     infer_data = batch_reader().next()
 
     # run inference
@@ -123,6 +121,7 @@
 
 
 def main():
+    utils.print_arguments(args)
     paddle.init(use_gpu=args.use_gpu, trainer_count=1)
     infer()
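For reference, the greedy decode that `ctc_best_path_decode` implements can be sketched as a standalone function. This assumes the blank token occupies the last index, one past the end of the vocabulary, which is the usual convention; the repository's own decoder may differ in detail:

```
from itertools import groupby


def ctc_best_path_decode(probs_seq, vocabulary):
    """Frame-wise argmax, merge consecutive repeats, then drop blanks."""
    # Most probable token index at each time step.
    max_index_list = [max(range(len(probs)), key=probs.__getitem__)
                      for probs in probs_seq]
    # Merge repeats first, then remove the blank index.
    blank_index = len(vocabulary)
    index_list = [k for k, _ in groupby(max_index_list) if k != blank_index]
    return ''.join(vocabulary[i] for i in index_list)


vocab = ['a', 'b', 'c']
probs = [[0.1, 0.7, 0.1, 0.1],   # 'b'
         [0.1, 0.6, 0.2, 0.1],   # 'b' again: merged with previous frame
         [0.1, 0.1, 0.1, 0.7],   # blank: removed after merging
         [0.7, 0.1, 0.1, 0.1]]   # 'a'
print(ctc_best_path_decode(probs, vocab))  # -> "ba"
```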
diff --git a/train.py b/train.py
index 6074aa35..c60a039b 100644
--- a/train.py
+++ b/train.py
@@ -12,6 +12,7 @@
 import distutils.util
 import paddle.v2 as paddle
 from model import deep_speech2
 from data_utils.data import DataGenerator
+import utils
 
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
@@ -51,6 +52,12 @@
     default=True,
     type=distutils.util.strtobool,
     help="Use sortagrad or not. (default: %(default)s)")
+parser.add_argument(
+    "--shuffle_method",
+    default='instance_shuffle',
+    type=str,
+    help="Shuffle method: 'instance_shuffle', 'batch_shuffle', "
+    "'batch_shuffle_clipped'. (default: %(default)s)")
 parser.add_argument(
     "--trainer_count",
     default=4,
@@ -93,9 +100,7 @@
 args = parser.parse_args()
 
 
 def train():
-    """
-    DeepSpeech2 training.
-    """
+    """DeepSpeech2 training."""
     # initialize data generator
 
     def data_generator():
@@ -145,13 +150,13 @@
         batch_size=args.batch_size,
         min_batch_size=args.trainer_count,
         sortagrad=args.use_sortagrad if args.init_model_path is None else False,
-        batch_shuffle=True)
+        shuffle_method=args.shuffle_method)
     test_batch_reader = test_generator.batch_reader_creator(
         manifest_path=args.dev_manifest_path,
         batch_size=args.batch_size,
         min_batch_size=1,  # must be 1, but will have errors.
         sortagrad=False,
-        batch_shuffle=False)
+        shuffle_method=None)
 
     # create event handler
     def event_handler(event):
@@ -186,6 +191,7 @@
 
 
 def main():
+    utils.print_arguments(args)
     paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
     train()
 
diff --git a/utils.py b/utils.py
new file mode 100644
index 00000000..9ca363c8
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,25 @@
+"""Contains common utility functions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----- Configuration Arguments -----")
+    for arg, value in vars(args).iteritems():
+        print("%s: %s" % (arg, value))
+    print("------------------------------------")

From d66d740ea0f6002e2fb48f2b5a304f76205b2fdb Mon Sep 17 00:00:00 2001
From: chrisxu2016 <823254351@qq.com>
Date: Fri, 16 Jun 2017 15:17:43 +0800
Subject: [PATCH 27/55] add audio part

---
 data_utils/audio.py                                | 457 ++++++++----------
 data_utils/augmentor/audio_database.py             | 401 ---------------
 data_utils/augmentor/augmentation.py               |  15 -
 data_utils/augmentor/implus_response.py            |  76 ---
 data_utils/augmentor/noise_speech.py               | 318 ------------
 .../online_bayesian_normalization.py               |  57 ---
 data_utils/augmentor/resampler.py                  |  30 --
 data_utils/augmentor/speed_perturb.py              |  53 --
 data_utils/augmentor/volume_perturb.py             |   4 +-
 requirements.txt                                   |   2 +
 10 files changed, 215 insertions(+), 1198 deletions(-)
 delete mode 100755 data_utils/augmentor/audio_database.py
 delete mode 100755 data_utils/augmentor/implus_response.py
 delete mode 100755 data_utils/augmentor/noise_speech.py
 delete mode 100755 data_utils/augmentor/online_bayesian_normalization.py
 delete mode 100755 data_utils/augmentor/resampler.py
 delete mode 100755 data_utils/augmentor/speed_perturb.py

diff --git a/data_utils/audio.py b/data_utils/audio.py
index aef13c30..ee4e6d84 100755
--- a/data_utils/audio.py
+++ b/data_utils/audio.py
@@ -8,6 +8,7 @@
 import io
 import soundfile
 import scikits.samplerate
 from scipy import signal
+import random
 
 
 class AudioSegment(object):
@@ -46,6 +47,32 @@
         """Return whether two objects are unequal."""
         return not self.__eq__(other)
 
+    def __len__(self):
+        """Returns length of segment in samples."""
+        return self.num_samples
+
+    def __add__(self, other):
+        """Add samples from another segment
to those of this segment and return + a new segment (sample-wise addition, not segment concatenation). + + :param other: Segment containing samples to be + added in. + :type other: AudioSegment + :return: New segment containing resulting samples. + :rtype: AudioSegment + :raise TypeError: If sample rates of segments don't match, + or if length of segments don't match. + """ + if type(self) != type(other): + raise TypeError("Cannot add segment of different type: {}" + .format(type(other))) + if self._sample_rate != other._sample_rate: + raise TypeError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise TypeError("Segment lengths must match to add segments.") + samples = self.samples + other.samples + return type(self)(samples, sample_rate=self._sample_rate) + def __str__(self): """Return human-readable representation of segment.""" return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " @@ -64,69 +91,6 @@ class AudioSegment(object): samples, sample_rate = soundfile.read(file, dtype='float32') return cls(samples, sample_rate) - @classmethod - def slice_from_file(cls, fname, start=None, end=None): - """ - Loads a small section of an audio without having to load - the entire file into the memory which can be incredibly wasteful. - - :param fname: input audio file name - :type fname: bsaestring - :param start: start time in seconds (supported granularity is ms) - If start is negative, it wraps around from the end. If not - provided, this function reads from the very beginning. - :type start: float - :param end: start time in seconds (supported granularity is ms) - If end is negative, it wraps around from the end. If not - provided, the default behvaior is to read to the end of the - file. - :type end: float - - :return:the specified slice of input audio in the audio.AudioSegment - format. - """ - sndfile = soundfile.SoundFile(fname) - - sample_rate = sndfile.samplerate - if sndfile.channels != 1: - raise TypeError("{} has more than 1 channel.".format(fname)) - - duration = float(len(sndfile)) / sample_rate - - if start is None: - start = 0.0 - if end is None: - end = duration - - if start < 0.0: - start += duration - if end < 0.0: - end += duration - - if start < 0.0: - raise IndexError("The slice start position ({} s) is out of " - "bounds. Filename: {}".format(start, fname)) - if end < 0.0: - raise IndexError("The slice end position ({} s) is out of bounds " - "Filename: {}".format(end, fname)) - - if start > end: - raise IndexError("The slice start position ({} s) is later than " - "the slice end position ({} s)." - .format(start, end)) - - if end > duration: - raise ValueError("The slice end time ({} s) is out of " - "bounds (> {} s) Filename: {}" - .format(end, duration, fname)) - - start_frame = int(start * sample_rate) - end_frame = int(end * sample_rate) - sndfile.seek(start_frame) - data = sndfile.read(frames=end_frame - start_frame, dtype='float32') - - return cls(data, sample_rate) - @classmethod def from_bytes(cls, bytes): """Create audio segment from a byte string containing audio samples. @@ -140,43 +104,30 @@ class AudioSegment(object): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) - @classmethod - def make_silence(cls, duration, sample_rate): - """Creates a silent audio segment of the given duration and - sample rate. 
-
-        :param duration: length of silence in seconds
-        :type duration: scalar
-        :param sample_rate: sample rate
-        :type sample_rate: scalar
-        :returns: silence of the given duration
-        :rtype: AudioSegment
-        """
-        samples = np.zeros(int(float(duration) * sample_rate))
-        return cls(samples, sample_rate)
-
-    @classmethod
-    def concatenate(cls, *segments):
+    def concatenate(self, *segments):
         """Concatenate an arbitrary number of audio segments together.
 
-        :param *segments: input audio segments
-        :type *segments: [AudioSegment]
+        :param *segments: Input audio segments
+        :type *segments: AudioSegment
+        :return: Audio segment instance.
+        :rtype: AudioSegment
+        :raises ValueError: If the number of segments is zero, or if the
+                            sample rates of any two segments do not match
+        :raises TypeError: If any item in segments is not an AudioSegment
+                           instance
         """
         # Perform basic sanity-checks.
-        N = len(segments)
-        if N == 0:
+        if len(segments) == 0:
             raise ValueError("No audio segments are given to concatenate.")
         sample_rate = segments[0]._sample_rate
-        for segment in segments:
-            if sample_rate != segment._sample_rate:
+        for seg in segments:
+            if sample_rate != seg._sample_rate:
                 raise ValueError("Can't concatenate segments with "
                                  "different sample rates")
-            if type(segment) is not cls:
+            if type(seg) is not type(self):
                 raise TypeError("Only audio segments of the same type "
                                 "instance can be concatenated.")
-        samples = np.concatenate([seg.samples for seg in segments])
-        return cls(samples, sample_rate)
+        samples = np.concatenate([seg.samples for seg in segments])
+        return type(self)(samples, sample_rate)
 
     def to_wav_file(self, filepath, dtype='float32'):
         """Save audio segment to disk as wav file.
@@ -203,6 +154,65 @@
             format='WAV',
             subtype=subtype_map[dtype])
 
+    def slice_from_file(self, file, start=None, end=None):
+        """Loads a small section of an audio without having to load
+        the entire file into the memory which can be incredibly wasteful.
+
+        :param file: Input audio filepath
+        :type file: basestring
+        :param start: Start time in seconds. If start is negative, it wraps
+                      around from the end. If not provided, this function
+                      reads from the very beginning.
+        :type start: float
+        :param end: End time in seconds. If end is negative, it wraps around
+                    from the end. If not provided, the default behavior is
+                    to read to the end of the file.
+        :type end: float
+        :return: The specified slice of input audio in the audio.AudioSegment
+                 format.
+        :rtype: AudioSegment
+        :raises ValueError: If the slice positions are incorrectly specified,
+                            e.g. out of bounds in time.
+        """
+        sndfile = soundfile.SoundFile(file)
+        sample_rate = sndfile.samplerate
+        duration = float(len(sndfile)) / sample_rate
+        start = 0. if start is None else start
+        end = duration if end is None else end
+        if start < 0.0:
+            start += duration
+        if end < 0.0:
+            end += duration
+        if start < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds. Filename: %s" % (start, file))
+        if end < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "Filename: %s" % (end, file))
+        if start > end:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the slice end position (%f s)."
% (start, end)) + if end > duration: + raise ValueError("The slice end time (%f s) is out of bounds " + "(> %f s) Filename: %s" % (end, duration, file)) + start_frame = int(start * sample_rate) + end_frame = int(end * sample_rate) + sndfile.seek(start_frame) + data = sndfile.read(frames=end_frame - start_frame, dtype='float32') + return type(self)(data, sample_rate) + + def make_silence(self, duration, sample_rate): + """Creates a silent audio segment of the given duration and + sample rate. + + :param duration: Length of silence in seconds + :type duration: float + :param sample_rate: Sample rate + :type sample_rate: float + :return: Silence of the given duration + :rtype: AudioSegment + """ + samples = np.zeros(int(duration * sample_rate)) + return type(self)(samples, sample_rate) + def to_bytes(self, dtype='float32'): """Create a byte string containing the audio content. @@ -247,52 +257,49 @@ class AudioSegment(object): self._samples = np.interp(new_indices, old_indices, self._samples) def normalize(self, target_db=-20, max_gain_db=300.0): - """Normalize audio to desired RMS value in decibels. + """Normalize audio to be desired RMS value in decibels. Note that this is an in-place transformation. - :param target_db: Target RMS value in decibels.This value - should be less than 0.0 as 0.0 is full-scale audio. - :type target_db: float, optional - :param max_gain_db: Max amount of gain in dB that can be applied - for normalization. This is to prevent nans when attempting - to normalize a signal consisting of all zeros. - :type max_gain_db: float, optional - - :raises NormalizationWarning: if the required gain to normalize the - segment to the target_db value exceeds max_gain_db. + :param target_db: Target RMS value in decibels. This value should + be less than 0.0 as 0.0 is full-scale audio. + :type target_db: float + :param max_gain_db: Max amount of gain in dB that can be applied for + normalization. This is to prevent nans when attempting + to normalize a signal consisting of all zeros. + :type max_gain_db: float + :raises ValueError: If the required gain to normalize the segment to + the target_db value exceeds max_gain_db. """ gain = target_db - self.rms_db if gain > max_gain_db: raise ValueError( - "Unable to normalize segment to {} dB because it has an RMS " - "value of {} dB and the difference exceeds max_gain_db ({} dB)" - .format(target_db, self.rms_db, max_gain_db)) - gain = min(max_gain_db, target_db - self.rms_db) - self.apply_gain(gain) + "Unable to normalize segment to %f dB because it has an RMS " + "value of %f dB and the difference exceeds max_gain_db (%f dB)" + % (target_db, self.rms_db, max_gain_db)) + self.apply_gain(min(max_gain_db, target_db - self.rms_db)) def normalize_online_bayesian(self, target_db, prior_db, prior_samples, startup_delay=0.0): - """ - Normalize audio using a production-compatible online/causal algorithm. - This uses an exponential likelihood and gamma prior to make - online estimates of the RMS even when there are very few samples. + """Normalize audio using a production-compatible online/causal algorithm. + This uses an exponential likelihood and gamma prior to make online estimates + of the RMS even when there are very few samples. Note that this is an in-place transformation. 
:param target_db: Target RMS value in decibels - :type target_bd: scalar + :type target_bd: float :param prior_db: Prior RMS estimate in decibels - :type prior_db: scalar + :type prior_db: float :param prior_samples: Prior strength in number of samples - :type prior_samples: scalar - :param startup_delay: Default: 0.0 s. If provided, this - function will accrue statistics for the first startup_delay - seconds before applying online normalization. - :type startup_delay: scalar + :type prior_samples: float + :param startup_delay: Default 0.0 s. If provided, this function will accrue + statistics for the first startup_delay seconds before + applying online normalization. + :type startup_delay: float """ # Estimate total RMS online startup_sample_idx = min(self.num_samples - 1, @@ -309,88 +316,54 @@ class AudioSegment(object): mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) / (sample_count + prior_samples)) rms_estimate_db = 10 * np.log10(mean_squared_estimate) - # Compute required time-varying gain gain_db = target_db - rms_estimate_db - - # Apply gain to new segment - self.apply_gain(gain_db) - - def normalize_ewma(self, - target_db, - decay_rate, - startup_delay, - rms_eps=1e-6, - max_gain_db=300.0): - startup_sample_idx = min(self.num_samples - 1, - int(self.sample_rate * startup_delay)) - mean_sq = self.samples**2 - if startup_sample_idx > 0: - mean_sq[:startup_sample_idx] = \ - np.sum(mean_sq[:startup_sample_idx]) / startup_sample_idx - idx_start = max(0, startup_sample_idx - 1) - initial_condition = mean_sq[idx_start] * decay_rate - mean_sq[idx_start:] = lfilter( - [1.0 - decay_rate], [1.0, -decay_rate], - mean_sq[idx_start:], - axis=0, - zi=[initial_condition])[0] - rms_estimate_db = 10.0 * np.log10(mean_sq + rms_eps) - gain_db = target_db - rms_estimate_db - if np.any(gain_db > max_gain_db): - warnings.warn( - "Unable to normalize segment to {} dB because it has an RMS " - "value of {} dB and the difference exceeds max_gain_db ({} dB)" - .format(target_db, self.rms_db, max_gain_db), - NormalizationWarning) - gain_db = np.minimum(gain_db, max_gain_db) self.apply_gain(gain_db) def resample(self, target_sample_rate, quality='sinc_medium'): - """Resample audio and return new AudioSegment. - This resamples the audio to a new sample rate and returns a brand - new AudioSegment. The existing AudioSegment is unchanged. + """Resample audio segment. This resamples the audio to a new + sample rate. Note that this is an in-place transformation. - :param new_sample_rate: target sample rate - :type new_sample_rate: scalar + :param target_sample_rate: Target sample rate + :type target_sample_rate: int :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}. - Sets resampling speed/quality tradeoff. - See http://www.mega-nerd.com/SRC/api_misc.html#Converters + Sets resampling speed/quality tradeoff. + See http://www.mega-nerd.com/SRC/api_misc.html#Converters :type quality: basestring """ resample_ratio = target_sample_rate / self._sample_rate new_samples = scikits.samplerate.resample( self._samples, r=resample_ratio, type=quality) self._samples = new_samples - self._sample_rate = new_sample_rate + self._sample_rate = target_sample_rate def pad_silence(self, duration, sides='both'): """Pads this audio sample with a period of silence. Note that this is an in-place transformation. 
- :param duration: length of silence in seconds to pad + :param duration: Length of silence in seconds to pad :type duration: float - :param sides: - 'beginning' - adds silence in the beginning - 'end' - adds silence in the end - 'both' - adds silence in both the beginning and the end. - :type sides: basestring + :param sides: Position for padding + 'beginning' - adds silence in the beginning + 'end' - adds silence in the end + 'both' - adds silence in both the beginning and the end. + :type sides: str + :raises ValueError: If the sides not surport """ if duration == 0.0: return self - cls = type(self) - silence = cls.make_silence(duration, self._sample_rate) + silence = self.make_silence(duration, self._sample_rate) if sides == "beginning": - padded = cls.concatenate(silence, self) + padded = self.concatenate(silence, self) elif sides == "end": - padded = cls.concatenate(self, silence) + padded = self.concatenate(self, silence) elif sides == "both": - padded = cls.concatenate(silence, self, silence) + padded = self.concatenate(silence, self, silence) else: - raise ValueError("Unknown value for the kwarg 'sides'") + raise ValueError("Unknown value for the kwarg %s" % sides) self._samples = padded._samples self._sample_rate = padded._sample_rate @@ -398,88 +371,83 @@ class AudioSegment(object): """Return new AudioSegment containing audio between given boundaries. :param start_sec: Beginning of subsegment in seconds, - (beginning of segment if None). - :type start_sec: scalar + (beginning of segment if None). + :type start_sec: float :param end_sec: End of subsegment in seconds, - (end of segment if None). - :type end_sec: scalar - - :return: New AudioSegment containing specified - subsegment. - :trype: AudioSegment + (end of segment if None). + :type end_sec: float + :return: New AudioSegment containing specified subsegment. + :rtype: AudioSegment """ - # Default boundaries - if start_sec is None: - start_sec = 0.0 - if end_sec is None: - end_sec = self.duration - + start_sec = 0.0 if start_sec is None else start_sec + end_sec = self.duration if end_sec is None else end_sec # negative boundaries are relative to end of segment if start_sec < 0.0: start_sec = self.duration + start_sec if end_sec < 0.0: end_sec = self.duration + end_sec - start_sample = int(round(start_sec * self._sample_rate)) end_sample = int(round(end_sec * self._sample_rate)) samples = self._samples[start_sample:end_sample] - return type(self)(samples, sample_rate=self._sample_rate) def random_subsegment(self, subsegment_length, rng=None): - """ - Return a random subsegment of a specified length in seconds. + """Return a random subsegment of a specified length in seconds. :param subsegment_length: Subsegment length in seconds. - :type subsegment_length: scalar + :type subsegment_length: float :param rng: Random number generator state - :type rng: random.Random [optional] - - - :return:clip (SpeechDLSegment): New SpeechDLSegmen containing random - subsegment of original segment. + :type rng: random.Random + :return: New AudioSegment containing random subsegment + of original segment + :rtype: AudioSegment + :raises ValueError: If the length of subsegment greater than origineal + segemnt. 
""" - if rng is None: - rng = random.Random() - + rng = random.Random() if rng is None else rng if subsegment_length > self.duration: raise ValueError("Length of subsegment must not be greater " "than original segment.") start_time = rng.uniform(0.0, self.duration - subsegment_length) return self.subsegment(start_time, start_time + subsegment_length) - def convolve(self, ir, allow_resampling=False): + def convolve(self, impulse_segment, allow_resample=False): """Convolve this audio segment with the given filter. - :param ir: impulse response - :type ir: AudioSegment - :param allow_resampling: indicates whether resampling is allowed - when the ir has a different sample rate from this signal. - :type allow_resampling: boolean - """ - if allow_resampling and self.sample_rate != ir.sample_rate: - ir = ir.resample(self.sample_rate) - - if self.sample_rate != ir.sample_rate: - raise ValueError("Impulse response sample rate ({}Hz) is " - "equal to base signal sample rate ({}Hz)." - .format(ir.sample_rate, self.sample_rate)) + Note that this is an in-place transformation. - samples = signal.fftconvolve(self.samples, ir.samples, "full") + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: boolean + :raises ValueError: If the sample rate is not match between two + audio segments and resample is not allowed. + """ + if allow_resample and self.sample_rate != impulse_segment.sample_rate: + impulse_segment = impulse_segment.resample(self.sample_rate) + if self.sample_rate != impulse_segment.sample_rate: + raise ValueError("Impulse segment's sample rate (%d Hz) is not" + "equal to base signal sample rate (%d Hz)." % + (impulse_segment.sample_rate, self.sample_rate)) + samples = signal.fftconvolve(self.samples, impulse_segment.samples, + "full") self._samples = samples - def convolve_and_normalize(self, ir, allow_resample=False): + def convolve_and_normalize(self, impulse_segment, allow_resample=False): """Convolve and normalize the resulting audio segment so that it has the same average power as the input signal. - :param ir: impulse response - :type ir: AudioSegment - :param allow_resampling: indicates whether resampling is allowed - when the ir has a different sample rate from this signal. - :type allow_resampling: boolean + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: indicates whether resampling is allowed when + the impulse_segment has a different sample rate from this signal. + :type allow_resample: boolean """ - self.convolve(ir, allow_resampling=allow_resampling) - self.normalize(target_db=self.rms_db) + target_db = self.rms_db + self.convolve(impulse_segment, allow_resample=allow_resample) + self.normalize(target_db) def add_noise(self, noise, @@ -492,36 +460,33 @@ class AudioSegment(object): of matching length is sampled from it and used instead. :param noise: Noise signal to add. - :type noise: SpeechDLSegment + :type noise: AudioSegment :param snr_dB: Signal-to-Noise Ratio, in decibels. - :type snr_dB: scalar - :param allow_downsampling: whether to allow the noise signal - to be downsampled to match the base signal sample rate. + :type snr_dB: float + :param allow_downsampling: whether to allow the noise signal to be downsampled + to match the base signal sample rate. 
:type allow_downsampling: boolean - :param max_gain_db: Maximum amount of gain to apply to noise - signal before adding it in. This is to prevent attempting - to apply infinite gain to a zero signal. - :type max_gain_db: scalar + :param max_gain_db: Maximum amount of gain to apply to noise signal before + adding it in. This is to prevent attempting to apply infinite + gain to a zero signal. + :type max_gain_db: float :param rng: Random number generator state. :type rng: random.Random - - Returns: - SpeechDLSegment: signal with noise added. + :raises ValueError: If the sample rate does not match between the two audio segments + and resample is not allowed, or if the duration of noise segments + is shorter than original audio segments. """ - if rng is None: - rng = random.Random() - + rng = random.Random() if rng is None else rng if allow_downsampling and noise.sample_rate > self.sample_rate: noise = noise.resample(self.sample_rate) - if noise.sample_rate != self.sample_rate: - raise ValueError("Noise sample rate ({}Hz) is not equal to " - "base signal sample rate ({}Hz)." - .format(noise.sample_rate, self.sample_rate)) + raise ValueError("Noise sample rate (%d Hz) is not equal to " + "base signal sample rate (%d Hz)." % + (noise.sample_rate, self.sample_rate)) if noise.duration < self.duration: - raise ValueError("Noise signal ({} sec) must be at " - "least as long as base signal ({} sec)." - .format(noise.duration, self.duration)) + raise ValueError("Noise signal (%f sec) must be at " + "least as long as base signal (%f sec)." % + (noise.duration, self.duration)) noise_gain_db = self.rms_db - noise.rms_db - snr_dB noise_gain_db = min(max_gain_db, noise_gain_db) noise_subsegment = noise.random_subsegment(self.duration, rng=rng) @@ -529,6 +494,12 @@ class AudioSegment(object): self._samples = output._samples self._sample_rate = output._sample_rate + def tranform_noise(self, noise_subsegment, noise_gain_db): + """ tranform noise file + """ + return type(self)(noise_subsegment._samples * (10.**( + noise_gain_db / 20.)), noise_subsegment._sample_rate) + @property def samples(self): """Return audio samples. @@ -618,9 +589,3 @@ class AudioSegment(object): else: raise TypeError("Unsupported sample type: %s." % samples.dtype) return output_samples.astype(dtype) - - def tranform_noise(self, noise_subsegment, noise_gain_db): - """ tranform noise file - """ - return type(self)(noise_subsegment._samples * (10.**( - noise_gain_db / 20.)), noise_subsegment._sample_rate) diff --git a/data_utils/augmentor/audio_database.py b/data_utils/augmentor/audio_database.py deleted file mode 100755 index e41c6dd7..00000000 --- a/data_utils/augmentor/audio_database.py +++ /dev/null @@ -1,401 +0,0 @@ -from __future__ import print_function -from collections import defaultdict -import bisect -import logging -import numpy as np -import os -import random -import sys - -UNK_TAG = "" - - -def stream_audio_index(fname, UNK=UNK_TAG): - """Reads an audio index file and emits one record in the index at a time. - - :param fname: audio index path - :type fname: basestring - :param UNK: UNK token to denote that certain audios are not tagged. - :type UNK: basesring - - Yields: - idx, duration, size, relpath, tags (int, float, int, str, list(str)): - audio file id, length of the audio in seconds, size in byte, - relative path w.r.t. 
to the root noise directory, list of tags - """ - with open(fname) as audio_index_file: - for i, line in enumerate(audio_index_file): - tok = line.strip().split("\t") - assert len(tok) >= 4, \ - "Invalid line at line {} in file {}".format( - i + 1, audio_index_file) - idx = int(tok[0]) - duration = float(tok[1]) - # Sometimes, the duration can round down to 0.0 - assert duration >= 0.0, \ - "Invalid duration at line {} in file {}".format( - i + 1, audio_index_file) - size = int(tok[2]) - assert size > 0, \ - "Invalid size at line {} in file {}".format( - i + 1, audio_index_file) - relpath = tok[3] - if len(tok) == 4: - tags = [UNK_TAG] - else: - tags = tok[4:] - yield idx, duration, size, relpath, tags - - -def truncate_float(val, ndigits=6): - """ Truncates a floating-point value to have the desired number of - digits after the decimal point. - - :param val: input value. - :type val: float - :parma ndigits: desired number of digits. - :type ndigits: int - - :return: truncated value - :rtype: float - """ - p = 10.0**ndigits - return float(int(val * p)) / p - - -def print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout): - """Prints an audio record to the index file. - - :param idx: Audio file id. - :type idx: int - :param duration: length of the audio in seconds - :type duration: float - :param size: size of the file in bytes - :type size: int - :param relpath: relative path w.r.t. to the root noise directory. - :type relpath: basestring - :parma tags: list of tags - :parma tags: list(str) - :parma file: file to which we want to write an audio record. - :type file: sys.stdout - """ - file.write("{}\t{:.6f}\t{}\t{}" - .format(idx, truncate_float(duration, ndigits=6), size, relpath)) - for tag in tags: - file.write("\t{}".format(tag)) - file.write("\n") - - -class AudioIndex(object): - """ In-memory index of audio files that do not have annotations. - This supports duration-based sampling and sampling from a target - distribution. - - Each line in the index file consists of the following fields: - (id (int), duration (float), size (int), relative path (str), - list of tags ([str])) - """ - - def __init__(self): - self.audio_dir = None - self.index_fname = None - self.tags = None - self.bin_size = 2.0 - self.clear() - - def clear(self): - """ Clears the index - - Returns: - None - """ - self.idx_to_record = {} - # The list of indices correspond to audio files whose duration is - # greater than or equal to the key. - self.duration_to_id_set = {} - self.duration_to_id_set_per_tag = defaultdict(lambda: {}) - self.duration_to_list = defaultdict(lambda: []) - self.duration_to_list_per_tag = defaultdict( - lambda: defaultdict(lambda: [])) - self.tag_to_id_set = defaultdict(lambda: set()) - self.shared_duration_bins = [] - self.id_set_complete = set() - self.id_set = set() - self.duration_bins = [] - - def has_audio(self, distr=None): - """ - :param distr: The target distribution of audio tags that we want to - match. If this is not supplied, the function simply checks that - there are some audio files. - :parma distr: dict - :return: True if there are audio files. - :rtype: boolean - """ - if distr is None: - return len(self.id_set) > 0 - else: - for tag in distr: - if tag not in self.duration_to_list_per_tag: - return False - return True - - def _load_all_records_from_disk(self, audio_dir, idx_fname, bin_size): - """Loads all audio records from the disk into memory and groups them - into chunks based on their duration and the bin_size granalarity. 
- - Once all the records are read, indices are built from these records - by another function so that the audio samples can be drawn efficiently. - - Updates: - self.audio_dir (path): audio root directory - self.idx_fname (path): audio database index filename - self.bin_size (float): granularity of bins - self.idx_to_record (dict): maps from the audio id to - (duration, file_size, relative_path, tags) - self.tag_to_id_set (dict): maps from the tag to - the set of id's of audios that have this tag. - self.id_set_complete (set): set of all audio id's in the index file - self.min_duration (float): minimum audio duration observed in the - index file - self.duration_bins (list): the lower bounds on the duration of - audio files falling in each bin - self.duration_to_id_set (dict): contains (k, v) where v is the set - of id's of audios whose lengths are longer than or equal to k. - (e.g. k is the duration lower bound of this bin). - self.duration_to_id_set_per_tag (dict): Something like above but - has a finer granularity mapping from the tag to - duration_to_id_set. - self.shared_duration_bins (list): list of sets where each set - contains duration lower bounds whose audio id sets are the - same. The rationale for having this is that there are a few - but extremely long audio files which lead to a lot of bins. - When the id sets do not change across various minimum duration - boundaries, we - cluster these together and make them point to the same id set - reference. - - :return: whether the records were read from the disk. The assumption is - that the audio index file on disk and the actual audio files - are constructed once and never change during training. We only - re-read when either the directory or the index file path change. - """ - if self.audio_dir == audio_dir and self.idx_fname == idx_fname and \ - self.bin_size == bin_size: - # The audio directory and/or the list of audio files - # haven't changed. No need to load the list again. - return False - - # Remember where the audio index is most recently read from. - self.audio_dir = audio_dir - self.idx_fname = idx_fname - self.bin_size = bin_size - - # Read in the idx and compute the number of bins necessary - self.clear() - rank = [] - min_duration = float('inf') - max_duration = float('-inf') - for idx, duration, file_size, relpath, tags in \ - stream_audio_index(idx_fname): - self.idx_to_record[idx] = (duration, file_size, relpath, tags) - max_duration = max(max_duration, duration) - min_duration = min(min_duration, duration) - rank.append((duration, idx)) - for tag in tags: - self.tag_to_id_set[tag].add(idx) - if len(rank) == 0: - # file is empty - raise IOError("Index file {} is empty".format(idx_fname)) - for tag in self.tag_to_id_set: - self.id_set_complete |= self.tag_to_id_set[tag] - dur = min_duration - self.min_duration = min_duration - while dur < max_duration + bin_size: - self.duration_bins.append(dur) - dur += bin_size - - # Sort in decreasing order of duration and populate - # the cumulative indices lists. - rank.sort(reverse=True) - - # These are indices for `rank` and used to keep track of whether - # there are new records to add in the current bin. - last = 0 - cur = 0 - - # The set of audios falling in the previous bin; in the case, - # where we don't find new audios for the current bin, we store - # the reference to the last set so as to conserve memory. - # This is not such a big problem if the audio duration is - # bounded by a small number like 30 seconds and the - # bin size is big enough. 
But, for raw freesound audios, - # some audios can be as long as a few hours! - last_audio_set = set() - - # The same but for each tag so that we can pick audios based on - # tags and also some user-specified tag distribution. - last_audio_set_per_tag = defaultdict(lambda: set()) - - # Set of lists of bins sharing the same audio sets. - shared = set() - - for i in range(len(self.duration_bins) - 1, -1, -1): - lower_bound = self.duration_bins[i] - new_audio_idxs = set() - new_audio_idxs_per_tag = defaultdict(lambda: set()) - while cur < len(rank) and rank[cur][0] >= lower_bound: - idx = rank[cur][1] - tags = self.idx_to_record[idx][3] - new_audio_idxs.add(idx) - for tag in tags: - new_audio_idxs_per_tag[tag].add(idx) - cur += 1 - # This makes certain that the same list is shared across - # different bins if no new indices are added. - if cur == last: - shared.add(lower_bound) - else: - last_audio_set = last_audio_set | new_audio_idxs - for tag in new_audio_idxs_per_tag: - last_audio_set_per_tag[tag] = \ - last_audio_set_per_tag[tag] | \ - new_audio_idxs_per_tag[tag] - if len(shared) > 0: - self.shared_duration_bins.append(shared) - shared = set([lower_bound]) - ### last_audio_set = set() should set blank - last = cur - self.duration_to_id_set[lower_bound] = last_audio_set - for tag in last_audio_set_per_tag: - self.duration_to_id_set_per_tag[lower_bound][tag] = \ - last_audio_set_per_tag[tag] - - # The last `shared` record isn't added to the `shared_duration_bins`. - self.shared_duration_bins.append(shared) - - # We make sure that the while loop above has exhausted through the - # `rank` list by checking if the `cur`rent index in `rank` equals - # the length of the array, which is the halting condition. - assert cur == len(rank) - - return True - - def _build_index_from_records(self, tag_list): - """ Uses the in-memory records read from the index file to build - an in-memory index restricted to the given tag list. - - :param tag_list: List of tags we are interested in sampling from. - :type tag_list: list(str) - - Updates: - self.id_set (set): the set of all audio id's that can be sampled. - self.duration_to_list (dict): maps from the duration lower bound - to the id's of audios longer than this duration. - self.duration_to_list_per_tag (dict): maps from the tag to - the same structure as self.duration_to_list. This is to support - sampling from a target noise distribution. - - :return: whether the index was built from scratch - """ - if self.tags == tag_list: - return False - - self.tags = tag_list - if len(tag_list) == 0: - self.id_set = self.id_set_complete - else: - self.id_set = set() - for tag in tag_list: - self.id_set |= self.tag_to_id_set[tag] - - # Next, we need to take a subset of the audio files - for shared in self.shared_duration_bins: - # All bins in `shared' have the same index lists - # so we can intersect once and set all of them to this list. 
- lb = list(shared)[0] - intersected = list(self.id_set & self.duration_to_id_set[lb]) - duration_to_id_set = self.duration_to_id_set_per_tag[lb] - intersected_per_tag = { - tag: self.tag_to_id_set[tag] & duration_to_id_set[tag] - for tag in duration_to_id_set - } - for bin_key in shared: - self.duration_to_list[bin_key] = intersected - for tag in intersected_per_tag: - self.duration_to_list_per_tag[tag][bin_key] = \ - intersected_per_tag[tag] - assert len(self.duration_to_list) == len(self.duration_to_id_set) - return True - - def refresh_records_from_index_file(self, - audio_dir, - idx_fname, - tag_list, - bin_size=2.0): - """ Loads the index file and populates the records - for building the internal index. - - If the audio directory or index file name has changed, the whole index - is reloaded from scratch. If only the tag_list is changed, then the - desired index is built from the complete, in-memory record. - - :param audio_dir: audio directory - :type audio_dir: basestring - :param idx_fname: audio index file name - :type idex_fname: basestring - :param tag_list: list of tags we are interested in loading; - if empty, we load all. - :type tag_list: list - :param bin_size: optional argument for controlling the granularity - of duration bins - :type bin_size: float - """ - if tag_list is None: - tag_list = [] - reloaded_records = self._load_all_records_from_disk(audio_dir, - idx_fname, bin_size) - if reloaded_records or self.tags != tag_list: - self._build_index_from_records(tag_list) - logger.info('loaded {} audio files from {}' - .format(len(self.id_set), idx_fname)) - - def sample_audio(self, duration, rng=None, distr=None): - """ Uniformly draws an audio record of at least the desired duration - - :param duration: minimum desired audio duration - :type duration: float - :param rng: random number generator - :type rng: random.Random - :param distr: target distribution of audio tags. If not provided, - :type distr: dict - all audio files are sampled uniformly at random. - - :returns: success, (duration, file_size, path) - """ - if duration < 0.0: - duration = self.min_duration - i = bisect.bisect_left(self.duration_bins, duration) - if i == len(self.duration_bins): - return False, None - bin_key = self.duration_bins[i] - if distr is None: - indices = self.duration_to_list[bin_key] - else: - # If a desired audio distribution is given, we sample from it. 
- if rng is None: - rng = random.Random() - nprng = np.random.RandomState(rng.getrandbits(32)) - prob_masses = distr.values() - prob_masses /= np.sum(prob_masses) - tag = nprng.choice(distr.keys(), p=prob_masses) - indices = self.duration_to_list_per_tag[tag][bin_key] - if len(indices) == 0: - return False, None - else: - if rng is None: - rng = random.Random() - # duration, file size and relative path from root - s = self.idx_to_record[rng.sample(indices, 1)[0]] - s = (s[0], s[1], os.path.join(self.audio_dir, s[2])) - return True, s diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py index c0a70ad1..abe1a0ec 100755 --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -6,11 +6,6 @@ from __future__ import print_function import json import random from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor -from data_utils.augmentor.resamler import ResamplerAugmentor -from data_utils.augmentor.speed_perturb import SpeedPerturbatioAugmentor -from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor -from data_utils.augmentor.Impulse_response import ImpulseResponseAugmentor -from data_utils.augmentor.noise_speech import NoiseSpeechAugmentor class AugmentationPipeline(object): @@ -81,15 +76,5 @@ class AugmentationPipeline(object): """Return an augmentation model by the type name, and pass in params.""" if augmentor_type == "volume": return VolumePerturbAugmentor(self._rng, **params) - if augmentor_type == "resamle": - return ResamplerAugmentor(self._rng, **params) - if augmentor_type == "speed": - return SpeedPerturbatioAugmentor(self._rng, **params) - if augmentor_type == "online_bayesian_normalization": - return OnlineBayesianNormalizationAugmentor(self._rng, **params) - if augmentor_type == "Impulse_response": - return ImpulseResponseAugmentor(self._rng, **params) - if augmentor_type == "noise_speech": - return NoiseSpeechAugmentor(self._rng, **params) else: raise ValueError("Unknown augmentor type [%s]." % augmentor_type) diff --git a/data_utils/augmentor/implus_response.py b/data_utils/augmentor/implus_response.py deleted file mode 100755 index cc205342..00000000 --- a/data_utils/augmentor/implus_response.py +++ /dev/null @@ -1,76 +0,0 @@ -""" Impulse response""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base -from . import audio_database -from data_utils.speech import SpeechSegment - - -class ImpulseResponseAugmentor(base.AugmentorBase): - """ Instantiates an impulse response model - - :param ir_dir: directory containing impulse responses - :type ir_dir: basestring - :param tags: optional parameter for specifying what - particular impulse responses to apply. - :type tags: list - :parm tag_distr: optional noise distribution - :type tag_distr: dict - """ - - def __init__(self, rng, ir_dir, index_file, tags=None, tag_distr=None): - # Define all required parameter maps here. - self.ir_dir = ir_dir - self.index_file = index_file - - self.tags = tags - self.tag_distr = tag_distr - - self.audio_index = audio_database.AudioIndex() - self.rng = rng - - def _init_data(self): - """ Preloads stuff from disk in an attempt (e.g. list of files, etc) - to make later loading faster. If the data configuration remains the - same, this function does nothing. 
- - """ - self.audio_index.refresh_records_from_index_file( - self.ir_dir, self.index_file, self.tags) - - def transform_audio(self, audio_segment): - """ Convolves the input audio with an impulse response. - - :param audio_segment: input audio - :type audio_segment: AudioSegemnt - """ - # This handles the cases where the data source or directories change. - self._init_data() - - read_size = 0 - tag_distr = self.tag_distr - if not self.audio_index.has_audio(tag_distr): - if tag_distr is None: - if not self.tags: - raise RuntimeError("The ir index does not have audio " - "files to sample from.") - else: - raise RuntimeError("The ir index does not have audio " - "files of the given tags to sample " - "from.") - else: - raise RuntimeError("The ir index does not have audio " - "files to match the target ir " - "distribution.") - else: - # Querying with a negative duration triggers the index to search - # from all impulse responses. - success, record = self.audio_index.sample_audio( - -1.0, rng=self.rng, distr=tag_distr) - if success is True: - _, read_size, ir_fname = record - ir_wav = SpeechSegment.from_file(ir_fname) - audio_segment.convolve(ir_wav, allow_resampling=True) diff --git a/data_utils/augmentor/noise_speech.py b/data_utils/augmentor/noise_speech.py deleted file mode 100755 index 8cf7c27b..00000000 --- a/data_utils/augmentor/noise_speech.py +++ /dev/null @@ -1,318 +0,0 @@ -""" noise speech -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import logging -import numpy as np -import os -from collections import defaultdict - -from . import base -from . import audio_database -from data_utils.speech import SpeechSegment - -TURK = "turk" -USE_AUDIO_DATABASE_SOURCES = frozenset(["freesound", "chime"]) -HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0 -FIND_NOISE_MAX_ATTEMPTS = 20 - -logger = logging.getLogger(__name__) - - -def get_first_smaller(items, value): - index = bisect.bisect_left(items, value) - 1 - assert items[index] < value, \ - 'get_first_smaller failed! %d %d' % (items[index], value) - return items[index] - - -def get_first_larger(items, value): - 'Find leftmost value greater than value' - index = bisect.bisect_right(items, value) - assert index < len(items), \ - "no noise bin exists for this audio length (%f)" % value - assert items[index] > value, \ - 'get_first_larger failed! %d %d' % (items[index], value) - return items[index] - - -def _get_turk_noise_files(noise_dir, index_file): - """ Creates a map from duration => a list of noise filenames - - :param noise_dir: Directory of noise files which contains - "noise-samples-list" - :type noise_dir: basestring - :param index_file: Noise list - :type index_file: basestring - - returns:noise_files (defaultdict): A map of bins to noise files. - Each key is the duration, and the value is a list of noise - files binned to this duration. Each bin is 2 secs. 
- - Note: noise-samples-list should contain one line per noise (wav) file - along with its duration in milliseconds - """ - noise_files = defaultdict(list) - if not os.path.exists(index_file): - logger.error('No noise files were found at {}'.format(index_file)) - return noise_files - num_noise_files = 0 - rounded_durations = list(range(0, 65, 2)) - with open(index_file, 'r') as fl: - for line in fl: - fname = os.path.join(noise_dir, line.strip().split()[0]) - duration = float(line.strip().split()[1]) / 1000 - # bin the noise files into length bins rounded by 2 sec - bin_id = get_first_smaller(rounded_durations, duration) - noise_files[bin_id].append(fname) - num_noise_files += 1 - logger.info('Loaded {} turk noise files'.format(num_noise_files)) - return noise_files - - -class NoiseSpeechAugmentor(base.AugmentorBase): - """ Noise addition block - - :param snr_min: minimum signal-to-noise ratio - :type snr_min: float - :param snr_max: maximum signal-to-noise ratio - :type snr_max: float - :param noise_dir: root of where noise files are stored - :type noise_fir: basestring - :param index_file: index of noises of interest in noise_dir - :type index_file: basestring - :param source: select one from - - turk - - freesound - - chime - Note that this field is no longer required for the freesound - and chime - :type source: string - :param tags: optional parameter for specifying what - particular noises we want to add. See above for the available tags. - :type tags: list - :param tag_distr: optional noise distribution - :type tag_distr: dict - """ - - def __init__(self, - rng, - snr_min, - snr_max, - noise_dir, - source, - allow_downsampling=None, - index_file=None, - tags=None, - tag_distr=None): - # Define all required parameter maps here. - self.rng = rng - self.snr_min = snr_min - self.snr_max = snr_max - self.noise_dir = noise_dir - self.source = source - - self.allow_downsampling = allow_downsampling - self.index_file = index_file - self.tags = tags - self.tag_distr = tag_distr - - # When new noise sources are added, make sure to define the - # associated bookkeeping variables here. - self.turk_noise_files = [] - self.turk_noise_dir = None - self.audio_index = audio_database.AudioIndex() - - def _init_data(self): - """ Preloads stuff from disk in an attempt (e.g. list of files, etc) - to make later loading faster. If the data configuration remains the - same, this function does nothing. 
- - """ - noise_dir = self.noise_dir - index_file = self.index_file - source = self.source - if not index_file: - if source == TURK: - index_file = os.path.join(noise_dir, 'noise-samples-list') - logger.debug("index_file not provided; " + "defaulting to " + - index_file) - else: - if source != "": - assert source in USE_AUDIO_DATABASE_SOURCES, \ - "{} not supported by audio_database".format(source) - index_file = os.path.join(noise_dir, - "audio_index_commercial.txt") - logger.debug("index_file not provided; " + "defaulting to " + - index_file) - - if source == TURK: - if self.turk_noise_dir != noise_dir: - self.turk_noise_dir = noise_dir - self.turk_noise_files = _get_turk_noise_files(noise_dir, - index_file) - # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES: - else: - if source != "": - assert source in USE_AUDIO_DATABASE_SOURCES, \ - "{} not supported by audio_database".format(source) - self.audio_index.refresh_records_from_index_file( - self.noise_dir, index_file, self.tags) - - def transform_audio(self, audio_segment): - """Adds walla noise - - :param audio_segment: Input audio - :type audio_segment: SpeechSegment - """ - # This handles the cases where the data source or directories change. - self._init_data - source = self.source - allow_downsampling = self.allow_downsampling - if source == TURK: - self._add_turk_noise(audio_segment, self.rng, allow_downsampling) - # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES: - else: - self._add_noise(audio_segment, self.rng, allow_downsampling) - - def _sample_snr(self): - """ Returns a float sampled in [`self.snr_min`, `self.snr_max`] - if both `self.snr_min` and `self.snr_max` are non-zero. - """ - snr_min = self.snr_min - snr_max = self.snr_max - sampled_snr = self.rng.uniform(snr_min, snr_max) - return sampled_snr - - def _add_turk_noise(self, audio_segment, allow_downsampling): - """ Adds a turk noise to the input audio. - - :param audio_segment: input audio - :type audio_segment: audiosegment - :param allow_downsampling: indicates whether downsampling - is allowed - :type allow_downsampling: boolean - """ - read_size = 0 - if len(self.turk_noise_files) > 0: - snr = self._sample_snr(self.rng) - # Draw the noise file randomly from noise files that are - # slightly longer than the utterance - noise_bins = sorted(self.turk_noise_files.keys()) - # note some bins can be empty, so we can't just round up - # to the nearest 2-sec interval - rounded_duration = get_first_larger(noise_bins, - audio_segment.duration) - noise_fname = \ - self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0] - noise = SpeechSegment.from_wav_file(noise_fname) - logger.debug('noise_fname {}'.format(noise_fname)) - logger.debug('snr {}'.format(snr)) - read_size = len(noise) * 2 - # May throw exceptions, but this is caught by - # AudioFeaturizer.get_audio_files. - audio_segment.add_noise( - noise, snr, rng=self.rng, allow_downsampling=allow_downsampling) - - def _add_noise(self, audio_segment, allow_downsampling): - """ Adds a noise indexed in audio_database.AudioIndex. 
- - :param audio_segment: input audio - :type audio_segment: SpeechSegment - :param allow_downsampling: indicates whether downsampling - is allowed - :type allow_downsampling: boolean - - Returns: - (SpeechSegment, int) - - sound with turk noise added - - number of bytes read from disk - """ - read_size = 0 - tag_distr = self.tag_distr - if not self.audio_index.has_audio(tag_distr): - if tag_distr is None: - if not self.tags: - raise RuntimeError("The noise index does not have audio " - "files to sample from.") - else: - raise RuntimeError("The noise index does not have audio " - "files of the given tags to sample " - "from.") - else: - raise RuntimeError("The noise index does not have audio " - "files to match the target noise " - "distribution.") - else: - # Compute audio segment related statistics - audio_duration = audio_segment.duration - - # Sample relevant augmentation parameters. - snr = self._sample_snr(self.rng) - - # Perhaps, we may not have a sufficiently long noise, so we need - # to search iteratively. - min_duration = audio_duration + 0.25 - for _ in range(FIND_NOISE_MAX_ATTEMPTS): - logger.debug("attempting to find noise of length " - "at least {}".format(min_duration)) - - success, record = \ - self.audio_index.sample_audio(min_duration, - rng=self.rng, - distr=tag_distr) - - if success is True: - noise_duration, read_size, noise_fname = record - - # Assert after logging so we know - # what caused augmentation to fail. - logger.debug("noise_fname {}".format(noise_fname)) - logger.debug("snr {}".format(snr)) - assert noise_duration >= min_duration - break - - # Decrease the desired minimum duration linearly. - # If the value becomes smaller than some threshold, - # we half the value instead. - if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD: - min_duration -= 2.0 - else: - min_duration *= 0.5 - - if success is False: - logger.info("Failed to find a noise file") - return - - diff_duration = audio_duration + 0.25 - noise_duration - if diff_duration >= 0.0: - # Here, the noise is shorter than the audio file, so - # we pad with zeros to make sure the noise sound is applied - # with a uniformly random shift. - noise = SpeechSegment.from_file(noise_fname) - noise = noise.pad_silence(diff_duration, sides="both") - else: - # The noise clip is at least ~25 ms longer than the audio - # segment here. - diff_duration = int(noise_duration * audio_segment.sample_rate) - \ - int(audio_duration * audio_segment.sample_rate) - \ - int(0.02 * audio_segment.sample_rate) - start = float(self.rng.randint(0, diff_duration)) / \ - audio.sample_rate - finish = min(start + audio_duration + 0.2, noise_duration) - noise = SpeechSegment.slice_from_file(noise_fname, start, - finish) - - if len(noise) < len(audio_segment): - # This is to ensure that the noise clip is at least as - # long as the audio segment. - num_samples_to_pad = len(audio_segment) - len(noise) - # Padding this amount of silence on both ends ensures that - # the placement of the noise clip is uniformly random. 
- silence = SpeechSegment( - np.zeros(num_samples_to_pad), audio_segment.sample_rate) - noise = SpeechSegment.concatenate(silence, noise, silence) - - audio_segment.add_noise( - noise, snr, rng=self.rng, allow_downsampling=allow_downsampling) diff --git a/data_utils/augmentor/online_bayesian_normalization.py b/data_utils/augmentor/online_bayesian_normalization.py deleted file mode 100755 index bc2d6c1b..00000000 --- a/data_utils/augmentor/online_bayesian_normalization.py +++ /dev/null @@ -1,57 +0,0 @@ -""" Online bayesian normalization -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base - - -class OnlineBayesianNormalizationAugmentor(base.AugmentorBase): - """ - Instantiates an online bayesian normalization module. - :param target_db: Target RMS value in decibels - :type target_db: func[int->scalar] - :param prior_db: Prior RMS estimate in decibels - :type prior_db: func[int->scalar] - :param prior_samples: Prior strength in number of samples - :type prior_samples: func[int->scalar] - :param startup_delay: Start-up delay in seconds during - which normalization statistics is accrued. - :type starup_delay: func[int->scalar] - """ - - def __init__(self, - rng, - target_db, - prior_db, - prior_samples, - startup_delay=base.parse_parameter_from(0.0)): - - self.target_db = target_db - self.prior_db = prior_db - self.prior_samples = prior_samples - self.startup_delay = startup_delay - self.rng = rng - - def transform_audio(self, audio_segment): - """ - Normalizes the input audio using the online Bayesian approach. - - :param audio_segment: input audio - :type audio_segment: SpeechSegment - :param iteration: current iteration - :type iteration: int - :param text: audio transcription - :type text: basestring - :param rng: RNG to use for augmentation - :type rng: random.Random - - """ - read_size = 0 - target_db = self.target_db(iteration) - prior_db = self.prior_db(iteration) - prior_samples = self.prior_samples(iteration) - startup_delay = self.startup_delay(iteration) - audio.normalize_online_bayesian( - target_db, prior_db, prior_samples, startup_delay=startup_delay) diff --git a/data_utils/augmentor/resampler.py b/data_utils/augmentor/resampler.py deleted file mode 100755 index 1b959be5..00000000 --- a/data_utils/augmentor/resampler.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base - - -class ResamplerAugmentor(base.AugmentorBase): - """ Instantiates a resampler module. - - :param new_sample_rate: New sample rate in Hz - :type new_sample_rate: func[int->scalar] - :param rng: Random generator object. - :type rng: random.Random - """ - - def __init__(self, rng, new_sample_rate): - self.new_sample_rate = new_sample_rate - self._rng = rng - - def transform_audio(self, audio_segment): - """ Resamples the input audio to the target sample rate. - - Note that this is an in-place transformation. 
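The normalization module removed above referenced an undefined `iteration` and `audio`, but the estimator it wrapped survives as `AudioSegment.normalize_online_bayesian`. A numpy sketch of that causal RMS estimate under the same prior-blending scheme (function name and defaults here are illustrative):

```
import numpy as np


def online_gain_db(samples, target_db=-20.0, prior_db=-20.0,
                   prior_samples=1000.0):
    """Per-sample gain from a causal, prior-smoothed RMS estimate."""
    prior_mean_squared = 10.0 ** (prior_db / 10.0)
    prior_sum_of_squares = prior_mean_squared * prior_samples
    cumsum_of_squares = np.cumsum(samples ** 2)
    sample_count = np.arange(len(samples)) + 1.0
    # Blend the running sum of squares with the prior so the estimate is
    # usable even over the first few samples.
    mean_squared = ((cumsum_of_squares + prior_sum_of_squares) /
                    (sample_count + prior_samples))
    return target_db - 10.0 * np.log10(mean_squared)


x = 0.3 * np.random.randn(8000)
normalized = x * 10.0 ** (online_gain_db(x) / 20.0)
```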
- - :param audio: input audio - :type audio: SpeechDLSegment - """ - new_sample_rate = self.new_sample_rate - audio.resample(new_sample_rate) \ No newline at end of file diff --git a/data_utils/augmentor/speed_perturb.py b/data_utils/augmentor/speed_perturb.py deleted file mode 100755 index e09be5f7..00000000 --- a/data_utils/augmentor/speed_perturb.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Speed perturbation module for making ASR robust to different voice -types (high pitched, low pitched, etc) -Samples uniformly between speed_min and speed_max -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base - - -class SpeedPerturbatioAugmentor(base.AugmentorBase): - """ - Instantiates a speed perturbation module. - - See reference paper here: - - http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf - - :param speed_min: Lower bound on new rate to sample - :type speed_min: func[int->scalar] - :param speed_max: Upper bound on new rate to sample - :type speed_max: func[int->scalar] - """ - - def __init__(self, rng, speed_min, speed_max): - - if (speed_min < 0.9): - raise ValueError( - "Sampling speed below 0.9 can cause unnatural effects") - if (speed_min > 1.1): - raise ValueError( - "Sampling speed above 1.1 can cause unnatural effects") - self.speed_min = speed_min - self.speed_max = speed_max - self.rng = rng - - def transform_audio(self, audio_segment): - """ - Samples a new speed rate from the given range and - changes the speed of the given audio clip. - - Note that this is an in-place transformation. - - :param audio_segment: input audio - :type audio_segment: SpeechDLSegment - """ - read_size = 0 - speed_min = self.speed_min(iteration) - speed_max = self.speed_max(iteration) - sampled_speed = rng.uniform(speed_min, speed_max) - audio = audio.change_speed(sampled_speed) diff --git a/data_utils/augmentor/volume_perturb.py b/data_utils/augmentor/volume_perturb.py index 15055b91..a5a9f6ca 100755 --- a/data_utils/augmentor/volume_perturb.py +++ b/data_utils/augmentor/volume_perturb.py @@ -3,10 +3,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from . import base +from data_utils.augmentor.base import AugmentorBase -class VolumePerturbAugmentor(base.AugmentorBase): +class VolumePerturbAugmentor(AugmentorBase): """Augmentation model for adding random volume perturbation. This is used for multi-loudness training of PCEN. See diff --git a/requirements.txt b/requirements.txt index 58a93deb..c37e88ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ SoundFile==0.9.0.post1 wget==3.2 +scikits.samplerate==0.3.3 +scipy==0.13.0b1 From d1ee10be102263da5fbfac1e131c31ed605b5ad0 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Fri, 16 Jun 2017 18:29:56 +0800 Subject: [PATCH 28/55] modify audio and speech --- data_utils/audio.py | 14 ++++++++------ data_utils/speech.py | 26 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index ee4e6d84..066437dc 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -104,7 +104,8 @@ class AudioSegment(object): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) - def concatenate(self, *segments): + @classmethod + def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. 
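One more latent bug worth recording from the removals above: the speed perturbation module validated `speed_min` twice, so an out-of-range `speed_max` was never caught. A corrected, self-contained sketch of its sampling step, with the 0.9/1.1 bounds taken from the deleted checks:

```
import random


def sample_speed_rate(rng, speed_min, speed_max):
    """Draw a speed rate uniformly, rejecting unnatural-sounding bounds."""
    if speed_min < 0.9:
        raise ValueError("Sampling speed below 0.9 can cause unnatural effects")
    if speed_max > 1.1:  # the deleted code re-checked speed_min here
        raise ValueError("Sampling speed above 1.1 can cause unnatural effects")
    return rng.uniform(speed_min, speed_max)


rate = sample_speed_rate(random.Random(0), 0.9, 1.1)
assert 0.9 <= rate <= 1.1
```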
:param *segments: Input audio segments @@ -123,11 +124,11 @@ class AudioSegment(object): if sample_rate != seg._sample_rate: raise ValueError("Can't concatenate segments with " "different sample rates") - if type(seg) is not type(self): + if type(seg) is not cls: raise TypeError("Only audio segments of the same type " "instance can be concatenated.") samples = np.concatenate([seg.samples for seg in segments]) - return type(self)(samples, sample_rate) + return cls(samples, sample_rate) def to_wav_file(self, filepath, dtype='float32'): """Save audio segment to disk as wav file. @@ -355,13 +356,14 @@ class AudioSegment(object): """ if duration == 0.0: return self + cls = type(self) silence = self.make_silence(duration, self._sample_rate) if sides == "beginning": - padded = self.concatenate(silence, self) + padded = cls.concatenate(silence, self) elif sides == "end": - padded = self.concatenate(self, silence) + padded = cls.concatenate(self, silence) elif sides == "both": - padded = self.concatenate(silence, self, silence) + padded = cls.concatenate(silence, self, silence) else: raise ValueError("Unknown value for the kwarg %s" % sides) self._samples = padded._samples diff --git a/data_utils/speech.py b/data_utils/speech.py index 48db595b..5d1fc15a 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -65,6 +65,32 @@ class SpeechSegment(AudioSegment): audio = AudioSegment.from_bytes(bytes) return cls(audio.samples, audio.sample_rate, transcript) + @classmethod + def concatenate(cls, *segments): + """Concatenate an arbitrary number of audio segments together. + + :param *segments: Input speech segments + :type *segments: SpeechSegment + :return: Speech segment instance. + :rtype: SpeechSegment + :raises ValueError: If number of segments is zero, or if sample_rate + not match between two audio segments + :raises TypeError: If item of segments is not Audiosegment instance + """ + # Perform basic sanity-checks. + if len(segments) == 0: + raise ValueError("No audio segments are given to concatenate.") + sample_rate = segments[0]._sample_rate + for seg in segments: + if sample_rate != seg._sample_rate: + raise ValueError("Can't concatenate segments with " + "different sample rates") + if type(seg) is not cls: + raise TypeError("Only speech segments of the same type " + "instance can be concatenated.") + samples = np.concatenate([seg.samples for seg in segments]) + return cls(samples, sample_rate, seg._transcript) + @property def transcript(self): """Return the transcript text. From 5ca270d30a34c71b0b851ed376fb7e7d90b3cf17 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sat, 17 Jun 2017 09:03:18 +0800 Subject: [PATCH 29/55] add audio file --- data_utils/audio.py | 245 ++++++++++++++++++++----------------------- data_utils/speech.py | 55 ++++++++-- 2 files changed, 161 insertions(+), 139 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 066437dc..1f75da8a 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -47,32 +47,6 @@ class AudioSegment(object): """Return whether two objects are unequal.""" return not self.__eq__(other) - def __len__(self): - """Returns length of segment in samples.""" - return self.num_samples - - def __add__(self, other): - """Add samples from another segment to those of this segment and return - a new segment (sample-wise addition, not segment concatenation). - - :param other: Segment containing samples to be - added in. - :type other: AudioSegment - :return: New segment containing resulting samples. 
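This commit also drops `__len__` and `__add__`; sample-wise mixing later returns as an explicit `superimpose` method. The distinction between the two ways of combining segments, in plain numpy:

```
import numpy as np

a = np.array([0.1, 0.2, 0.3])
b = np.array([0.3, 0.2, 0.1])
mixed = a + b                    # superimpose: same length, summed samples
joined = np.concatenate([a, b])  # concatenate: durations add up
assert mixed.shape == (3,) and joined.shape == (6,)
```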
- :rtype: AudioSegment - :raise TypeError: If sample rates of segments don't match, - or if length of segments don't match. - """ - if type(self) != type(other): - raise TypeError("Cannot add segment of different type: {}" - .format(type(other))) - if self._sample_rate != other._sample_rate: - raise TypeError("Sample rates must match to add segments.") - if len(self._samples) != len(other._samples): - raise TypeError("Segment lengths must match to add segments.") - samples = self.samples + other.samples - return type(self)(samples, sample_rate=self._sample_rate) - def __str__(self): """Return human-readable representation of segment.""" return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " @@ -108,13 +82,13 @@ class AudioSegment(object): def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. - :param *segments: Input audio segments + :param *segments: Input audio segments. :type *segments: AudioSegment - :return: Audio segment instance. + :return: Audio segment instance as concatenating results. :rtype: AudioSegment - :raises ValueError: If number of segments is zero, or if sample_rate - not match between two audio segments - :raises TypeError: If item of segments is not Audiosegment instance + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any two segments does not match. + :raises TypeError: If every segment in is not Audiosegment instance. """ # Perform basic sanity-checks. if len(segments) == 0: @@ -155,12 +129,13 @@ class AudioSegment(object): format='WAV', subtype=subtype_map[dtype]) - def slice_from_file(self, file, start=None, end=None): + @classmethod + def slice_from_file(cls, file, start=None, end=None): """Loads a small section of an audio without having to load the entire file into the memory which can be incredibly wasteful. - :param file: Input audio filepath - :type file: basestring + :param file: Input audio filepath or file object. + :type file: basestring|file :param start: Start time in seconds. If start is negative, it wraps around from the end. If not provided, this function reads from the very beginning. @@ -169,9 +144,11 @@ class AudioSegment(object): from the end. If not provided, the default behvaior is to read to the end of the file. :type end: float - :return: The specified slice of input audio in the audio.AudioSegment format. + :return: AudioSegment instance of the specified slice of the input + audio file. :rtype: AudioSegment - :rainse ValueError: If the position is error, or if the time is out bounds. + :raise ValueError: If start or end is incorrectly set, e.g. out of + bounds in time. """ sndfile = soundfile.SoundFile(file) sample_rate = sndfile.samplerate @@ -184,40 +161,60 @@ class AudioSegment(object): end += duration if start < 0.0: raise ValueError("The slice start position (%f s) is out of " - "bounds. Filename: %s" % (start, file)) + "bounds." % start) if end < 0.0: - raise ValueError("The slice end position (%f s) is out of bounds " - "Filename: %s" % (end, file)) + raise ValueError("The slice end position (%f s) is out of bounds." % + end) if start > end: raise ValueError("The slice start position (%f s) is later than " "the slice end position (%f s)." 
% (start, end)) if end > duration: - raise ValueError("The slice end time (%f s) is out of bounds " - "(> %f s) Filename: %s" % (end, duration, file)) + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end, duration)) start_frame = int(start * sample_rate) end_frame = int(end * sample_rate) sndfile.seek(start_frame) data = sndfile.read(frames=end_frame - start_frame, dtype='float32') - return type(self)(data, sample_rate) + return cls(data, sample_rate) - def make_silence(self, duration, sample_rate): + @classmethod + def make_silence(cls, duration, sample_rate): """Creates a silent audio segment of the given duration and sample rate. - :param duration: Length of silence in seconds + :param duration: Length of silence in seconds. :type duration: float - :param sample_rate: Sample rate + :param sample_rate: Sample rate. :type sample_rate: float - :return: Silence of the given duration + :return: Silent AudioSegment instance of the given duration. :rtype: AudioSegment """ samples = np.zeros(int(duration * sample_rate)) - return type(self)(samples, sample_rate) + return cls(samples, sample_rate) + + def superimposed(self, other): + """Add samples from another segment to those of this segment + (sample-wise addition, not segment concatenation). + + :param other: Segment containing samples to be added in. + :type other: AudioSegments + :raise TypeError: If type of two segments don't match. + :raise ValueError: If the sample_rate of two segments not equal, or if + the length of segments don't match. + """ + if type(self) != type(other): + raise TypeError("Cannot add segments of different types: %s " + "and %s." % (type(self), type(other))) + if self._sample_rate != other._sample_rate: + raise ValueError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise ValueError("Segment lengths must match to add segments.") + self._samples += other._samples def to_bytes(self, dtype='float32'): """Create a byte string containing the audio content. - :param dtype: Data type for export samples. Options: 'int16', 'int32', + :param dtype: Data type for export samples. Options: 'int16','int32', 'float32', 'float64'. Default is 'float32'. :type dtype: str :return: Byte string containing audio content. @@ -258,16 +255,17 @@ class AudioSegment(object): self._samples = np.interp(new_indices, old_indices, self._samples) def normalize(self, target_db=-20, max_gain_db=300.0): - """Normalize audio to be desired RMS value in decibels. + """Normalize audio to be of the desired RMS value in decibels. Note that this is an in-place transformation. - :param target_db: Target RMS value in decibels. This value should - be less than 0.0 as 0.0 is full-scale audio. + :param target_db: Target RMS value in decibels. This value should be + less than 0.0 as 0.0 is full-scale audio. :type target_db: float :param max_gain_db: Max amount of gain in dB that can be applied for - normalization. This is to prevent nans when attempting - to normalize a signal consisting of all zeros. + normalization. This is to prevent nans when + attempting to normalize a signal consisting of + all zeros. :type max_gain_db: float :raises ValueError: If the required gain to normalize the segment to the target_db value exceeds max_gain_db. 
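The arithmetic that `normalize` guards is small enough to verify by hand: the required gain is just the dB gap between the target and the current RMS, and `max_gain_db` exists because an all-zero signal has an RMS of minus infinity dB. A quick standalone check:

```
import numpy as np


def required_gain_db(samples, target_db=-20.0):
    """Gain needed to move a segment's RMS onto target_db."""
    rms_db = 10 * np.log10(np.mean(samples ** 2))
    return target_db - rms_db


x = 0.05 * np.ones(100)               # a quiet signal at about -26 dB RMS
y = x * 10.0 ** (required_gain_db(x) / 20.0)
assert abs(10 * np.log10(np.mean(y ** 2)) - (-20.0)) < 1e-6
```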
@@ -275,9 +273,9 @@ class AudioSegment(object): gain = target_db - self.rms_db if gain > max_gain_db: raise ValueError( - "Unable to normalize segment to %f dB because it has an RMS " - "value of %f dB and the difference exceeds max_gain_db (%f dB)" - % (target_db, self.rms_db, max_gain_db)) + "Unable to normalize segment to %f dB because the " + "the probable gain have exceeds max_gain_db (%f dB)" % + (target_db, max_gain_db)) self.apply_gain(min(max_gain_db, target_db - self.rms_db)) def normalize_online_bayesian(self, @@ -285,30 +283,30 @@ class AudioSegment(object): prior_db, prior_samples, startup_delay=0.0): - """Normalize audio using a production-compatible online/causal algorithm. - This uses an exponential likelihood and gamma prior to make online estimates - of the RMS even when there are very few samples. + """Normalize audio using a production-compatible online/causal + algorithm. This uses an exponential likelihood and gamma prior to + make online estimates of the RMS even when there are very few samples. Note that this is an in-place transformation. - :param target_db: Target RMS value in decibels + :param target_db: Target RMS value in decibels. :type target_bd: float - :param prior_db: Prior RMS estimate in decibels + :param prior_db: Prior RMS estimate in decibels. :type prior_db: float - :param prior_samples: Prior strength in number of samples + :param prior_samples: Prior strength in number of samples. :type prior_samples: float - :param startup_delay: Default 0.0 s. If provided, this function will accrue - statistics for the first startup_delay seconds before - applying online normalization. + :param startup_delay: Default 0.0 s. If provided, this function will + accrue statistics for the first startup_delay + seconds before applying online normalization. :type startup_delay: float """ - # Estimate total RMS online + # Estimate total RMS online. startup_sample_idx = min(self.num_samples - 1, int(self.sample_rate * startup_delay)) prior_mean_squared = 10.**(prior_db / 10.) prior_sum_of_squares = prior_mean_squared * prior_samples cumsum_of_squares = np.cumsum(self.samples**2) - sample_count = np.arange(len(self)) + 1 + sample_count = np.arange(len(self.num_samples)) + 1 if startup_sample_idx > 0: cumsum_of_squares[:startup_sample_idx] = \ cumsum_of_squares[startup_sample_idx] @@ -317,42 +315,40 @@ class AudioSegment(object): mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) / (sample_count + prior_samples)) rms_estimate_db = 10 * np.log10(mean_squared_estimate) - # Compute required time-varying gain + # Compute required time-varying gain. gain_db = target_db - rms_estimate_db self.apply_gain(gain_db) def resample(self, target_sample_rate, quality='sinc_medium'): - """Resample audio segment. This resamples the audio to a new - sample rate. + """Resample the audio to a target sample rate. Note that this is an in-place transformation. - :param target_sample_rate: Target sample rate + :param target_sample_rate: Target sample rate. :type target_sample_rate: int :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}. Sets resampling speed/quality tradeoff. 
See http://www.mega-nerd.com/SRC/api_misc.html#Converters - :type quality: basestring + :type quality: str """ resample_ratio = target_sample_rate / self._sample_rate - new_samples = scikits.samplerate.resample( + self._samples = scikits.samplerate.resample( self._samples, r=resample_ratio, type=quality) - self._samples = new_samples self._sample_rate = target_sample_rate def pad_silence(self, duration, sides='both'): - """Pads this audio sample with a period of silence. + """Pad this audio sample with a period of silence. Note that this is an in-place transformation. - :param duration: Length of silence in seconds to pad + :param duration: Length of silence in seconds to pad. :type duration: float - :param sides: Position for padding - 'beginning' - adds silence in the beginning - 'end' - adds silence in the end + :param sides: Position for padding: + 'beginning' - adds silence in the beginning; + 'end' - adds silence in the end; 'both' - adds silence in both the beginning and the end. :type sides: str - :raises ValueError: If the sides not surport + :raises ValueError: If sides is not supported. """ if duration == 0.0: return self @@ -367,51 +363,41 @@ class AudioSegment(object): else: raise ValueError("Unknown value for the kwarg %s" % sides) self._samples = padded._samples - self._sample_rate = padded._sample_rate def subsegment(self, start_sec=None, end_sec=None): """Return new AudioSegment containing audio between given boundaries. - :param start_sec: Beginning of subsegment in seconds, - (beginning of segment if None). + :param start_sec: Beginning of subsegment in seconds. :type start_sec: float - :param end_sec: End of subsegment in seconds, - (end of segment if None). + :param end_sec: End of subsegment in seconds. :type end_sec: float - :return: New AudioSegment containing specified subsegment. - :rtype: AudioSegment """ start_sec = 0.0 if start_sec is None else start_sec end_sec = self.duration if end_sec is None else end_sec - # negative boundaries are relative to end of segment if start_sec < 0.0: start_sec = self.duration + start_sec if end_sec < 0.0: end_sec = self.duration + end_sec start_sample = int(round(start_sec * self._sample_rate)) end_sample = int(round(end_sec * self._sample_rate)) - samples = self._samples[start_sample:end_sample] - return type(self)(samples, sample_rate=self._sample_rate) + self._samples = self._samples[start_sample:end_sample] def random_subsegment(self, subsegment_length, rng=None): """Return a random subsegment of a specified length in seconds. :param subsegment_length: Subsegment length in seconds. :type subsegment_length: float - :param rng: Random number generator state + :param rng: Random number generator state. :type rng: random.Random - :return: New AudioSegment containing random subsegment - of original segment - :rtype: AudioSegment - :raises ValueError: If the length of subsegment greater than origineal - segemnt. + :raises ValueError: If the length of subsegment greater than + origineal segemnt. """ rng = random.Random() if rng is None else rng if subsegment_length > self.duration: raise ValueError("Length of subsegment must not be greater " "than original segment.") start_time = rng.uniform(0.0, self.duration - subsegment_length) - return self.subsegment(start_time, start_time + subsegment_length) + self.subsegment(start_time, start_time + subsegment_length) def convolve(self, impulse_segment, allow_resample=False): """Convolve this audio segment with the given filter. 
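Since `subsegment` is now an in-place trim rather than a constructor of new segments, the caller keeps the same object and only `_samples` changes. The boundary arithmetic, including the negative wrap-around, in isolation:

```
duration, sample_rate = 10.0, 16000
start_sec, end_sec = -3.0, None           # request: the last three seconds
start_sec = duration + start_sec if start_sec < 0 else start_sec
end_sec = duration if end_sec is None else end_sec
start_sample = int(round(start_sec * sample_rate))   # 112000
end_sample = int(round(end_sec * sample_rate))       # 160000
assert end_sample - start_sample == 3 * sample_rate
```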
@@ -420,10 +406,10 @@ class AudioSegment(object): :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment - :param allow_resample: indicates whether resampling is allowed when - the impulse_segment has a different sample - rate from this signal. - :type allow_resample: boolean + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool :raises ValueError: If the sample rate is not match between two audio segments and resample is not allowed. """ @@ -443,9 +429,10 @@ class AudioSegment(object): :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment - :param allow_resample: indicates whether resampling is allowed when - the impulse_segment has a different sample rate from this signal. - :type allow_resample: boolean + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool """ target_db = self.rms_db self.convolve(impulse_segment, allow_resample=allow_resample) @@ -465,42 +452,36 @@ class AudioSegment(object): :type noise: AudioSegment :param snr_dB: Signal-to-Noise Ratio, in decibels. :type snr_dB: float - :param allow_downsampling: whether to allow the noise signal to be downsampled - to match the base signal sample rate. - :type allow_downsampling: boolean - :param max_gain_db: Maximum amount of gain to apply to noise signal before - adding it in. This is to prevent attempting to apply infinite - gain to a zero signal. + :param allow_downsampling: Whether to allow the noise signal to be + downsampled to match the base signal sample + rate. + :type allow_downsampling: bool + :param max_gain_db: Maximum amount of gain to apply to noise signal + before adding it in. This is to prevent attempting + to apply infinite gain to a zero signal. :type max_gain_db: float :param rng: Random number generator state. - :type rng: random.Random - :raises ValueError: If the sample rate does not match between the two audio segments - and resample is not allowed, or if the duration of noise segments - is shorter than original audio segments. + :type rng: None|random.Random + :raises ValueError: If the sample rate does not match between the two + audio segments and resample is not allowed, or if + the duration of noise segments is shorter than + original audio segments. """ rng = random.Random() if rng is None else rng if allow_downsampling and noise.sample_rate > self.sample_rate: noise = noise.resample(self.sample_rate) if noise.sample_rate != self.sample_rate: - raise ValueError("Noise sample rate (%d Hz) is not equal to " - "base signal sample rate (%d Hz)." % - (noise.sample_rate, self.sample_rate)) + raise ValueError("Noise sample rate (%d Hz) is not equal to base " + "signal sample rate (%d Hz)." % (noise.sample_rate, + self.sample_rate)) if noise.duration < self.duration: - raise ValueError("Noise signal (%f sec) must be at " - "least as long as base signal (%f sec)." % + raise ValueError("Noise signal (%f sec) must be at least as long as" + " base signal (%f sec)." 
% (noise.duration, self.duration)) - noise_gain_db = self.rms_db - noise.rms_db - snr_dB - noise_gain_db = min(max_gain_db, noise_gain_db) - noise_subsegment = noise.random_subsegment(self.duration, rng=rng) - output = self + self.tranform_noise(noise_subsegment, noise_gain_db) - self._samples = output._samples - self._sample_rate = output._sample_rate - - def tranform_noise(self, noise_subsegment, noise_gain_db): - """ tranform noise file - """ - return type(self)(noise_subsegment._samples * (10.**( - noise_gain_db / 20.)), noise_subsegment._sample_rate) + noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) + noise.random_subsegment(self.duration, rng=rng) + noise.apply_gain(noise_gain_db) + self.superimposed(noise) @property def samples(self): @@ -571,7 +552,7 @@ class AudioSegment(object): Audio sample type is usually integer or float-point. For integer type, float32 will be rescaled from [-1, 1] to the maximum range supported by the integer type. - + This is for writing a audio file. """ dtype = np.dtype(dtype) diff --git a/data_utils/speech.py b/data_utils/speech.py index 5d1fc15a..443df68c 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -67,20 +67,20 @@ class SpeechSegment(AudioSegment): @classmethod def concatenate(cls, *segments): - """Concatenate an arbitrary number of audio segments together. + """Concatenate an arbitrary number of speech segments together. - :param *segments: Input speech segments + :param *segments: Input speech segments. :type *segments: SpeechSegment :return: Speech segment instance. :rtype: SpeechSegment - :raises ValueError: If number of segments is zero, or if sample_rate - not match between two audio segments - :raises TypeError: If item of segments is not Audiosegment instance + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any two segments does not match. + :raises TypeError: If every segment in is not Audiosegment instance. """ - # Perform basic sanity-checks. if len(segments) == 0: raise ValueError("No audio segments are given to concatenate.") sample_rate = segments[0]._sample_rate + transcripts = "" for seg in segments: if sample_rate != seg._sample_rate: raise ValueError("Can't concatenate segments with " @@ -88,8 +88,49 @@ class SpeechSegment(AudioSegment): if type(seg) is not cls: raise TypeError("Only speech segments of the same type " "instance can be concatenated.") + transcripts += seg._transcript samples = np.concatenate([seg.samples for seg in segments]) - return cls(samples, sample_rate, seg._transcript) + return cls(samples, sample_rate, transcripts) + + @classmethod + def slice_from_file(cls, filepath, start=None, end=None, transcript=""): + """Loads a small section of an speech without having to load + the entire file into the memory which can be incredibly wasteful. + + :param filepath: Filepath or file object to audio file. + :type filepath: basestring|file + :param start: Start time in seconds. If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behvaior is + to read to the end of the file. + :type end: float + :param transcript: Transcript text for the speech. if not provided, + the defaults is an empty string. + :type transript: basestring + :return: SpeechSegment instance of the specified slice of the input + speech file. 
+ :rtype: SpeechSegment + """ + audio = Audiosegment.slice_from_file(filepath, start, end) + return cls(audio.samples, audio.sample_rate, transcripts) + + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent speech segment of the given duration and + sample rate. + + :param duration: Length of silence in seconds. + :type duration: float + :param sample_rate: Sample rate. + :type sample_rate: float + :return: Silence of the given duration. + :rtype: AudioSegment + """ + audio = AudioSegment.make_silence(duration, sample_rate) + return cls(audio.samples, audio.sample_rate, "") @property def transcript(self): From 26eb54eb37e0515f863243c133fe0a72bfd5c6af Mon Sep 17 00:00:00 2001 From: yangyaming Date: Sun, 18 Jun 2017 14:31:57 +0800 Subject: [PATCH 30/55] Follow comments. --- error_rate.py | 16 ++++++++------- tests/test_error_rate.py | 44 +++++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/error_rate.py b/error_rate.py index 2bb63711..08fe1255 100644 --- a/error_rate.py +++ b/error_rate.py @@ -1,9 +1,11 @@ # -*- coding: utf-8 -*- -""" - This module provides functions to calculate error rate in different level. - e.g. wer for word-level, cer for char-level. +"""This module provides functions to calculate error rate in different level. +e.g. wer for word-level, cer for char-level. """ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function import numpy as np @@ -42,8 +44,7 @@ def levenshtein_distance(ref, hyp): def wer(reference, hypothesis, ignore_case=False, delimiter=' '): - """ - Calculate word error rate (WER). WER compares reference text and + """Calculate word error rate (WER). WER compares reference text and hypothesis text in word-level. WER is defined as: .. math:: @@ -71,6 +72,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '): :type delimiter: char :return: Word error rate. :rtype: float + :raises ValueError: If reference length is zero. """ if ignore_case == True: reference = reference.lower() @@ -88,8 +90,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '): def cer(reference, hypothesis, ignore_case=False): - """ - Calculate charactor error rate (CER). CER compares reference text and + """Calculate charactor error rate (CER). CER compares reference text and hypothesis text in char-level. CER is defined as: .. math:: @@ -117,6 +118,7 @@ def cer(reference, hypothesis, ignore_case=False): :type ignore_case: bool :return: Character error rate. :rtype: float + :raises ValueError: If reference length is zero. 
""" if ignore_case == True: reference = reference.lower() diff --git a/tests/test_error_rate.py b/tests/test_error_rate.py index bb6dca30..57a6ccd6 100644 --- a/tests/test_error_rate.py +++ b/tests/test_error_rate.py @@ -1,29 +1,63 @@ # -*- coding: utf-8 -*- +"""Test error rate.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + import unittest -import sys -sys.path.append('..') import error_rate class TestParse(unittest.TestCase): - def test_wer(self): + def test_wer_1(self): ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night' hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last night' word_error_rate = error_rate.wer(ref, hyp) self.assertTrue(abs(word_error_rate - 0.769230769231) < 1e-6) - def test_cer_en(self): + def test_wer_2(self): + ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night' + word_error_rate = error_rate.wer(ref, ref) + self.assertEqual(word_error_rate, 0.0) + + def test_wer_3(self): + ref = ' ' + hyp = 'Hypothesis sentence' + try: + word_error_rate = error_rate.wer(ref, hyp) + except Exception as e: + self.assertTrue(isinstance(e, ValueError)) + + def test_cer_1(self): ref = 'werewolf' hyp = 'weae wolf' char_error_rate = error_rate.cer(ref, hyp) self.assertTrue(abs(char_error_rate - 0.25) < 1e-6) - def test_cer_zh(self): + def test_cer_2(self): + ref = 'werewolf' + char_error_rate = error_rate.cer(ref, ref) + self.assertEqual(char_error_rate, 0.0) + + def test_cer_3(self): ref = u'我是中国人' hyp = u'我是 美洲人' char_error_rate = error_rate.cer(ref, hyp) self.assertTrue(abs(char_error_rate - 0.6) < 1e-6) + def test_cer_4(self): + ref = u'我是中国人' + char_error_rate = error_rate.cer(ref, ref) + self.assertFalse(char_error_rate, 0.0) + + def test_cer_5(self): + ref = '' + hyp = 'Hypothesis' + try: + char_error_rate = error_rate.cer(ref, hyp) + except Exception as e: + self.assertTrue(isinstance(e, ValueError)) + if __name__ == '__main__': unittest.main() From b8341da63dfa2baccff73c197e0e3dae336ef4de Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sun, 18 Jun 2017 16:23:30 +0800 Subject: [PATCH 31/55] add audio augmentation --- data_utils/audio.py | 3 ++- data_utils/speech.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 1f75da8a..3c671b69 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -88,7 +88,8 @@ class AudioSegment(object): :rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every segment in is not Audiosegment instance. + :raises TypeError: If every item in segments is not Audiosegment + instance. """ # Perform basic sanity-checks. if len(segments) == 0: diff --git a/data_utils/speech.py b/data_utils/speech.py index 443df68c..66f22b24 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -75,7 +75,8 @@ class SpeechSegment(AudioSegment): :rtype: SpeechSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every segment in is not Audiosegment instance. + :raises TypeError: If every item in segments is not Audiosegment + instance. 
""" if len(segments) == 0: raise ValueError("No audio segments are given to concatenate.") From 107f8b89ae5f961748b89dfe1153cf4ef0288c6b Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sun, 18 Jun 2017 16:47:09 +0800 Subject: [PATCH 32/55] add audio augmentation --- data_utils/audio.py | 6 +++--- data_utils/speech.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 3c671b69..1ad20bf3 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -88,7 +88,7 @@ class AudioSegment(object): :rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every item in segments is not Audiosegment + :raises TypeError: If every item in segments is not AudioSegment instance. """ # Perform basic sanity-checks. @@ -296,7 +296,7 @@ class AudioSegment(object): :type prior_db: float :param prior_samples: Prior strength in number of samples. :type prior_samples: float - :param startup_delay: Default 0.0 s. If provided, this function will + :param startup_delay: Default 0.0s. If provided, this function will accrue statistics for the first startup_delay seconds before applying online normalization. :type startup_delay: float @@ -401,7 +401,7 @@ class AudioSegment(object): self.subsegment(start_time, start_time + subsegment_length) def convolve(self, impulse_segment, allow_resample=False): - """Convolve this audio segment with the given filter. + """Convolve this audio segment with the given impulse_segment. Note that this is an in-place transformation. diff --git a/data_utils/speech.py b/data_utils/speech.py index 66f22b24..94ead1e8 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -75,11 +75,11 @@ class SpeechSegment(AudioSegment): :rtype: SpeechSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every item in segments is not Audiosegment + :raises TypeError: If every item in segments is not SpeechSegment instance. """ if len(segments) == 0: - raise ValueError("No audio segments are given to concatenate.") + raise ValueError("No speech segments are given to concatenate.") sample_rate = segments[0]._sample_rate transcripts = "" for seg in segments: @@ -116,7 +116,7 @@ class SpeechSegment(AudioSegment): :rtype: SpeechSegment """ audio = Audiosegment.slice_from_file(filepath, start, end) - return cls(audio.samples, audio.sample_rate, transcripts) + return cls(audio.samples, audio.sample_rate, transcript) @classmethod def make_silence(cls, duration, sample_rate): @@ -128,7 +128,7 @@ class SpeechSegment(AudioSegment): :param sample_rate: Sample rate. :type sample_rate: float :return: Silence of the given duration. - :rtype: AudioSegment + :rtype: SpeechSegment """ audio = AudioSegment.make_silence(duration, sample_rate) return cls(audio.samples, audio.sample_rate, "") From 21161b01653b98ea18903ff5bee07a127eee643d Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sun, 18 Jun 2017 17:11:58 +0800 Subject: [PATCH 33/55] add audio file --- data_utils/audio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 1ad20bf3..fd1f93df 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -87,7 +87,7 @@ class AudioSegment(object): :return: Audio segment instance as concatenating results. 
:rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the - sample_rate of any two segments does not match. + sample_rate of any two segment does not match. :raises TypeError: If every item in segments is not AudioSegment instance. """ @@ -412,7 +412,7 @@ class AudioSegment(object): rate from this signal. :type allow_resample: bool :raises ValueError: If the sample rate is not match between two - audio segments and resample is not allowed. + audio segments when resample is not allowed. """ if allow_resample and self.sample_rate != impulse_segment.sample_rate: impulse_segment = impulse_segment.resample(self.sample_rate) @@ -464,8 +464,8 @@ class AudioSegment(object): :param rng: Random number generator state. :type rng: None|random.Random :raises ValueError: If the sample rate does not match between the two - audio segments and resample is not allowed, or if - the duration of noise segments is shorter than + audio segments when downsampling is not allowed, or + if the duration of noise segments is shorter than original audio segments. """ rng = random.Random() if rng is None else rng From 25ce7ebe7b1029e823a9cdb758e808f6a0e0995e Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sun, 18 Jun 2017 18:22:48 +0800 Subject: [PATCH 34/55] add audio file --- data_utils/audio.py | 4 ++-- data_utils/speech.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index fd1f93df..37f4f0ba 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -82,8 +82,8 @@ class AudioSegment(object): def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. - :param *segments: Input audio segments. - :type *segments: AudioSegment + :param *segments: Input audio segments to be concatenated. + :type *segments: tuple of AudioSegment :return: Audio segment instance as concatenating results. :rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the diff --git a/data_utils/speech.py b/data_utils/speech.py index 94ead1e8..00190009 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -69,8 +69,8 @@ class SpeechSegment(AudioSegment): def concatenate(cls, *segments): """Concatenate an arbitrary number of speech segments together. - :param *segments: Input speech segments. - :type *segments: SpeechSegment + :param *segments: Input speech segments to be concatenated. + :type *segments: tuple of SpeechSegment :return: Speech segment instance. :rtype: SpeechSegment :raises ValueError: If the number of segments is zero, or if the From ddb2bdc1906223733dd5b1a2ad15a54492681f5b Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Mon, 19 Jun 2017 00:08:05 +0800 Subject: [PATCH 35/55] add audio file --- data_utils/audio.py | 64 ++++++++++++++++++++++++++++++-------------- data_utils/speech.py | 10 +++---- 2 files changed, 49 insertions(+), 25 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 37f4f0ba..5d02feb6 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -9,6 +9,7 @@ import soundfile import scikits.samplerate from scipy import signal import random +import copy class AudioSegment(object): @@ -87,9 +88,8 @@ class AudioSegment(object): :return: Audio segment instance as concatenating results. :rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the - sample_rate of any two segment does not match. 
- :raises TypeError: If every item in segments is not AudioSegment - instance. + sample_rate of any segments does not match. + :raises TypeError: If any segment is not AudioSegment instance. """ # Perform basic sanity-checks. if len(segments) == 0: @@ -101,7 +101,7 @@ class AudioSegment(object): "different sample rates") if type(seg) is not cls: raise TypeError("Only audio segments of the same type " - "instance can be concatenated.") + "can be concatenated.") samples = np.concatenate([seg.samples for seg in segments]) return cls(samples, sample_rate) @@ -180,8 +180,7 @@ class AudioSegment(object): @classmethod def make_silence(cls, duration, sample_rate): - """Creates a silent audio segment of the given duration and - sample rate. + """Creates a silent audio segment of the given duration and sample rate. :param duration: Length of silence in seconds. :type duration: float @@ -193,15 +192,17 @@ class AudioSegment(object): samples = np.zeros(int(duration * sample_rate)) return cls(samples, sample_rate) - def superimposed(self, other): + def superimpose(self, other): """Add samples from another segment to those of this segment (sample-wise addition, not segment concatenation). + Note that this is an in-place transformation. + :param other: Segment containing samples to be added in. :type other: AudioSegments :raise TypeError: If type of two segments don't match. - :raise ValueError: If the sample_rate of two segments not equal, or if - the length of segments don't match. + :raise ValueError: If the sample rates of the two segments are not + equal, or if the lengths of segments don't match. """ if type(self) != type(other): raise TypeError("Cannot add segments of different types: %s " @@ -215,7 +216,7 @@ class AudioSegment(object): def to_bytes(self, dtype='float32'): """Create a byte string containing the audio content. - :param dtype: Data type for export samples. Options: 'int16','int32', + :param dtype: Data type for export samples. Options: 'int16', 'int32', 'float32', 'float64'. Default is 'float32'. :type dtype: str :return: Byte string containing audio content. @@ -362,16 +363,20 @@ class AudioSegment(object): elif sides == "both": padded = cls.concatenate(silence, self, silence) else: - raise ValueError("Unknown value for the kwarg %s" % sides) + raise ValueError("Unknown value for the sides %s" % sides) self._samples = padded._samples def subsegment(self, start_sec=None, end_sec=None): - """Return new AudioSegment containing audio between given boundaries. + """Cut the AudioSegment between given boundaries. + + Note that this is an in-place transformation. :param start_sec: Beginning of subsegment in seconds. :type start_sec: float :param end_sec: End of subsegment in seconds. :type end_sec: float + :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out + of bounds in time. """ start_sec = 0.0 if start_sec is None else start_sec end_sec = self.duration if end_sec is None else end_sec @@ -379,19 +384,33 @@ class AudioSegment(object): start_sec = self.duration + start_sec if end_sec < 0.0: end_sec = self.duration + end_sec + if start_sec < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds." % start_sec) + if end_sec < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds." % + end_sec) + if start_sec > end_sec: + raise ValueError("The slice start position (%f s) is later than " + "the end position (%f s)." 
% (start_sec, end_sec)) + if end_sec > self.duration: + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end_sec, self.duration)) start_sample = int(round(start_sec * self._sample_rate)) end_sample = int(round(end_sec * self._sample_rate)) self._samples = self._samples[start_sample:end_sample] def random_subsegment(self, subsegment_length, rng=None): - """Return a random subsegment of a specified length in seconds. + """Cut the specified length of the audiosegment randomly. + + Note that this is an in-place transformation. :param subsegment_length: Subsegment length in seconds. :type subsegment_length: float :param rng: Random number generator state. :type rng: random.Random - :raises ValueError: If the length of subsegment greater than - origineal segemnt. + :raises ValueError: If the length of subsegment is greater than + the origineal segemnt. """ rng = random.Random() if rng is None else rng if subsegment_length > self.duration: @@ -401,7 +420,7 @@ class AudioSegment(object): self.subsegment(start_time, start_time + subsegment_length) def convolve(self, impulse_segment, allow_resample=False): - """Convolve this audio segment with the given impulse_segment. + """Convolve this audio segment with the given impulse segment. Note that this is an in-place transformation. @@ -428,6 +447,8 @@ class AudioSegment(object): """Convolve and normalize the resulting audio segment so that it has the same average power as the input signal. + Note that this is an in-place transformation. + :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment :param allow_resample: Indicates whether resampling is allowed when @@ -445,10 +466,12 @@ class AudioSegment(object): allow_downsampling=False, max_gain_db=300.0, rng=None): - """Adds the given noise segment at a specific signal-to-noise ratio. + """Add the given noise segment at a specific signal-to-noise ratio. If the noise segment is longer than this segment, a random subsegment of matching length is sampled from it and used instead. + Note that this is an in-place transformation. + :param noise: Noise signal to add. :type noise: AudioSegment :param snr_dB: Signal-to-Noise Ratio, in decibels. @@ -480,9 +503,10 @@ class AudioSegment(object): " base signal (%f sec)." % (noise.duration, self.duration)) noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) - noise.random_subsegment(self.duration, rng=rng) - noise.apply_gain(noise_gain_db) - self.superimposed(noise) + noise_new = copy.deepcopy(noise) + noise_new.random_subsegment(self.duration, rng=rng) + noise_new.apply_gain(noise_gain_db) + self.superimpose(noise_new) @property def samples(self): diff --git a/data_utils/speech.py b/data_utils/speech.py index 00190009..fc031ff4 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -67,7 +67,8 @@ class SpeechSegment(AudioSegment): @classmethod def concatenate(cls, *segments): - """Concatenate an arbitrary number of speech segments together. + """Concatenate an arbitrary number of speech segments together, both + audio and transcript will be concatenated. :param *segments: Input speech segments to be concatenated. :type *segments: tuple of SpeechSegment @@ -75,8 +76,7 @@ class SpeechSegment(AudioSegment): :rtype: SpeechSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every item in segments is not SpeechSegment - instance. 
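The `copy.deepcopy` in the reworked `add_noise` matters because `random_subsegment` and `apply_gain` are now in-place: without the copy, mixing would destructively trim and rescale the caller's noise segment, which is typically reused across utterances. In miniature:

```
import copy

import numpy as np

noise = np.array([0.1, 0.2, 0.3, 0.4])
working = copy.deepcopy(noise)
working *= 0.5                                   # in-place, like apply_gain()
assert np.allclose(noise, [0.1, 0.2, 0.3, 0.4])  # caller's array untouched
```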
+        :raises TypeError: If any segment is not SpeechSegment instance.
         """
         if len(segments) == 0:
             raise ValueError("No speech segments are given to concatenate.")
@@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment):
         return cls(samples, sample_rate, transcripts)

     @classmethod
-    def slice_from_file(cls, filepath, start=None, end=None, transcript=""):
+    def slice_from_file(cls, filepath, start=None, end=None, transcript):
         """Loads a small section of an speech without having to load
         the entire file into the memory which can be incredibly wasteful.

@@ -121,7 +121,7 @@ class SpeechSegment(AudioSegment):
     @classmethod
     def make_silence(cls, duration, sample_rate):
         """Creates a silent speech segment of the given duration and
-        sample rate.
+        sample rate; the transcript will be an empty string.

         :param duration: Length of silence in seconds.
         :type duration: float

From def66a32235f8e2942ddaf9c60ebed5cb52b6bf9 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Mon, 19 Jun 2017 11:31:34 +0800
Subject: [PATCH 36/55] Follow comments.

---
 error_rate.py            | 18 ++++++++++++------
 tests/test_error_rate.py |  8 ++------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/error_rate.py b/error_rate.py
index 08fe1255..0cf17921 100644
--- a/error_rate.py
+++ b/error_rate.py
@@ -2,14 +2,20 @@
 """This module provides functions to calculate error rate in different level.
 e.g. wer for word-level, cer for char-level.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
 import numpy as np


-def levenshtein_distance(ref, hyp):
+def _levenshtein_distance(ref, hyp):
+    """Levenshtein distance is a string metric for measuring the difference
+    between two sequences. Informally, it is the minimum number of
+    single-character edits (substitutions, insertions or deletions) required
+    to change one word into the other. The edits extend naturally to the
+    word level when computing the distance between two sentences.
+    """
     ref_len = len(ref)
     hyp_len = len(hyp)

@@ -72,7 +78,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
     :type delimiter: char
     :return: Word error rate.
     :rtype: float
-    :raises ValueError: If reference length is zero.
+    :raises ValueError: If the reference length is zero.
     """
     if ignore_case == True:
         reference = reference.lower()

@@ -84,7 +90,7 @@
     if len(ref_words) == 0:
         raise ValueError("Reference's word number should be greater than 0.")

-    edit_distance = levenshtein_distance(ref_words, hyp_words)
+    edit_distance = _levenshtein_distance(ref_words, hyp_words)
     wer = float(edit_distance) / len(ref_words)
     return wer

@@ -118,7 +124,7 @@ def cer(reference, hypothesis, ignore_case=False):
     :type ignore_case: bool
     :return: Character error rate.
     :rtype: float
-    :raises ValueError: If reference length is zero.
+    :raises ValueError: If the reference length is zero.
""" if ignore_case == True: reference = reference.lower() @@ -130,6 +136,6 @@ def cer(reference, hypothesis, ignore_case=False): if len(reference) == 0: raise ValueError("Length of reference should be greater than 0.") - edit_distance = levenshtein_distance(reference, hypothesis) + edit_distance = _levenshtein_distance(reference, hypothesis) cer = float(edit_distance) / len(reference) return cer diff --git a/tests/test_error_rate.py b/tests/test_error_rate.py index 57a6ccd6..be7313f3 100644 --- a/tests/test_error_rate.py +++ b/tests/test_error_rate.py @@ -23,10 +23,8 @@ class TestParse(unittest.TestCase): def test_wer_3(self): ref = ' ' hyp = 'Hypothesis sentence' - try: + with self.assertRaises(ValueError): word_error_rate = error_rate.wer(ref, hyp) - except Exception as e: - self.assertTrue(isinstance(e, ValueError)) def test_cer_1(self): ref = 'werewolf' @@ -53,10 +51,8 @@ class TestParse(unittest.TestCase): def test_cer_5(self): ref = '' hyp = 'Hypothesis' - try: + with self.assertRaises(ValueError): char_error_rate = error_rate.cer(ref, hyp) - except Exception as e: - self.assertTrue(isinstance(e, ValueError)) if __name__ == '__main__': From d35b747175e36b44c369ef9ceb2b9fd7f9cbd9ec Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 19 Jun 2017 23:24:58 +0800 Subject: [PATCH 37/55] Fix ci following: 1. Unify the dependency installation process in setup.sh. 2. Change the version of package scipy from 0.13.0b1 to 0.13.1 --- requirements.txt | 3 +-- setup.sh | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 setup.sh diff --git a/requirements.txt b/requirements.txt index c37e88ff..0183ecf0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ SoundFile==0.9.0.post1 wget==3.2 -scikits.samplerate==0.3.3 -scipy==0.13.0b1 +scipy==0.13.1 diff --git a/setup.sh b/setup.sh new file mode 100644 index 00000000..c59ef82f --- /dev/null +++ b/setup.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# install python dependencies +if [ -f 'requirements.txt' ]; then + pip install -r requirements.txt +fi + +if [ $? != 0 ]; then + exit 1 +fi + +# install scikits.samplerate +curl -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz" +if [ $? != 0 ]; then + echo "Download libsamplerate-0.1.9.tar.gz failed !!!" + exit 1 +fi +tar -xvf libsamplerate-0.1.9.tar.gz +cd libsamplerate-0.1.9 +./configure && make && make install +cd - +rm -rf libsamplerate-0.1.9 +rm libsamplerate-0.1.9.tar.gz +pip install scikits.samplerate==0.3.3 +if [ $? != 0 ]; then + echo "Install sckikits.samplerate failed !!!" + exit 1 +fi From a5dcd23bf2c44ac261882c89649e7c296ef936b7 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 19 Jun 2017 23:46:44 +0800 Subject: [PATCH 38/55] Follow comments. --- README.md | 2 +- setup.sh | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 23e0b412..0cdb203d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory. ``` -pip install -r requirements.txt +sh setup.sh export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH ``` diff --git a/setup.sh b/setup.sh index c59ef82f..1ae2a5ee 100644 --- a/setup.sh +++ b/setup.sh @@ -4,8 +4,8 @@ if [ -f 'requirements.txt' ]; then pip install -r requirements.txt fi - if [ $? != 0 ]; then + echo "Install python dependencies failed !!!" 
exit 1 fi @@ -23,6 +23,8 @@ rm -rf libsamplerate-0.1.9 rm libsamplerate-0.1.9.tar.gz pip install scikits.samplerate==0.3.3 if [ $? != 0 ]; then - echo "Install sckikits.samplerate failed !!!" + echo "Install scikits.samplerate failed !!!" exit 1 fi + +echo "Install all dependencies successfully." From 115a06bb3739715d75cdadc3b6bc813acd328c99 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Tue, 20 Jun 2017 16:24:03 +0800 Subject: [PATCH 39/55] add augmentor class --- data_utils/audio.py | 2 +- data_utils/augmentor/augmentation.py | 9 ++++ .../online_bayesian_normalization.py | 50 +++++++++++++++++++ data_utils/augmentor/resample.py | 30 +++++++++++ data_utils/augmentor/speed_perturb.py | 43 ++++++++++++++++ data_utils/augmentor/volume_perturb.py | 2 +- 6 files changed, 134 insertions(+), 2 deletions(-) mode change 100644 => 100755 data_utils/audio.py mode change 100644 => 100755 data_utils/augmentor/augmentation.py create mode 100755 data_utils/augmentor/online_bayesian_normalization.py create mode 100755 data_utils/augmentor/resample.py create mode 100755 data_utils/augmentor/speed_perturb.py mode change 100644 => 100755 data_utils/augmentor/volume_perturb.py diff --git a/data_utils/audio.py b/data_utils/audio.py old mode 100644 new mode 100755 index 5d02feb6..03e2d5e4 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -308,7 +308,7 @@ class AudioSegment(object): prior_mean_squared = 10.**(prior_db / 10.) prior_sum_of_squares = prior_mean_squared * prior_samples cumsum_of_squares = np.cumsum(self.samples**2) - sample_count = np.arange(len(self.num_samples)) + 1 + sample_count = np.arange(self.num_samples) + 1 if startup_sample_idx > 0: cumsum_of_squares[:startup_sample_idx] = \ cumsum_of_squares[startup_sample_idx] diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py old mode 100644 new mode 100755 index abe1a0ec..bfe7075e --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -6,6 +6,9 @@ from __future__ import print_function import json import random from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor +from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor +from data_utils.augmentor.resample import ResampleAugmentor +from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor class AugmentationPipeline(object): @@ -76,5 +79,11 @@ class AugmentationPipeline(object): """Return an augmentation model by the type name, and pass in params.""" if augmentor_type == "volume": return VolumePerturbAugmentor(self._rng, **params) + if augmentor_type == "speed": + return SpeedPerturbAugmentor(self._rng, **params) + if augmentor_type == "resample": + return ResampleAugmentor(self._rng, **params) + if augmentor_type == "baysian_normal": + return OnlineBayesianNormalizationAugmentor(self._rng, **params) else: raise ValueError("Unknown augmentor type [%s]." 
% augmentor_type) diff --git a/data_utils/augmentor/online_bayesian_normalization.py b/data_utils/augmentor/online_bayesian_normalization.py new file mode 100755 index 00000000..bb999912 --- /dev/null +++ b/data_utils/augmentor/online_bayesian_normalization.py @@ -0,0 +1,50 @@ +"""Contain the online bayesian normalization augmentation model.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from data_utils.augmentor.base import AugmentorBase + + +class OnlineBayesianNormalizationAugmentor(AugmentorBase): + """Augmentation model for adding online bayesian normalization. + + :param rng: Random generator object. + :type rng: random.Random + :param target_db: Target RMS value in decibels. + :type target_db: float + :param prior_db: Prior RMS estimate in decibels. + :type prior_db: float + :param prior_samples: Prior strength in number of samples. + :type prior_samples: int + :param startup_delay: Default 0.0s. If provided, this function will + accrue statistics for the first startup_delay + seconds before applying online normalization. + :type starup_delay: float. + """ + + def __init__(self, + rng, + target_db, + prior_db, + prior_samples, + startup_delay=0.0): + self._target_db = target_db + self._prior_db = prior_db + self._prior_samples = prior_samples + self._startup_delay = startup_delay + self._rng = rng + self._startup_delay=startup_delay + + def transform_audio(self, audio_segment): + """Normalizes the input audio using the online Bayesian approach. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegment|SpeechSegment + """ + audio_segment.normalize_online_bayesian(self._target_db, + self._prior_db, + self._prior_samples, + self._startup_delay) diff --git a/data_utils/augmentor/resample.py b/data_utils/augmentor/resample.py new file mode 100755 index 00000000..88ef7ed0 --- /dev/null +++ b/data_utils/augmentor/resample.py @@ -0,0 +1,30 @@ +"""Contain the resample augmentation model.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from data_utils.augmentor.base import AugmentorBase + + +class ResampleAugmentor(AugmentorBase): + """Augmentation model for resampling. + + :param rng: Random generator object. + :type rng: random.Random + :param new_sample_rate: New sample rate in Hz + :type new_sample_rate: int + """ + + def __init__(self, rng, new_sample_rate): + self._new_sample_rate = new_sample_rate + self._rng = rng + + def transform_audio(self, audio_segment): + """Resamples the input audio to a target sample rate. + + Note that this is an in-place transformation. + + :param audio: Audio segment to add effects to. + :type audio: AudioSegment|SpeechSegment + """ + audio_segment.resample(self._new_sample_rate) \ No newline at end of file diff --git a/data_utils/augmentor/speed_perturb.py b/data_utils/augmentor/speed_perturb.py new file mode 100755 index 00000000..67de344c --- /dev/null +++ b/data_utils/augmentor/speed_perturb.py @@ -0,0 +1,43 @@ +"""Contain the speech perturbation augmentation model.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from data_utils.augmentor.base import AugmentorBase + + +class SpeedPerturbAugmentor(AugmentorBase): + """Augmentation model for adding speed perturbation. 
+ + See reference paper here: + http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf + + :param rng: Random generator object. + :type rng: random.Random + :param min_speed_rate: Lower bound of new speed rate to sample. + :type min_speed_rate: float + :param max_speed_rate: Upper bound of new speed rate to sample. + :type max_speed_rate: float + """ + + def __init__(self, rng, min_speed_rate, max_speed_rate): + + if (min_speed_rate < 0.5): + raise ValueError("Sampling speed below 0.9 can cause unnatural effects") + if (max_speed_rate > 1.5): + raise ValueError("Sampling speed above 1.1 can cause unnatural effects") + self._min_speed_rate = min_speed_rate + self._max_speed_rate = max_speed_rate + self._rng = rng + + def transform_audio(self, audio_segment): + """Sample a new speed rate from the given range and + changes the speed of the given audio clip. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegment|SpeechSegment + """ + sampled_speed = self._rng.uniform(self._min_speed_rate, self._max_speed_rate) + audio_segment.change_speed(sampled_speed) diff --git a/data_utils/augmentor/volume_perturb.py b/data_utils/augmentor/volume_perturb.py old mode 100644 new mode 100755 index a5a9f6ca..62631fb0 --- a/data_utils/augmentor/volume_perturb.py +++ b/data_utils/augmentor/volume_perturb.py @@ -36,5 +36,5 @@ class VolumePerturbAugmentor(AugmentorBase): :param audio_segment: Audio segment to add effects to. :type audio_segment: AudioSegmenet|SpeechSegment """ - gain = self._rng.uniform(min_gain_dBFS, max_gain_dBFS) + gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS) audio_segment.apply_gain(gain) From 71283d619da6fe0b11d26fde2c701118b55fc25a Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Tue, 20 Jun 2017 16:33:28 +0800 Subject: [PATCH 40/55] add augmentor class --- data_utils/augmentor/resample.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_utils/augmentor/resample.py b/data_utils/augmentor/resample.py index 88ef7ed0..6634bbd5 100755 --- a/data_utils/augmentor/resample.py +++ b/data_utils/augmentor/resample.py @@ -11,7 +11,7 @@ class ResampleAugmentor(AugmentorBase): :param rng: Random generator object. :type rng: random.Random - :param new_sample_rate: New sample rate in Hz + :param new_sample_rate: New sample rate in Hz. :type new_sample_rate: int """ @@ -27,4 +27,4 @@ class ResampleAugmentor(AugmentorBase): :param audio: Audio segment to add effects to. :type audio: AudioSegment|SpeechSegment """ - audio_segment.resample(self._new_sample_rate) \ No newline at end of file + audio_segment.resample(self._new_sample_rate) From 1d8cc4a5a9bfd9eff50a9a971411333e9050ff83 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 20 Jun 2017 17:06:53 +0800 Subject: [PATCH 41/55] Add multi-threading support for DS2 data generator. --- data_utils/data.py | 14 +++++++++++--- data_utils/speech.py | 2 +- infer.py | 8 +++++++- train.py | 22 +++++++++++++++++++++- 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/data_utils/data.py b/data_utils/data.py index 424343a4..8391dacc 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -44,6 +44,8 @@ class DataGenerator(object): :types max_freq: None|float :param specgram_type: Specgram feature type. Options: 'linear'. :type specgram_type: str + :param num_threads: Number of CPU threads for processing data. + :type num_threads: int :param random_seed: Random seed. 
:type random_seed: int """ @@ -58,6 +60,7 @@ class DataGenerator(object): window_ms=20.0, max_freq=None, specgram_type='linear', + num_threads=12, random_seed=0): self._max_duration = max_duration self._min_duration = min_duration @@ -70,6 +73,7 @@ class DataGenerator(object): stride_ms=stride_ms, window_ms=window_ms, max_freq=max_freq) + self._num_threads = num_threads self._rng = random.Random(random_seed) self._epoch = 0 @@ -207,10 +211,14 @@ class DataGenerator(object): def reader(): for instance in manifest: - yield self._process_utterance(instance["audio_filepath"], - instance["text"]) + yield instance - return reader + def mapper(instance): + return self._process_utterance(instance["audio_filepath"], + instance["text"]) + + return paddle.reader.xmap_readers( + mapper, reader, self._num_threads, 1024, order=True) def _padding_batch(self, batch, padding_to=-1, flatten=False): """ diff --git a/data_utils/speech.py b/data_utils/speech.py index fc031ff4..568e4443 100644 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment): return cls(samples, sample_rate, transcripts) @classmethod - def slice_from_file(cls, filepath, start=None, end=None, transcript): + def slice_from_file(cls, filepath, transcript, start=None, end=None): """Loads a small section of an speech without having to load the entire file into the memory which can be incredibly wasteful. diff --git a/infer.py b/infer.py index 06449ab0..7fc84829 100644 --- a/infer.py +++ b/infer.py @@ -38,6 +38,11 @@ parser.add_argument( default=True, type=distutils.util.strtobool, help="Use gpu or not. (default: %(default)s)") +parser.add_argument( + "--num_threads_data", + default=12, + type=int, + help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( "--mean_std_filepath", default='mean_std.npz', @@ -67,7 +72,8 @@ def infer(): data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, - augmentation_config='{}') + augmentation_config='{}', + num_threads=args.num_threads_data) # create network config # paddle.data_type.dense_array is used for variable batch input. diff --git a/train.py b/train.py index c60a039b..2c3b8ce7 100644 --- a/train.py +++ b/train.py @@ -52,6 +52,18 @@ parser.add_argument( default=True, type=distutils.util.strtobool, help="Use sortagrad or not. (default: %(default)s)") +parser.add_argument( + "--max_duration", + default=100.0, + type=float, + help="Audios with duration larger than this will be discarded. " + "(default: %(default)s)") +parser.add_argument( + "--min_duration", + default=0.0, + type=float, + help="Audios with duration smaller than this will be discarded. " + "(default: %(default)s)") parser.add_argument( "--shuffle_method", default='instance_shuffle', @@ -63,6 +75,11 @@ parser.add_argument( default=4, type=int, help="Trainer number. (default: %(default)s)") +parser.add_argument( + "--num_threads_data", + default=12, + type=int, + help="Number of cpu threads for preprocessing data. 
(default: %(default)s)") parser.add_argument( "--mean_std_filepath", default='mean_std.npz', @@ -107,7 +124,10 @@ def train(): return DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, - augmentation_config=args.augmentation_config) + augmentation_config=args.augmentation_config, + max_duration=args.max_duration, + min_duration=args.min_duration, + num_threads=args.num_threads_data) train_generator = data_generator() test_generator = data_generator() From d104eccf6784585aa54d931b95db9364cac7744e Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 20 Jun 2017 18:13:46 +0800 Subject: [PATCH 42/55] Update the default num_threads for DS2 data generator. --- data_utils/data.py | 3 ++- infer.py | 3 ++- train.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/data_utils/data.py b/data_utils/data.py index 8391dacc..44af7ffa 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -7,6 +7,7 @@ from __future__ import print_function import random import numpy as np +import multiprocessing import paddle.v2 as paddle from data_utils import utils from data_utils.augmentor.augmentation import AugmentationPipeline @@ -60,7 +61,7 @@ class DataGenerator(object): window_ms=20.0, max_freq=None, specgram_type='linear', - num_threads=12, + num_threads=multiprocessing.cpu_count(), random_seed=0): self._max_duration = max_duration self._min_duration = min_duration diff --git a/infer.py b/infer.py index 7fc84829..71518133 100644 --- a/infer.py +++ b/infer.py @@ -6,6 +6,7 @@ from __future__ import print_function import argparse import gzip import distutils.util +import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator from model import deep_speech2 @@ -40,7 +41,7 @@ parser.add_argument( help="Use gpu or not. (default: %(default)s)") parser.add_argument( "--num_threads_data", - default=12, + default=multiprocessing.cpu_count(), type=int, help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( diff --git a/train.py b/train.py index 2c3b8ce7..fc23ec72 100644 --- a/train.py +++ b/train.py @@ -9,6 +9,7 @@ import argparse import gzip import time import distutils.util +import multiprocessing import paddle.v2 as paddle from model import deep_speech2 from data_utils.data import DataGenerator @@ -77,7 +78,7 @@ parser.add_argument( help="Trainer number. (default: %(default)s)") parser.add_argument( "--num_threads_data", - default=12, + default=multiprocessing.cpu_count(), type=int, help="Number of cpu threads for preprocessing data. 
(default: %(default)s)") parser.add_argument( From d64f470078056e1a0e3828ef30c6127596caa30c Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Tue, 20 Jun 2017 18:19:43 +0800 Subject: [PATCH 43/55] add augmentor class --- data_utils/augmentor/augmentation.py | 2 +- tests/test_augmentor.py | 60 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100755 tests/test_augmentor.py diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py index bfe7075e..08788008 100755 --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -83,7 +83,7 @@ class AugmentationPipeline(object): return SpeedPerturbAugmentor(self._rng, **params) if augmentor_type == "resample": return ResampleAugmentor(self._rng, **params) - if augmentor_type == "baysian_normal": + if augmentor_type == "bayesian_normal": return OnlineBayesianNormalizationAugmentor(self._rng, **params) else: raise ValueError("Unknown augmentor type [%s]." % augmentor_type) diff --git a/tests/test_augmentor.py b/tests/test_augmentor.py new file mode 100755 index 00000000..76fd321a --- /dev/null +++ b/tests/test_augmentor.py @@ -0,0 +1,60 @@ +"""Test augmentor class.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +from data_utils import audio +from data_utils.augmentor.augmentation import AugmentationPipeline +import random +import numpy as np + +random_seed=0 +#audio instance +audio_data=[3.05175781e-05, -8.54492188e-04, -1.09863281e-03, -9.46044922e-04,\ + -1.31225586e-03, -1.09863281e-03, -1.73950195e-03, -2.10571289e-03,\ + -2.04467773e-03, -1.46484375e-03, -1.43432617e-03, -9.46044922e-04,\ + -1.95312500e-03, -1.86157227e-03, -2.10571289e-03, -2.31933594e-03,\ + -2.01416016e-03, -2.62451172e-03, -2.07519531e-03, -2.38037109e-03] +audio_data = np.array(audio_data) +samplerate = 10 + +class TestAugmentor(unittest.TestCase): + def test_volume(self): + augmentation_config='[{"type": "volume","params": {"min_gain_dBFS": -15, "max_gain_dBFS": 15},"prob": 1.0}]' + augmentation_pipeline = AugmentationPipeline(augmentation_config=augmentation_config, + random_seed=random_seed) + audio_segment = audio.AudioSegment(audio_data, samplerate) + augmentation_pipeline.transform_audio(audio_segment) + original_audio = audio.AudioSegment(audio_data, samplerate) + self.assertFalse(np.any(audio_segment.samples == original_audio.samples)) + + def test_speed(self): + augmentation_config='[{"type": "speed","params": {"min_speed_rate": 1.2,"max_speed_rate": 1.4},"prob": 1.0}]' + augmentation_pipeline = AugmentationPipeline(augmentation_config=augmentation_config, + random_seed=random_seed) + audio_segment = audio.AudioSegment(audio_data, samplerate) + augmentation_pipeline.transform_audio(audio_segment) + original_audio = audio.AudioSegment(audio_data, samplerate) + self.assertFalse(np.any(audio_segment.samples == original_audio.samples)) + + def test_resample(self): + augmentation_config='[{"type": "resample","params": {"new_sample_rate":5},"prob": 1.0}]' + augmentation_pipeline = AugmentationPipeline(augmentation_config=augmentation_config, + random_seed=random_seed) + audio_segment = audio.AudioSegment(audio_data, samplerate) + augmentation_pipeline.transform_audio(audio_segment) + self.assertTrue(audio_segment.sample_rate == 5) + + def test_bayesial(self): + augmentation_config='[{"type": "bayesian_normal","params": {"target_db": -20, "prior_db": -4, "prior_samples": 
-8, "startup_delay": 0.0},"prob": 1.0}]' + augmentation_pipeline = AugmentationPipeline(augmentation_config=augmentation_config, + random_seed=random_seed) + audio_segment = audio.AudioSegment(audio_data, samplerate) + augmentation_pipeline.transform_audio(audio_segment) + original_audio = audio.AudioSegment(audio_data, samplerate) + self.assertFalse(np.any(audio_segment.samples == original_audio.samples)) + +if __name__ == '__main__': + unittest.main() + From df77c6d5dbb35a2ebd332aa9ad7044bddb52fe5e Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Tue, 20 Jun 2017 18:39:48 +0800 Subject: [PATCH 44/55] Add 3 augmentor classes and related unittests --- tests/test_augmentor.py | 68 ++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/tests/test_augmentor.py b/tests/test_augmentor.py index 76fd321a..17491704 100755 --- a/tests/test_augmentor.py +++ b/tests/test_augmentor.py @@ -11,49 +11,53 @@ import numpy as np random_seed=0 #audio instance -audio_data=[3.05175781e-05, -8.54492188e-04, -1.09863281e-03, -9.46044922e-04,\ - -1.31225586e-03, -1.09863281e-03, -1.73950195e-03, -2.10571289e-03,\ - -2.04467773e-03, -1.46484375e-03, -1.43432617e-03, -9.46044922e-04,\ - -1.95312500e-03, -1.86157227e-03, -2.10571289e-03, -2.31933594e-03,\ - -2.01416016e-03, -2.62451172e-03, -2.07519531e-03, -2.38037109e-03] +audio_data = [3.0517571e-05, -8.54492188e-04, -1.09863281e-03, -9.4604492e-04,\ + -1.31225586e-03, -1.09863281e-03, -1.73950195e-03, -2.1057189e-03,\ + -2.04467773e-03, -1.46484375e-03, -1.43432617e-03, -9.4604492e-04,\ + -1.95312500e-03, -1.86157227e-03, -2.10571289e-03, -2.3193354e-03,\ + -2.01416016e-03, -2.62451172e-03, -2.07519531e-03, -2.3803719e-03] audio_data = np.array(audio_data) samplerate = 10 class TestAugmentor(unittest.TestCase): def test_volume(self): - augmentation_config='[{"type": "volume","params": {"min_gain_dBFS": -15, "max_gain_dBFS": 15},"prob": 1.0}]' - augmentation_pipeline = AugmentationPipeline(augmentation_config=augmentation_config, - random_seed=random_seed) - audio_segment = audio.AudioSegment(audio_data, samplerate) - augmentation_pipeline.transform_audio(audio_segment) - original_audio = audio.AudioSegment(audio_data, samplerate) - self.assertFalse(np.any(audio_segment.samples == original_audio.samples)) + config_json = '[{"type": "volume","params": {"min_gain_dBFS": -15, '\ + '"max_gain_dBFS": 15},"prob": 1.0}]' + aug_pipeline = AugmentationPipeline(augmentation_config=config_json, + random_seed=random_seed) + audio_seg = audio.AudioSegment(audio_data, samplerate) + aug_pipeline.transform_audio(audio_seg) + orig_audio = audio.AudioSegment(audio_data, samplerate) + self.assertFalse(np.any(audio_seg.samples == orig_audio.samples)) def test_speed(self): - augmentation_config='[{"type": "speed","params": {"min_speed_rate": 1.2,"max_speed_rate": 1.4},"prob": 1.0}]' - augmentation_pipeline = AugmentationPipeline(augmentation_config=augmentation_config, - random_seed=random_seed) - audio_segment = audio.AudioSegment(audio_data, samplerate) - augmentation_pipeline.transform_audio(audio_segment) - original_audio = audio.AudioSegment(audio_data, samplerate) - self.assertFalse(np.any(audio_segment.samples == original_audio.samples)) + config_json = '[{"type":"speed","params": {"min_speed_rate": 1.2,' \ + '"max_speed_rate": 1.4},"prob": 1.0}]' + aug_pipeline = AugmentationPipeline(augmentation_config=config_json, + random_seed=random_seed) + audio_seg = audio.AudioSegment(audio_data, samplerate) + 
aug_pipeline.transform_audio(audio_seg) + orig_audio = audio.AudioSegment(audio_data, samplerate) + self.assertFalse(np.any(audio_seg.samples == orig_audio.samples)) def test_resample(self): - augmentation_config='[{"type": "resample","params": {"new_sample_rate":5},"prob": 1.0}]' - augmentation_pipeline = AugmentationPipeline(augmentation_config=augmentation_config, - random_seed=random_seed) - audio_segment = audio.AudioSegment(audio_data, samplerate) - augmentation_pipeline.transform_audio(audio_segment) - self.assertTrue(audio_segment.sample_rate == 5) + config_json = '[{"type":"resample","params": {"new_sample_rate":5},'\ + '"prob": 1.0}]' + aug_pipeline = AugmentationPipeline(augmentation_config=config_json, + random_seed=random_seed) + audio_seg = audio.AudioSegment(audio_data, samplerate) + aug_pipeline.transform_audio(audio_seg) + self.assertTrue(audio_seg.sample_rate == 5) def test_bayesial(self): - augmentation_config='[{"type": "bayesian_normal","params": {"target_db": -20, "prior_db": -4, "prior_samples": -8, "startup_delay": 0.0},"prob": 1.0}]' - augmentation_pipeline = AugmentationPipeline(augmentation_config=augmentation_config, - random_seed=random_seed) - audio_segment = audio.AudioSegment(audio_data, samplerate) - augmentation_pipeline.transform_audio(audio_segment) - original_audio = audio.AudioSegment(audio_data, samplerate) - self.assertFalse(np.any(audio_segment.samples == original_audio.samples)) + config_json = '[{"type":"bayesian_normal","params":{"target_db":-20,' \ + '"prior_db":-4, "prior_samples": -8, "startup_delay": 0.0},"prob":1.0}]' + aug_pipeline = AugmentationPipeline(augmentation_config=config_json, + random_seed=random_seed) + audio_seg = audio.AudioSegment(audio_data, samplerate) + aug_pipeline.transform_audio(audio_seg) + orig_audio = audio.AudioSegment(audio_data, samplerate) + self.assertFalse(np.any(audio_seg.samples == orig_audio.samples)) if __name__ == '__main__': unittest.main() From 5398360e5f5bcbc1d48945395204bd9b708a6768 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Tue, 20 Jun 2017 18:50:13 +0800 Subject: [PATCH 45/55] Add 3 augmentor classes and related unittests --- tests/test_augmentor.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_augmentor.py b/tests/test_augmentor.py index 17491704..57596e63 100755 --- a/tests/test_augmentor.py +++ b/tests/test_augmentor.py @@ -9,8 +9,7 @@ from data_utils.augmentor.augmentation import AugmentationPipeline import random import numpy as np -random_seed=0 -#audio instance +random_seed = 0 audio_data = [3.0517571e-05, -8.54492188e-04, -1.09863281e-03, -9.4604492e-04,\ -1.31225586e-03, -1.09863281e-03, -1.73950195e-03, -2.1057189e-03,\ -2.04467773e-03, -1.46484375e-03, -1.43432617e-03, -9.4604492e-04,\ @@ -19,12 +18,13 @@ audio_data = [3.0517571e-05, -8.54492188e-04, -1.09863281e-03, -9.4604492e-04,\ audio_data = np.array(audio_data) samplerate = 10 + class TestAugmentor(unittest.TestCase): def test_volume(self): config_json = '[{"type": "volume","params": {"min_gain_dBFS": -15, '\ '"max_gain_dBFS": 15},"prob": 1.0}]' - aug_pipeline = AugmentationPipeline(augmentation_config=config_json, - random_seed=random_seed) + aug_pipeline = AugmentationPipeline( + augmentation_config=config_json, random_seed=random_seed) audio_seg = audio.AudioSegment(audio_data, samplerate) aug_pipeline.transform_audio(audio_seg) orig_audio = audio.AudioSegment(audio_data, samplerate) @@ -33,8 +33,8 @@ class TestAugmentor(unittest.TestCase): def 
test_speed(self): config_json = '[{"type":"speed","params": {"min_speed_rate": 1.2,' \ '"max_speed_rate": 1.4},"prob": 1.0}]' - aug_pipeline = AugmentationPipeline(augmentation_config=config_json, - random_seed=random_seed) + aug_pipeline = AugmentationPipeline( + augmentation_config=config_json, random_seed=random_seed) audio_seg = audio.AudioSegment(audio_data, samplerate) aug_pipeline.transform_audio(audio_seg) orig_audio = audio.AudioSegment(audio_data, samplerate) @@ -43,8 +43,8 @@ class TestAugmentor(unittest.TestCase): def test_resample(self): config_json = '[{"type":"resample","params": {"new_sample_rate":5},'\ '"prob": 1.0}]' - aug_pipeline = AugmentationPipeline(augmentation_config=config_json, - random_seed=random_seed) + aug_pipeline = AugmentationPipeline( + augmentation_config=config_json, random_seed=random_seed) audio_seg = audio.AudioSegment(audio_data, samplerate) aug_pipeline.transform_audio(audio_seg) self.assertTrue(audio_seg.sample_rate == 5) @@ -52,13 +52,13 @@ class TestAugmentor(unittest.TestCase): def test_bayesial(self): config_json = '[{"type":"bayesian_normal","params":{"target_db":-20,' \ '"prior_db":-4, "prior_samples": -8, "startup_delay": 0.0},"prob":1.0}]' - aug_pipeline = AugmentationPipeline(augmentation_config=config_json, - random_seed=random_seed) + aug_pipeline = AugmentationPipeline( + augmentation_config=config_json, random_seed=random_seed) audio_seg = audio.AudioSegment(audio_data, samplerate) aug_pipeline.transform_audio(audio_seg) orig_audio = audio.AudioSegment(audio_data, samplerate) self.assertFalse(np.any(audio_seg.samples == orig_audio.samples)) + if __name__ == '__main__': unittest.main() - From 2450591a440dfc863cce53152416e594bdfff6b3 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Wed, 21 Jun 2017 11:47:15 +0800 Subject: [PATCH 46/55] add 3 augmentor class and change resample module --- data_utils/audio.py | 16 ++++---- data_utils/augmentor/resample.py | 5 ++- requirements.txt | 1 + tests/test_augmentor.py | 64 -------------------------------- 4 files changed, 12 insertions(+), 74 deletions(-) mode change 100644 => 100755 requirements.txt delete mode 100755 tests/test_augmentor.py diff --git a/data_utils/audio.py b/data_utils/audio.py index 03e2d5e4..f80425ea 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -6,7 +6,7 @@ from __future__ import print_function import numpy as np import io import soundfile -import scikits.samplerate +import resampy from scipy import signal import random import copy @@ -321,21 +321,19 @@ class AudioSegment(object): gain_db = target_db - rms_estimate_db self.apply_gain(gain_db) - def resample(self, target_sample_rate, quality='sinc_medium'): + def resample(self, target_sample_rate, filter='kaiser_best'): """Resample the audio to a target sample rate. Note that this is an in-place transformation. :param target_sample_rate: Target sample rate. :type target_sample_rate: int - :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}. - Sets resampling speed/quality tradeoff. - See http://www.mega-nerd.com/SRC/api_misc.html#Converters - :type quality: str + :param filter: The resampling filter to use one of {'kaiser_best', + 'kaiser_fast'}. 
+ :type filter: str """ - resample_ratio = target_sample_rate / self._sample_rate - self._samples = scikits.samplerate.resample( - self._samples, r=resample_ratio, type=quality) + self._samples = resampy.resample( + self.samples, self.sample_rate, target_sample_rate, filter=filter) self._sample_rate = target_sample_rate def pad_silence(self, duration, sides='both'): diff --git a/data_utils/augmentor/resample.py b/data_utils/augmentor/resample.py index 6634bbd5..529b5fec 100755 --- a/data_utils/augmentor/resample.py +++ b/data_utils/augmentor/resample.py @@ -8,6 +8,9 @@ from data_utils.augmentor.base import AugmentorBase class ResampleAugmentor(AugmentorBase): """Augmentation model for resampling. + + See more info here: + https://ccrma.stanford.edu/~jos/resample/index.html :param rng: Random generator object. :type rng: random.Random @@ -27,4 +30,4 @@ class ResampleAugmentor(AugmentorBase): :param audio: Audio segment to add effects to. :type audio: AudioSegment|SpeechSegment """ - audio_segment.resample(self._new_sample_rate) + audio_segment.resample(self._new_sample_rate) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 index 0183ecf0..d712787f --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ SoundFile==0.9.0.post1 wget==3.2 scipy==0.13.1 +resampy==0.1.5 \ No newline at end of file diff --git a/tests/test_augmentor.py b/tests/test_augmentor.py deleted file mode 100755 index 57596e63..00000000 --- a/tests/test_augmentor.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Test augmentor class.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import unittest -from data_utils import audio -from data_utils.augmentor.augmentation import AugmentationPipeline -import random -import numpy as np - -random_seed = 0 -audio_data = [3.0517571e-05, -8.54492188e-04, -1.09863281e-03, -9.4604492e-04,\ - -1.31225586e-03, -1.09863281e-03, -1.73950195e-03, -2.1057189e-03,\ - -2.04467773e-03, -1.46484375e-03, -1.43432617e-03, -9.4604492e-04,\ - -1.95312500e-03, -1.86157227e-03, -2.10571289e-03, -2.3193354e-03,\ - -2.01416016e-03, -2.62451172e-03, -2.07519531e-03, -2.3803719e-03] -audio_data = np.array(audio_data) -samplerate = 10 - - -class TestAugmentor(unittest.TestCase): - def test_volume(self): - config_json = '[{"type": "volume","params": {"min_gain_dBFS": -15, '\ - '"max_gain_dBFS": 15},"prob": 1.0}]' - aug_pipeline = AugmentationPipeline( - augmentation_config=config_json, random_seed=random_seed) - audio_seg = audio.AudioSegment(audio_data, samplerate) - aug_pipeline.transform_audio(audio_seg) - orig_audio = audio.AudioSegment(audio_data, samplerate) - self.assertFalse(np.any(audio_seg.samples == orig_audio.samples)) - - def test_speed(self): - config_json = '[{"type":"speed","params": {"min_speed_rate": 1.2,' \ - '"max_speed_rate": 1.4},"prob": 1.0}]' - aug_pipeline = AugmentationPipeline( - augmentation_config=config_json, random_seed=random_seed) - audio_seg = audio.AudioSegment(audio_data, samplerate) - aug_pipeline.transform_audio(audio_seg) - orig_audio = audio.AudioSegment(audio_data, samplerate) - self.assertFalse(np.any(audio_seg.samples == orig_audio.samples)) - - def test_resample(self): - config_json = '[{"type":"resample","params": {"new_sample_rate":5},'\ - '"prob": 1.0}]' - aug_pipeline = AugmentationPipeline( - augmentation_config=config_json, random_seed=random_seed) - audio_seg = audio.AudioSegment(audio_data, samplerate) - 
aug_pipeline.transform_audio(audio_seg) - self.assertTrue(audio_seg.sample_rate == 5) - - def test_bayesial(self): - config_json = '[{"type":"bayesian_normal","params":{"target_db":-20,' \ - '"prior_db":-4, "prior_samples": -8, "startup_delay": 0.0},"prob":1.0}]' - aug_pipeline = AugmentationPipeline( - augmentation_config=config_json, random_seed=random_seed) - audio_seg = audio.AudioSegment(audio_data, samplerate) - aug_pipeline.transform_audio(audio_seg) - orig_audio = audio.AudioSegment(audio_data, samplerate) - self.assertFalse(np.any(audio_seg.samples == orig_audio.samples)) - - -if __name__ == '__main__': - unittest.main() From d6a852a304babcd916d35c58ec0470162891c583 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Wed, 21 Jun 2017 12:11:43 +0800 Subject: [PATCH 47/55] modify setup.sh to delete the install of libsamplerate --- .../augmentor/online_bayesian_normalization.py | 6 ++---- setup.sh | 18 ------------------ 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/data_utils/augmentor/online_bayesian_normalization.py b/data_utils/augmentor/online_bayesian_normalization.py index bb999912..e488ac7d 100755 --- a/data_utils/augmentor/online_bayesian_normalization.py +++ b/data_utils/augmentor/online_bayesian_normalization.py @@ -32,9 +32,8 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase): self._target_db = target_db self._prior_db = prior_db self._prior_samples = prior_samples - self._startup_delay = startup_delay self._rng = rng - self._startup_delay=startup_delay + self._startup_delay = startup_delay def transform_audio(self, audio_segment): """Normalizes the input audio using the online Bayesian approach. @@ -44,7 +43,6 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase): :param audio_segment: Audio segment to add effects to. :type audio_segment: AudioSegment|SpeechSegment """ - audio_segment.normalize_online_bayesian(self._target_db, - self._prior_db, + audio_segment.normalize_online_bayesian(self._target_db, self._prior_db, self._prior_samples, self._startup_delay) diff --git a/setup.sh b/setup.sh index 1ae2a5ee..e0ce1c4e 100644 --- a/setup.sh +++ b/setup.sh @@ -9,22 +9,4 @@ if [ $? != 0 ]; then exit 1 fi -# install scikits.samplerate -curl -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz" -if [ $? != 0 ]; then - echo "Download libsamplerate-0.1.9.tar.gz failed !!!" - exit 1 -fi -tar -xvf libsamplerate-0.1.9.tar.gz -cd libsamplerate-0.1.9 -./configure && make && make install -cd - -rm -rf libsamplerate-0.1.9 -rm libsamplerate-0.1.9.tar.gz -pip install scikits.samplerate==0.3.3 -if [ $? != 0 ]; then - echo "Install scikits.samplerate failed !!!" - exit 1 -fi - echo "Install all dependencies successfully." 
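
With PATCH 39 through 47 in place, the augmentation pipeline is driven entirely by a JSON configuration and resampling is backed by resampy. As a quick orientation before the next patches, here is a minimal smoke-test sketch of how those pieces compose. It is not part of the patch series: it assumes the module layout shown in the diffs above (`data_utils.audio`, `data_utils.augmentor.augmentation`), mirrors the config format exercised by the now-removed tests/test_augmentor.py, and uses arbitrary sample values.

```
import json

import numpy as np

from data_utils.audio import AudioSegment
from data_utils.augmentor.augmentation import AugmentationPipeline

# One second of random samples at 16 kHz stands in for real speech here.
samples = np.random.randn(16000).astype('float32')
segment = AudioSegment(samples, 16000)

# Same JSON shape as the removed unit tests: a list of augmentors, each
# with a type name, its constructor params, and an application probability.
config = json.dumps([{
    "type": "resample",
    "params": {"new_sample_rate": 8000},
    "prob": 1.0
}])

pipeline = AugmentationPipeline(augmentation_config=config, random_seed=0)
pipeline.transform_audio(segment)  # in-place, like all augmentors here
print(segment.sample_rate)  # expected: 8000
```

Note that every augmentor mutates the segment in place, and, as the config format used by the tests suggests, "prob" gates per-utterance application, so a probability below 1.0 leaves a corresponding fraction of utterances untouched.
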
From b340d4ed2fbdc487b555e3395d3093410e014a98 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Wed, 21 Jun 2017 12:18:33 +0800 Subject: [PATCH 48/55] modify setup.sh to delete the install of libsamplerate --- data_utils/augmentor/speed_perturb.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/data_utils/augmentor/speed_perturb.py b/data_utils/augmentor/speed_perturb.py index 67de344c..3f880fbb 100755 --- a/data_utils/augmentor/speed_perturb.py +++ b/data_utils/augmentor/speed_perturb.py @@ -23,9 +23,11 @@ class SpeedPerturbAugmentor(AugmentorBase): def __init__(self, rng, min_speed_rate, max_speed_rate): if (min_speed_rate < 0.5): - raise ValueError("Sampling speed below 0.9 can cause unnatural effects") + raise ValueError("Sampling speed below 0.9 can cause unnatural "\ + "effects") if (max_speed_rate > 1.5): - raise ValueError("Sampling speed above 1.1 can cause unnatural effects") + raise ValueError("Sampling speed above 1.1 can cause unnatural "\ + "effects") self._min_speed_rate = min_speed_rate self._max_speed_rate = max_speed_rate self._rng = rng @@ -39,5 +41,6 @@ class SpeedPerturbAugmentor(AugmentorBase): :param audio_segment: Audio segment to add effects to. :type audio_segment: AudioSegment|SpeechSegment """ - sampled_speed = self._rng.uniform(self._min_speed_rate, self._max_speed_rate) + sampled_speed = self._rng.uniform(self._min_speed_rate, + self._max_speed_rate) audio_segment.change_speed(sampled_speed) From 13f708739ba956aa3c63b91e529827bc73d3e160 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 21 Jun 2017 20:52:30 +0800 Subject: [PATCH 49/55] Improve audio featurizer and add shift augmentor. 1. Improve audio featurizer. 2. Add shift augmentor. 3. Update default argument to be the current best seggestion. 4. Add checkpoints with pass id. --- README.md | 4 +- data_utils/audio.py | 157 ++++++++++++--------- data_utils/augmentor/augmentation.py | 3 + data_utils/augmentor/volume_perturb.py | 2 +- data_utils/data.py | 7 +- data_utils/featurizer/audio_featurizer.py | 42 +++++- data_utils/featurizer/speech_featurizer.py | 24 +++- infer.py | 2 +- setup.sh | 3 + train.py | 19 ++- 10 files changed, 180 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index 0cdb203d..2912ff31 100644 --- a/README.md +++ b/README.md @@ -51,13 +51,13 @@ python compute_mean_std.py --help For GPU Training: ``` -CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python train.py ``` For CPU Training: ``` -python train.py --trainer_count 8 --use_gpu False +python train.py --use_gpu False ``` More help for arguments: diff --git a/data_utils/audio.py b/data_utils/audio.py index 5d02feb6..1faeb48a 100644 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -66,6 +66,54 @@ class AudioSegment(object): samples, sample_rate = soundfile.read(file, dtype='float32') return cls(samples, sample_rate) + @classmethod + def slice_from_file(cls, file, start=None, end=None): + """Loads a small section of an audio without having to load + the entire file into the memory which can be incredibly wasteful. + + :param file: Input audio filepath or file object. + :type file: basestring|file + :param start: Start time in seconds. If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. 
If not provided, the default behavior is
+                    to read to the end of the file.
+        :type end: float
+        :return: AudioSegment instance of the specified slice of the input
+                 audio file.
+        :rtype: AudioSegment
+        :raise ValueError: If start or end is incorrectly set, e.g. out of
+                           bounds in time.
+        """
+        sndfile = soundfile.SoundFile(file)
+        sample_rate = sndfile.samplerate
+        duration = float(len(sndfile)) / sample_rate
+        start = 0. if start is None else start
+        end = 0. if end is None else end
+        if start < 0.0:
+            start += duration
+        if end < 0.0:
+            end += duration
+        if start < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start)
+        if end < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds." %
+                             end)
+        if start > end:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the slice end position (%f s)."
% (start, end)) - if end > duration: - raise ValueError("The slice end position (%f s) is out of bounds " - "(> %f s)" % (end, duration)) - start_frame = int(start * sample_rate) - end_frame = int(end * sample_rate) - sndfile.seek(start_frame) - data = sndfile.read(frames=end_frame - start_frame, dtype='float32') - return cls(data, sample_rate) - - @classmethod - def make_silence(cls, duration, sample_rate): - """Creates a silent audio segment of the given duration and sample rate. - - :param duration: Length of silence in seconds. - :type duration: float - :param sample_rate: Sample rate. - :type sample_rate: float - :return: Silent AudioSegment instance of the given duration. - :rtype: AudioSegment - """ - samples = np.zeros(int(duration * sample_rate)) - return cls(samples, sample_rate) - def superimpose(self, other): """Add samples from another segment to those of this segment (sample-wise addition, not segment concatenation). @@ -225,7 +225,7 @@ class AudioSegment(object): samples = self._convert_samples_from_float32(self._samples, dtype) return samples.tostring() - def apply_gain(self, gain): + def gain_db(self, gain): """Apply gain in decibels to samples. Note that this is an in-place transformation. @@ -278,7 +278,7 @@ class AudioSegment(object): "Unable to normalize segment to %f dB because the " "the probable gain have exceeds max_gain_db (%f dB)" % (target_db, max_gain_db)) - self.apply_gain(min(max_gain_db, target_db - self.rms_db)) + self.gain_db(min(max_gain_db, target_db - self.rms_db)) def normalize_online_bayesian(self, target_db, @@ -319,7 +319,7 @@ class AudioSegment(object): rms_estimate_db = 10 * np.log10(mean_squared_estimate) # Compute required time-varying gain. gain_db = target_db - rms_estimate_db - self.apply_gain(gain_db) + self.gain_db(gain_db) def resample(self, target_sample_rate, quality='sinc_medium'): """Resample the audio to a target sample rate. @@ -366,6 +366,31 @@ class AudioSegment(object): raise ValueError("Unknown value for the sides %s" % sides) self._samples = padded._samples + def shift(self, shift_ms): + """Shift the audio in time. If `shift_ms` is positive, shift with time + advance; if negative, shift with time delay. Silence are padded to + keep the duration unchanged. + + Note that this is an in-place transformation. + + :param shift_ms: Shift time in millseconds. If positive, shift with + time advance; if negative; shift with time delay. + :type shift_ms: float + :raises ValueError: If shift_ms is longer than audio duration. + """ + if shift_ms / 1000.0 > self.duration: + raise ValueError("Absolute value of shift_ms should be smaller " + "than audio duration.") + shift_samples = int(shift_ms * self._sample_rate / 1000) + if shift_samples > 0: + # time advance + self._samples[:-shift_samples] = self._samples[shift_samples:] + self._samples[-shift_samples:] = 0 + elif shift_samples < 0: + # time delay + self._samples[-shift_samples:] = self._samples[:shift_samples] + self._samples[:-shift_samples] = 0 + def subsegment(self, start_sec=None, end_sec=None): """Cut the AudioSegment between given boundaries. 
@@ -505,7 +530,7 @@ class AudioSegment(object): noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) noise_new = copy.deepcopy(noise) noise_new.random_subsegment(self.duration, rng=rng) - noise_new.apply_gain(noise_gain_db) + noise_new.gain_db(noise_gain_db) self.superimpose(noise_new) @property diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py index abe1a0ec..0d60bbdb 100644 --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -6,6 +6,7 @@ from __future__ import print_function import json import random from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor +from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor class AugmentationPipeline(object): @@ -76,5 +77,7 @@ class AugmentationPipeline(object): """Return an augmentation model by the type name, and pass in params.""" if augmentor_type == "volume": return VolumePerturbAugmentor(self._rng, **params) + elif augmentor_type == "shift": + return ShiftPerturbAugmentor(self._rng, **params) else: raise ValueError("Unknown augmentor type [%s]." % augmentor_type) diff --git a/data_utils/augmentor/volume_perturb.py b/data_utils/augmentor/volume_perturb.py index a5a9f6ca..62631fb0 100644 --- a/data_utils/augmentor/volume_perturb.py +++ b/data_utils/augmentor/volume_perturb.py @@ -36,5 +36,5 @@ class VolumePerturbAugmentor(AugmentorBase): :param audio_segment: Audio segment to add effects to. :type audio_segment: AudioSegmenet|SpeechSegment """ - gain = self._rng.uniform(min_gain_dBFS, max_gain_dBFS) + gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS) audio_segment.apply_gain(gain) diff --git a/data_utils/data.py b/data_utils/data.py index 44af7ffa..d01ca8cc 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -45,6 +45,9 @@ class DataGenerator(object): :types max_freq: None|float :param specgram_type: Specgram feature type. Options: 'linear'. :type specgram_type: str + :param use_dB_normalization: Whether to normalize the audio to -20 dB + before extracting the features. + :type use_dB_normalization: bool :param num_threads: Number of CPU threads for processing data. :type num_threads: int :param random_seed: Random seed. @@ -61,6 +64,7 @@ class DataGenerator(object): window_ms=20.0, max_freq=None, specgram_type='linear', + use_dB_normalization=True, num_threads=multiprocessing.cpu_count(), random_seed=0): self._max_duration = max_duration @@ -73,7 +77,8 @@ class DataGenerator(object): specgram_type=specgram_type, stride_ms=stride_ms, window_ms=window_ms, - max_freq=max_freq) + max_freq=max_freq, + use_dB_normalization=use_dB_normalization) self._num_threads = num_threads self._rng = random.Random(random_seed) self._epoch = 0 diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py index 9f9d4e50..4b4d02c6 100644 --- a/data_utils/featurizer/audio_featurizer.py +++ b/data_utils/featurizer/audio_featurizer.py @@ -24,26 +24,64 @@ class AudioFeaturizer(object): corresponding to frequencies between [0, max_freq] are returned. :types max_freq: None|float + :param target_sample_rate: Audio are resampled (if upsampling or + downsampling is allowed) to this before + extracting spectrogram features. + :type target_sample_rate: float + :param use_dB_normalization: Whether to normalize the audio to a certain + decibels before extracting the features. + :type use_dB_normalization: bool + :param target_dB: Target audio decibels for normalization. 
+ :type target_dB: float """ def __init__(self, specgram_type='linear', stride_ms=10.0, window_ms=20.0, - max_freq=None): + max_freq=None, + target_sample_rate=16000, + use_dB_normalization=True, + target_dB=-20): self._specgram_type = specgram_type self._stride_ms = stride_ms self._window_ms = window_ms self._max_freq = max_freq + self._target_sample_rate = target_sample_rate + self._use_dB_normalization = use_dB_normalization + self._target_dB = target_dB - def featurize(self, audio_segment): + def featurize(self, + audio_segment, + allow_downsampling=True, + allow_upsamplling=True): """Extract audio features from AudioSegment or SpeechSegment. :param audio_segment: Audio/speech segment to extract features from. :type audio_segment: AudioSegment|SpeechSegment + :param allow_downsampling: Whether to allow audio downsampling before + featurizing. + :type allow_downsampling: bool + :param allow_upsampling: Whether to allow audio upsampling before + featurizing. + :type allow_upsampling: bool :return: Spectrogram audio feature in 2darray. :rtype: ndarray + :raises ValueError: If audio sample rate is not supported. """ + # upsampling or downsampling + if ((audio_segment.sample_rate > self._target_sample_rate and + allow_downsampling) or + (audio_segment.sample_rate < self._target_sample_rate and + allow_upsampling)): + audio_segment.resample(self._target_sample_rate) + if audio_segment.sample_rate != self._target_sample_rate: + raise ValueError("Audio sample rate is not supported. " + "Turn allow_downsampling or allow up_sampling on.") + # decibel normalization + if self._use_dB_normalization: + audio_segment.normalize(target_db=self._target_dB) + # extract spectrogram return self._compute_specgram(audio_segment.samples, audio_segment.sample_rate) diff --git a/data_utils/featurizer/speech_featurizer.py b/data_utils/featurizer/speech_featurizer.py index 77020455..26283892 100644 --- a/data_utils/featurizer/speech_featurizer.py +++ b/data_utils/featurizer/speech_featurizer.py @@ -29,6 +29,15 @@ class SpeechFeaturizer(object): corresponding to frequencies between [0, max_freq] are returned. :types max_freq: None|float + :param target_sample_rate: Speech are resampled (if upsampling or + downsampling is allowed) to this before + extracting spectrogram features. + :type target_sample_rate: float + :param use_dB_normalization: Whether to normalize the audio to a certain + decibels before extracting the features. + :type use_dB_normalization: bool + :param target_dB: Target audio decibels for normalization. + :type target_dB: float """ def __init__(self, @@ -36,9 +45,18 @@ class SpeechFeaturizer(object): specgram_type='linear', stride_ms=10.0, window_ms=20.0, - max_freq=None): - self._audio_featurizer = AudioFeaturizer(specgram_type, stride_ms, - window_ms, max_freq) + max_freq=None, + target_sample_rate=16000, + use_dB_normalization=True, + target_dB=-20): + self._audio_featurizer = AudioFeaturizer( + specgram_type=specgram_type, + stride_ms=stride_ms, + window_ms=window_ms, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB) self._text_featurizer = TextFeaturizer(vocab_filepath) def featurize(self, speech_segment): diff --git a/infer.py b/infer.py index 71518133..9037a108 100644 --- a/infer.py +++ b/infer.py @@ -56,7 +56,7 @@ parser.add_argument( help="Manifest path for decoding. 
diff --git a/infer.py b/infer.py
index 71518133..9037a108 100644
--- a/infer.py
+++ b/infer.py
@@ -56,7 +56,7 @@ parser.add_argument(
     help="Manifest path for decoding. (default: %(default)s)")
 parser.add_argument(
     "--model_filepath",
-    default='./params.tar.gz',
+    default='checkpoints/params.latest.tar.gz',
     type=str,
     help="Model filepath. (default: %(default)s)")
 parser.add_argument(
diff --git a/setup.sh b/setup.sh
index 1ae2a5ee..cdec34ff 100644
--- a/setup.sh
+++ b/setup.sh
@@ -27,4 +27,7 @@ if [ $? != 0 ]; then
     exit 1
 fi
 
+# prepare ./checkpoints
+mkdir -p checkpoints
+
 echo "Install all dependencies successfully."
diff --git a/train.py b/train.py
index fc23ec72..3a2d0cad 100644
--- a/train.py
+++ b/train.py
@@ -17,10 +17,10 @@ import utils
 
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    "--batch_size", default=32, type=int, help="Minibatch size.")
+    "--batch_size", default=256, type=int, help="Minibatch size.")
 parser.add_argument(
     "--num_passes",
-    default=20,
+    default=200,
     type=int,
     help="Training pass number. (default: %(default)s)")
 parser.add_argument(
@@ -55,7 +55,7 @@ parser.add_argument(
     help="Use sortagrad or not. (default: %(default)s)")
 parser.add_argument(
     "--max_duration",
-    default=100.0,
+    default=27.0,
     type=float,
     help="Audios with duration larger than this will be discarded. "
     "(default: %(default)s)")
@@ -67,13 +67,13 @@ parser.add_argument(
     "(default: %(default)s)")
 parser.add_argument(
     "--shuffle_method",
-    default='instance_shuffle',
+    default='batch_shuffle_clipped',
     type=str,
     help="Shuffle method: 'instance_shuffle', 'batch_shuffle', "
     "'batch_shuffle_clipped'. (default: %(default)s)")
 parser.add_argument(
     "--trainer_count",
-    default=4,
+    default=8,
     type=int,
     help="Trainer number. (default: %(default)s)")
 parser.add_argument(
@@ -110,7 +110,9 @@ parser.add_argument(
     "the existing model of this path. (default: %(default)s)")
 parser.add_argument(
     "--augmentation_config",
-    default='{}',
+    default='[{"type": "shift", '
+    '"params": {"min_shift_ms": -5, "max_shift_ms": 5},'
+    '"prob": 1.0}]',
     type=str,
     help="Augmentation configuration in json-format. "
     "(default: %(default)s)")
@@ -189,7 +191,7 @@ def train():
                 print("\nPass: %d, Batch: %d, TrainCost: %f" % (
                     event.pass_id, event.batch_id + 1, cost_sum / cost_counter))
                 cost_sum, cost_counter = 0.0, 0
-                with gzip.open("params.tar.gz", 'w') as f:
+                with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f:
                     parameters.to_tar(f)
             else:
                 sys.stdout.write('.')
@@ -202,6 +204,9 @@ def train():
                 reader=test_batch_reader, feeding=test_generator.feeding)
             print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
                   (time.time() - start_time, event.pass_id, result.cost))
+            with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
+                           'w') as f:
+                parameters.to_tar(f)
 
     # run train
     trainer.train(
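The new `--augmentation_config` default applies a random shift of up to ±5 ms to every utterance. Rather than hand-editing the JSON string, a config value can be generated; a sketch with illustrative parameter values:

```
import json

# Serialize an augmentation config instead of writing the JSON by hand;
# the resulting string is what train.py expects in --augmentation_config.
config = json.dumps([{
    "type": "shift",
    "params": {"min_shift_ms": -10, "max_shift_ms": 10},
    "prob": 0.5,
}])
print(config)  # pass this string to train.py --augmentation_config
```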
From 6d6cdf40576dff0086e221a3d5e761530e24f811 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Mon, 26 Jun 2017 13:04:36 +0800
Subject: [PATCH 50/55] Refine SoundFile installation process.

1. Install libsndfile first.
2. Install SoundFile using pip.
---
 requirements.txt |  1 -
 setup.sh         | 20 ++++++++++----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 0183ecf0..79272e7e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
-SoundFile==0.9.0.post1
 wget==3.2
 scipy==0.13.1
diff --git a/setup.sh b/setup.sh
index 1ae2a5ee..a801a0b2 100644
--- a/setup.sh
+++ b/setup.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # install python dependencies
-if [ -f 'requirements.txt' ]; then
+if [ -f "requirements.txt" ]; then
     pip install -r requirements.txt
 fi
 if [ $? != 0 ]; then
@@ -9,21 +9,21 @@ if [ $? != 0 ]; then
     exit 1
 fi
 
-# install scikits.samplerate
-curl -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz"
+# install package SoundFile
+curl -O "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
 if [ $? != 0 ]; then
-    echo "Download libsamplerate-0.1.9.tar.gz failed !!!"
+    echo "Download libsndfile-1.0.28.tar.gz failed !!!"
     exit 1
 fi
-tar -xvf libsamplerate-0.1.9.tar.gz
-cd libsamplerate-0.1.9
+tar -zxvf libsndfile-1.0.28.tar.gz
+cd libsndfile-1.0.28
 ./configure && make && make install
 cd -
-rm -rf libsamplerate-0.1.9
-rm libsamplerate-0.1.9.tar.gz
-pip install scikits.samplerate==0.3.3
+rm -rf libsndfile-1.0.28
+rm libsndfile-1.0.28.tar.gz
+pip install SoundFile==0.9.0.post1
 if [ $? != 0 ]; then
-    echo "Install scikits.samplerate failed !!!"
+    echo "Install SoundFile failed !!!"
     exit 1
 fi

From cdd52ac2706929ea993038aedce3080eb2de8af8 Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Mon, 26 Jun 2017 14:17:22 +0800
Subject: [PATCH 51/55] Fix a missing abs bug for DS2 AudioSegment.

---
 data_utils/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_utils/audio.py b/data_utils/audio.py
index 1faeb48a..d55fae1e 100644
--- a/data_utils/audio.py
+++ b/data_utils/audio.py
@@ -378,7 +378,7 @@ class AudioSegment(object):
         :type shift_ms: float
         :raises ValueError: If shift_ms is longer than audio duration.
         """
-        if shift_ms / 1000.0 > self.duration:
+        if abs(shift_ms) / 1000.0 > self.duration:
             raise ValueError("Absolute value of shift_ms should be smaller "
                              "than audio duration.")
         shift_samples = int(shift_ms * self._sample_rate / 1000)
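The old guard compared the signed `shift_ms` against the duration, so a large negative shift slipped through unchecked. A sketch of the fixed behavior on a synthetic one-second segment, assuming `data_utils` is importable:

```
import numpy as np

from data_utils.audio import AudioSegment

segment = AudioSegment(np.random.uniform(-0.1, 0.1, 16000), 16000)  # 1.0 s
segment.shift(500)  # in range: |0.5 s| <= 1.0 s

try:
    # Previously -1.5 > 1.0 was False and the call went through; with the
    # abs() fix it now raises as the docstring promises.
    segment.shift(-1500)
except ValueError as e:
    print(e)
```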
From 0dadd14600dfae51995c75746ee0c237f83995d1 Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Mon, 26 Jun 2017 19:19:16 +0800
Subject: [PATCH 52/55] Patch for adding missing shift_perturb.py in last
 commit (pull request #114).

---
 data_utils/augmentor/shift_perturb.py | 34 +++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 data_utils/augmentor/shift_perturb.py

diff --git a/data_utils/augmentor/shift_perturb.py b/data_utils/augmentor/shift_perturb.py
new file mode 100644
index 00000000..c4cbe3e1
--- /dev/null
+++ b/data_utils/augmentor/shift_perturb.py
@@ -0,0 +1,34 @@
+"""Contains the shift perturb augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+
+
+class ShiftPerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding random shift perturbation.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_shift_ms: Minimal shift in milliseconds.
+    :type min_shift_ms: float
+    :param max_shift_ms: Maximal shift in milliseconds.
+    :type max_shift_ms: float
+    """
+
+    def __init__(self, rng, min_shift_ms, max_shift_ms):
+        self._min_shift_ms = min_shift_ms
+        self._max_shift_ms = max_shift_ms
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        """Shift audio.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
+        audio_segment.shift(shift_ms)
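For reference, the augmentor can also be exercised directly, outside `AugmentationPipeline`; a minimal sketch with synthetic audio:

```
import random

import numpy as np

from data_utils.audio import AudioSegment
from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor

# Shift each utterance by a random offset drawn from [-5 ms, +5 ms].
augmentor = ShiftPerturbAugmentor(
    random.Random(0), min_shift_ms=-5, max_shift_ms=5)
segment = AudioSegment(np.random.uniform(-0.1, 0.1, 16000), 16000)
augmentor.transform_audio(segment)  # perturbs the segment in place
```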
0.0},"prob":1.0}]' - aug_pipeline = AugmentationPipeline( - augmentation_config=config_json, random_seed=random_seed) - audio_seg = audio.AudioSegment(audio_data, samplerate) - aug_pipeline.transform_audio(audio_seg) - orig_audio = audio.AudioSegment(audio_data, samplerate) - self.assertFalse(np.any(audio_seg.samples == orig_audio.samples)) - - -if __name__ == '__main__': - unittest.main() From db37c34919e5cb7377e8ed863a17d206a0d28c39 Mon Sep 17 00:00:00 2001 From: xushaoyong Date: Tue, 27 Jun 2017 18:48:49 +0800 Subject: [PATCH 55/55] modify some detail of augmentor --- data_utils/augmentor/augmentation.py | 3 ++- data_utils/augmentor/speed_perturb.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py index f8fd214a..9dced473 100644 --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -9,7 +9,8 @@ from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor from data_utils.augmentor.resample import ResampleAugmentor -from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor +from data_utils.augmentor.online_bayesian_normalization import \ + OnlineBayesianNormalizationAugmentor class AugmentationPipeline(object): diff --git a/data_utils/augmentor/speed_perturb.py b/data_utils/augmentor/speed_perturb.py index 8c6c8b63..cc5738bd 100644 --- a/data_utils/augmentor/speed_perturb.py +++ b/data_utils/augmentor/speed_perturb.py @@ -15,10 +15,10 @@ class SpeedPerturbAugmentor(AugmentorBase): :param rng: Random generator object. :type rng: random.Random :param min_speed_rate: Lower bound of new speed rate to sample and should - not below 0.9. + not be smaller than 0.9. :type min_speed_rate: float :param max_speed_rate: Upper bound of new speed rate to sample and should - not above 1.1. + not be larger than 1.1. :type max_speed_rate: float """