"""Contains DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import time
import logging
import gzip
import copy
import inspect
from distutils.dir_util import mkpath
import paddle.v2 as paddle
from decoders.swig_wrapper import Scorer
from decoders.swig_wrapper import ctc_greedy_decoder
from decoders.swig_wrapper import ctc_beam_search_decoder_batch
from model_utils.network import deep_speech_v2_network

logging.basicConfig(
    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')


class DeepSpeech2Model(object):
    """DeepSpeech2Model class.

    :param vocab_size: Decoding vocabulary size.
    :type vocab_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_layer_size: RNN layer size (number of RNN cells).
    :type rnn_layer_size: int
    :param use_gru: Flag forwarded unchanged to the network builder as
                    `use_gru` (presumably selects GRU cells instead of
                    simple RNN cells -- confirm in model_utils.network).
    :type use_gru: bool
    :param pretrained_model_path: Pretrained model path. If None, will train
                                  from scratch.
    :type pretrained_model_path: basestring|None
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs. Notice
                              that for GRU, weight sharing is not supported.
    :type share_rnn_weights: bool
    """

    def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
                 rnn_layer_size, use_gru, pretrained_model_path,
                 share_rnn_weights):
        # The network must exist before parameters can be created/loaded,
        # because parameter creation is driven by the loss layer.
        self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
                             rnn_layer_size, use_gru, share_rnn_weights)
        self._create_parameters(pretrained_model_path)
        # Inference objects and the external scorer are built lazily on
        # first use and then cached.
        self._inferer = None
        self._loss_inferer = None
        self._ext_scorer = None
        self._loaded_lm_path = None
        self._num_conv_layers = num_conv_layers
        self.logger = logging.getLogger("")
        self.logger.setLevel(level=logging.INFO)

    def train(self,
              train_batch_reader,
              dev_batch_reader,
              feeding_dict,
              learning_rate,
              gradient_clipping,
              num_passes,
              output_model_dir,
              is_local=True,
              num_iterations_print=100,
              test_off=False):
        """Train the model.

        :param train_batch_reader: Train data reader.
        :type train_batch_reader: callable
        :param dev_batch_reader: Validation data reader.
        :type dev_batch_reader: callable
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :param learning_rate: Learning rate for ADAM optimizer.
        :type learning_rate: float
        :param gradient_clipping: Gradient clipping threshold.
        :type gradient_clipping: float
        :param num_passes: Number of training epochs.
        :type num_passes: int
        :param num_iterations_print: Number of training iterations for
                                     printing a training loss.
        :type num_iterations_print: int
        :param is_local: Set to False if running with pserver with multi-nodes.
        :type is_local: bool
        :param output_model_dir: Directory for saving the model (every pass).
        :type output_model_dir: basestring
        :param test_off: Turn off testing.
        :type test_off: bool
        """
        # prepare model output directory
        if not os.path.exists(output_model_dir):
            mkpath(output_model_dir)

        # adapt the feeding dict and reader according to the network
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)
        adapted_train_batch_reader = self._adapt_data(train_batch_reader)
        adapted_dev_batch_reader = self._adapt_data(dev_batch_reader)

        # prepare optimizer and trainer
        optimizer = paddle.optimizer.Adam(
            learning_rate=learning_rate,
            gradient_clipping_threshold=gradient_clipping)
        trainer = paddle.trainer.SGD(
            cost=self._loss,
            parameters=self._parameters,
            update_equation=optimizer,
            is_local=is_local)

        # Running statistics shared between event_handler invocations. They
        # live in a closure-local dict instead of module-level globals so
        # that separate training runs cannot interfere with each other and
        # the handler never reads an undefined name.
        stats = {"start_time": time.time(), "cost_sum": 0.0, "cost_counter": 0}

        def save_parameters(filename):
            """Dump the current trainer parameters to output_model_dir."""
            output_model_path = os.path.join(output_model_dir, filename)
            with gzip.open(output_model_path, 'w') as f:
                trainer.save_parameter_to_tar(f)

        # create event handler
        def event_handler(event):
            if isinstance(event, paddle.event.EndIteration):
                stats["cost_sum"] += event.cost
                stats["cost_counter"] += 1
                if (event.batch_id + 1) % num_iterations_print == 0:
                    # Periodic checkpoint plus averaged training cost since
                    # the previous report.
                    save_parameters("params.latest.tar.gz")
                    print("\nPass: %d, Batch: %d, TrainCost: %f" %
                          (event.pass_id, event.batch_id + 1,
                           stats["cost_sum"] / stats["cost_counter"]))
                    stats["cost_sum"], stats["cost_counter"] = 0.0, 0
                else:
                    sys.stdout.write('.')
                    sys.stdout.flush()
            if isinstance(event, paddle.event.BeginPass):
                stats["start_time"] = time.time()
                stats["cost_sum"], stats["cost_counter"] = 0.0, 0
            if isinstance(event, paddle.event.EndPass):
                elapsed = time.time() - stats["start_time"]
                if test_off:
                    print("\n------- Time: %d sec,  Pass: %d" %
                          (elapsed, event.pass_id))
                else:
                    result = trainer.test(
                        reader=adapted_dev_batch_reader,
                        feeding=adapted_feeding_dict)
                    print("\n------- Time: %d sec,  Pass: %d, "
                          "ValidationCost: %s" %
                          (elapsed, event.pass_id, result.cost))
                # The model is saved at the end of every pass, whether or
                # not validation ran.
                save_parameters("params.pass-%d.tar.gz" % event.pass_id)

        # run train
        trainer.train(
            reader=adapted_train_batch_reader,
            event_handler=event_handler,
            num_passes=num_passes,
            feeding=adapted_feeding_dict)

    # TODO(@pkuyym) merge this function into infer_batch
    def infer_loss_batch(self, infer_data):
        """Model inference. Infer the ctc loss for a batch of speech
        utterances.

        :param infer_data: List of utterances to infer, with each utterance a
                           tuple of audio features and transcription text
                           (empty string).
        :type infer_data: list
        :return: List of ctc loss.
        :rtype: List of float
        """
        # define inferer (created once, then reused)
        if self._loss_inferer is None:
            self._loss_inferer = paddle.inference.Inference(
                output_layer=self._loss, parameters=self._parameters)
        # run inference
        return self._loss_inferer.infer(input=infer_data)

    def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
                    beam_size, cutoff_prob, cutoff_top_n, vocab_list,
                    language_model_path, num_processes, feeding_dict):
        """Model inference. Infer the transcription for a batch of speech
        utterances.

        :param infer_data: List of utterances to infer, with each utterance
                           consisting of a tuple of audio features and
                           transcription text (empty string).
        :type infer_data: list
        :param decoding_method: Decoding method name, 'ctc_greedy' or
                                'ctc_beam_search'.
        :type decoding_method: string
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param beam_size: Width for Beam search.
        :type beam_size: int
        :param cutoff_prob: Cutoff probability in pruning,
                            default 1.0, no pruning.
        :type cutoff_prob: float
        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                             characters with highest probs in vocabulary will
                             be used in beam search, default 40.
        :type cutoff_top_n: int
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :param language_model_path: Filepath for language model.
        :type language_model_path: basestring|None
        :param num_processes: Number of processes (CPU) for decoder.
        :type num_processes: int
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :return: List of transcription texts.
        :rtype: List of basestring
        :raises ValueError: If decoding_method is not one of the supported
                            names.
        """
        # define inferer (created once, then reused)
        if self._inferer is None:
            self._inferer = paddle.inference.Inference(
                output_layer=self._log_probs, parameters=self._parameters)
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)
        adapted_infer_data = self._adapt_data(infer_data)
        # run inference
        infer_results = self._inferer.infer(
            input=adapted_infer_data, feeding=adapted_feeding_dict)

        # The inferer returns the rows of all utterances concatenated.
        # Field 3 of every adapted instance is [valid sequence length], so a
        # running sum of those lengths gives each utterance's row range.
        num_utts = len(adapted_infer_data)
        start_pos = [0] * (num_utts + 1)
        for i in range(num_utts):
            start_pos[i + 1] = start_pos[i] + adapted_infer_data[i][3][0]
        probs_split = [
            infer_results[start_pos[i]:start_pos[i + 1]]
            for i in range(num_utts)
        ]

        # run decoder
        results = []
        if decoding_method == "ctc_greedy":
            # best path decode
            results = [
                ctc_greedy_decoder(probs_seq=probs, vocabulary=vocab_list)
                for probs in probs_split
            ]
        elif decoding_method == "ctc_beam_search":
            # initialize external scorer
            if self._ext_scorer is None:
                self._loaded_lm_path = language_model_path
                self.logger.info("begin to initialize the external scorer "
                                 "for decoding")
                self._ext_scorer = Scorer(beam_alpha, beam_beta,
                                          language_model_path, vocab_list)
                lm_char_based = self._ext_scorer.is_character_based()
                lm_max_order = self._ext_scorer.get_max_order()
                lm_dict_size = self._ext_scorer.get_dict_size()
                self.logger.info(
                    "language model: is_character_based = %d,"
                    " max_order = %d, dict_size = %d",
                    lm_char_based, lm_max_order, lm_dict_size)
                self.logger.info("end initializing scorer. Start decoding ...")
            else:
                # The scorer is cached; only its weights can be changed, the
                # language model itself must stay the same.
                self._ext_scorer.reset_params(beam_alpha, beam_beta)
                assert self._loaded_lm_path == language_model_path
            # beam search decode
            num_processes = min(num_processes, len(probs_split))
            beam_search_results = ctc_beam_search_decoder_batch(
                probs_split=probs_split,
                vocabulary=vocab_list,
                beam_size=beam_size,
                num_processes=num_processes,
                ext_scoring_func=self._ext_scorer,
                cutoff_prob=cutoff_prob,
                cutoff_top_n=cutoff_top_n)

            # Keep element [1] of the first entry returned per utterance.
            results = [result[0][1] for result in beam_search_results]
        else:
            raise ValueError("Decoding method [%s] is not supported." %
                             decoding_method)
        return results

    def _adapt_feeding_dict(self, feeding_dict):
        """Adapt feeding dict according to network struct.

        To remove impacts from padding part, we add scale_sub_region layer and
        sub_seq layer. For sub_seq layer, 'sequence_offset' and
        'sequence_length' fields are appended. For each scale_sub_region layer
        'convN_index_range' field is appended.

        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :return: Adapted feeding dict.
        :rtype: dict|list
        :raises ValueError: If feeding_dict is neither a dict nor a list.
        """
        # Work on a copy so the caller's feeding_dict is never mutated.
        adapted_feeding_dict = copy.deepcopy(feeding_dict)
        if isinstance(feeding_dict, dict):
            # Each new field takes the next free tuple index, i.e. the
            # current size of the dict.
            adapted_feeding_dict["sequence_offset"] = len(adapted_feeding_dict)
            adapted_feeding_dict["sequence_length"] = len(adapted_feeding_dict)
            for i in range(self._num_conv_layers):
                adapted_feeding_dict["conv%d_index_range" %i] = \
                        len(adapted_feeding_dict)
        elif isinstance(feeding_dict, list):
            adapted_feeding_dict.append("sequence_offset")
            adapted_feeding_dict.append("sequence_length")
            for i in range(self._num_conv_layers):
                adapted_feeding_dict.append("conv%d_index_range" % i)
        else:
            raise ValueError("Type of feeding_dict is %s, not supported." %
                             type(feeding_dict))

        return adapted_feeding_dict

    def _adapt_data(self, data):
        """Adapt data according to network struct.

        For each convolution layer in the conv_group, to remove impacts from
        padding data, we can multiply zero to the padding part of the outputs
        of each batch normalization layer. We add a scale_sub_region layer
        after each batch normalization layer to reset the padding data.
        For rnn layers, to remove impacts from padding data, we can truncate
        the padding part before output data fed into the first rnn layer. We
        use sub_seq layer to achieve this.

        :param data: Data from data_provider.
        :type data: list|function
        :return: Adapted data.
        :rtype: list|function
        :raises ValueError: If data is neither a list nor a generator
                            function, or an instance has a size other than
                            2 or 3.
        """

        def adapt_instance(instance):
            """Append sequence offset/length and index ranges to one sample."""
            if len(instance) < 2 or len(instance) > 3:
                raise ValueError("Size of instance should be 2 or 3.")
            padded_audio = instance[0]
            text = instance[1]
            # no padding part
            if len(instance) == 2:
                audio_len = padded_audio.shape[1]
            else:
                audio_len = instance[2]
            adapted_instance = [padded_audio, text]
            # Stride size for conv0 is (3, 2)
            # Stride size for conv1 to convN is (1, 2)
            # Same as the network, hard-coded here
            # (axis 0 of the audio is reduced by 2, axis 1 by 3)
            padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1
            padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1
            valid_w = (audio_len - 1) // 3 + 1
            adapted_instance += [
                [0],  # sequence offset, always 0
                [valid_w],  # valid sequence length
                # Index ranges for channel, height and width
                # Please refer scale_sub_region layer to see details
                [1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w]
            ]
            # Every further conv layer halves the height again; the width
            # range is the same as for conv0.
            pre_padded_h = padded_conv0_h
            for i in range(self._num_conv_layers - 1):
                padded_h = (pre_padded_h - 1) // 2 + 1
                pre_padded_h = padded_h
                adapted_instance += [
                    [1, 32, 1, padded_h, valid_w + 1, padded_conv0_w]
                ]
            return adapted_instance

        if isinstance(data, list):
            # A real list is required: callers use len() and indexing on it.
            return [adapt_instance(instance) for instance in data]
        elif inspect.isgeneratorfunction(data):

            def adapted_reader():
                # Each item yielded by the wrapped reader is a batch
                # (an iterable of instances).
                for batch in data():
                    yield [adapt_instance(instance) for instance in batch]

            return adapted_reader
        else:
            raise ValueError("Type of data is %s, not supported." % type(data))

    def _create_parameters(self, model_path=None):
        """Load or create model parameters.

        :param model_path: Path of a gzip-compressed parameter tar file. If
                           None, fresh parameters are created from the loss
                           layer.
        :type model_path: basestring|None
        """
        if model_path is None:
            self._parameters = paddle.parameters.create(self._loss)
        else:
            # Close the archive once the parameters have been read instead
            # of leaking the file handle.
            with gzip.open(model_path) as f:
                self._parameters = paddle.parameters.Parameters.from_tar(f)

    def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
                        rnn_layer_size, use_gru, share_rnn_weights):
        """Create data layers and model network.

        Stores the two values returned by deep_speech_v2_network as
        self._log_probs and self._loss. Parameters have the same meaning as
        in the class docstring.
        """
        # paddle.data_type.dense_array is used for variable batch input.
        # The size 161 * 161 is only an placeholder value and the real shape
        # of input batch data will be induced during training.
        audio_data = paddle.layer.data(
            name="audio_spectrogram",
            type=paddle.data_type.dense_array(161 * 161))
        text_data = paddle.layer.data(
            name="transcript_text",
            type=paddle.data_type.integer_value_sequence(vocab_size))
        seq_offset_data = paddle.layer.data(
            name='sequence_offset',
            type=paddle.data_type.integer_value_sequence(1))
        seq_len_data = paddle.layer.data(
            name='sequence_length',
            type=paddle.data_type.integer_value_sequence(1))
        # One index-range input per convolution layer, matching the
        # 'convN_index_range' fields produced by _adapt_feeding_dict and
        # _adapt_data (which both iterate over the number of conv layers).
        index_range_datas = []
        for i in range(num_conv_layers):
            index_range_datas.append(
                paddle.layer.data(
                    name='conv%d_index_range' % i,
                    type=paddle.data_type.dense_vector(6)))

        self._log_probs, self._loss = deep_speech_v2_network(
            audio_data=audio_data,
            text_data=text_data,
            seq_offset_data=seq_offset_data,
            seq_len_data=seq_len_data,
            index_range_datas=index_range_datas,
            dict_size=vocab_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_layer_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)