diff --git a/data_utils/data.py b/data_utils/data.py
index 1ff4a9138..245daf5c3 100644
--- a/data_utils/data.py
+++ b/data_utils/data.py
@@ -187,6 +187,9 @@ class DataGenerator():
             manifest_path=manifest_path,
             max_duration=self._max_duration,
             min_duration=self._min_duration)
+
+
+
         # sort (by duration) or batch-wise shuffle the manifest
         if self._epoch == 0 and sortagrad:
             manifest.sort(key=lambda x: x["duration"])
diff --git a/data_utils/dataset.py b/data_utils/dataset.py
index eaec0e401..67c1b57ee 100644
--- a/data_utils/dataset.py
+++ b/data_utils/dataset.py
@@ -12,11 +12,547 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
+import random
+import tarfile
+import numpy as np
 import paddle
 from paddle.io import Dataset
 from paddle.io import DataLoader
+from paddle.io import BatchSampler
+from paddle.io import DistributedBatchSampler
+from collections import namedtuple
+from functools import partial
+
+from data_utils.utility import read_manifest
+from data_utils.augmentor.augmentation import AugmentationPipeline
+from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
+from data_utils.speech import SpeechSegment
+from data_utils.normalizer import FeatureNormalizer
 
 
 class DeepSpeech2Dataset(Dataset):
-    def __init__(self):
+    def __init__(self,
+                 manifest_path,
+                 vocab_filepath,
+                 mean_std_filepath,
+                 augmentation_config='{}',
+                 max_duration=float('inf'),
+                 min_duration=0.0,
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 max_freq=None,
+                 specgram_type='linear',
+                 use_dB_normalization=True,
+                 random_seed=0,
+                 keep_transcription_text=False):
         super().__init__()
+
+        self._max_duration = max_duration
+        self._min_duration = min_duration
+        self._normalizer = FeatureNormalizer(mean_std_filepath)
+        self._augmentation_pipeline = AugmentationPipeline(
+            augmentation_config=augmentation_config, random_seed=random_seed)
+        self._speech_featurizer = SpeechFeaturizer(
+            vocab_filepath=vocab_filepath,
+            specgram_type=specgram_type,
+            stride_ms=stride_ms,
+            window_ms=window_ms,
+            max_freq=max_freq,
+            use_dB_normalization=use_dB_normalization)
+        self._rng = random.Random(random_seed)
+        self._keep_transcription_text = keep_transcription_text
+        # for caching tar file info
+        self._local_data = namedtuple('local_data', ['tar2info', 'tar2object'])
+        self._local_data.tar2info = {}
+        self._local_data.tar2object = {}
+
+        # read manifest
+        self._manifest = read_manifest(
+            manifest_path=manifest_path,
+            max_duration=self._max_duration,
+            min_duration=self._min_duration)
+        self._manifest.sort(key=lambda x: x["duration"])
+
+    @property
+    def manifest(self):
+        return self._manifest
+
+    @property
+    def vocab_size(self):
+        """Return the vocabulary size.
+
+        :return: Vocabulary size.
+        :rtype: int
+        """
+        return self._speech_featurizer.vocab_size
+
+    @property
+    def vocab_list(self):
+        """Return the vocabulary as a list.
+
+        :return: Vocabulary as a list.
+        :rtype: list
+        """
+        return self._speech_featurizer.vocab_list
+
+    def _parse_tar(self, file):
+        """Parse a tar file into a tarfile object
+        and a dict mapping member names to tarinfo objects.
+        """
+        result = {}
+        f = tarfile.open(file)
+        for tarinfo in f.getmembers():
+            result[tarinfo.name] = tarinfo
+        return f, result
+
+    def _subfile_from_tar(self, file):
+        """Get a subfile object from a tar archive.
+
+        It returns a file object for the subfile inside the tar archive
+        and caches the tar file info for subsequent reading requests.
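+
+        For example (illustrative path), an entry like
+        ``tar:/data/audio.tar#utt_001.wav`` yields the file object for
+        member ``utt_001.wav`` inside ``/data/audio.tar``.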
+ """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: + object, infoes = self._parse_tar(tarpath) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) + + def process_utterance(self, audio_file, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param transcript: Transcription text. + :type transcript: str + :return: Tuple of audio feature tensor and data of transcription part, + where transcription part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), transcript) + else: + speech_segment = SpeechSegment.from_file(audio_file, transcript) + self._augmentation_pipeline.transform_audio(speech_segment) + specgram, transcript_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + specgram = self._normalizer.apply(specgram) + return specgram, transcript_part + + def _instance_reader_creator(self, manifest): + """ + Instance reader creator. Create a callable function to produce + instances of data. + + Instance: a tuple of ndarray of audio spectrogram and a list of + token indices for transcript. + """ + + def reader(): + for instance in manifest: + inst = self.process_utterance(instance["audio_filepath"], + instance["text"]) + yield inst + + return reader + + def __len__(self): + return len(self._manifest) + + def __getitem__(self, idx): + instance = self._manifest[idx] + return self.process_utterance(instance["audio_filepath"], + instance["text"]) + + +class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler): + def __init__(self, + dataset, + batch_size, + num_replicas=None, + rank=None, + shuffle=False, + drop_last=False, + sortagrad=False, + shuffle_method="batch_shuffle"): + super().__init__(dataset, batch_size, num_replicas, rank, shuffle, + drop_last) + self._sortagrad = sortagrad + self._shuffle_method = shuffle_method + + def _batch_shuffle(self, manifest, batch_size, clipped=False): + """Put similarly-sized instances into minibatches for better efficiency + and make a batch-wise shuffle. + + 1. Sort the audio clips by duration. + 2. Generate a random number `k`, k in [0, batch_size). + 3. Randomly shift `k` instances in order to create different batches + for different epochs. Create minibatches. + 4. Shuffle the minibatches. + + :param manifest: Manifest contents. List of dict. + :type manifest: list + :param batch_size: Batch size. This size is also used for generate + a random number for batch shuffle. + :type batch_size: int + :param clipped: Whether to clip the heading (small shift) and trailing + (incomplete batch) instances. + :type clipped: bool + :return: Batch shuffled mainifest. 
+        rng = np.random.RandomState(self.epoch)
+        # the indices already follow duration order because the dataset
+        # manifest is sorted by duration when it is loaded
+        shift_len = rng.randint(0, batch_size - 1)
+        batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
+        rng.shuffle(batch_indices)
+        batch_indices = [item for batch in batch_indices for item in batch]
+        if not clipped:
+            res_len = len(indices) - shift_len - len(batch_indices)
+            # guard res_len == 0, where indices[-0:] would be the whole list
+            if res_len > 0:
+                batch_indices.extend(indices[-res_len:])
+            batch_indices.extend(indices[0:shift_len])
+        return batch_indices
+
+    def __iter__(self):
+        num_samples = len(self.dataset)
+        indices = np.arange(num_samples).tolist()
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # sort (by duration) or batch-wise shuffle the manifest
+        if self.shuffle:
+            if self.epoch == 0 and self._sortagrad:
+                pass
+            else:
+                if self._shuffle_method == "batch_shuffle":
+                    indices = self._batch_shuffle(
+                        indices, self.batch_size, clipped=False)
+                elif self._shuffle_method == "instance_shuffle":
+                    np.random.RandomState(self.epoch).shuffle(indices)
+                else:
+                    raise ValueError("Unknown shuffle method %s." %
+                                     self._shuffle_method)
+        assert len(indices) == self.total_size
+        self.epoch += 1
+
+        # subsample
+        def _get_indices_by_batch_size(indices):
+            subsampled_indices = []
+            last_batch_size = self.total_size % (self.batch_size * self.nranks)
+            assert last_batch_size % self.nranks == 0
+            last_local_batch_size = last_batch_size // self.nranks
+
+            for i in range(self.local_rank * self.batch_size,
+                           len(indices) - last_batch_size,
+                           self.batch_size * self.nranks):
+                subsampled_indices.extend(indices[i:i + self.batch_size])
+
+            indices = indices[len(indices) - last_batch_size:]
+            subsampled_indices.extend(
+                indices[self.local_rank * last_local_batch_size:(
+                    self.local_rank + 1) * last_local_batch_size])
+            return subsampled_indices
+
+        if self.nranks > 1:
+            indices = _get_indices_by_batch_size(indices)
+
+        assert len(indices) == self.num_samples
+        _sample_iter = iter(indices)
+
+        batch_indices = []
+        for idx in _sample_iter:
+            batch_indices.append(idx)
+            if len(batch_indices) == self.batch_size:
+                yield batch_indices
+                batch_indices = []
+        if not self.drop_last and len(batch_indices) > 0:
+            yield batch_indices
+
+    def __len__(self):
+        num_samples = self.num_samples
+        num_samples += int(not self.drop_last) * (self.batch_size - 1)
+        return num_samples // self.batch_size
+
+
+class DeepSpeech2BatchSampler(BatchSampler):
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 shuffle=False,
+                 drop_last=False,
+                 sortagrad=False,
+                 shuffle_method="batch_shuffle",
+                 num_replicas=1,
+                 rank=0):
+        self.dataset = dataset
+
+        assert isinstance(batch_size, int) and batch_size > 0, \
+            "batch_size should be a positive integer"
+        self.batch_size = batch_size
+        assert isinstance(shuffle, bool), \
+            "shuffle should be a boolean value"
+        self.shuffle = shuffle
+        assert isinstance(drop_last, bool), \
+            "drop_last should be a boolean value"
+
+        if num_replicas is not None:
+            assert isinstance(num_replicas, int) and num_replicas > 0, \
+                "num_replicas should be a positive integer"
+        # fall back to a single replica when num_replicas is None
+        self.nranks = num_replicas if num_replicas is not None else 1
+
+        if rank is not None:
+            assert isinstance(rank, int) and rank >= 0, \
+                "rank should be a non-negative integer"
+        # fall back to rank 0 when rank is None
+        self.local_rank = rank if rank is not None else 0
+
+        self.drop_last = drop_last
+        self.epoch = 0
+        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
+        self.total_size = self.num_samples * self.nranks
+        self._sortagrad = sortagrad
+        self._shuffle_method = shuffle_method
+
+    def _batch_shuffle(self, indices, batch_size, clipped=False):
+        """Put similarly-sized instances into minibatches for better efficiency
+        and make a batch-wise shuffle.
+
+        1. Sort the audio clips by duration (already done when the dataset
+           manifest is loaded).
+        2. Generate a random number `k`, k in [0, batch_size).
+        3. Randomly shift `k` instances in order to create different batches
+           for different epochs. Create minibatches.
+        4. Shuffle the minibatches.
+
+        :param indices: Dataset indices, ordered by utterance duration.
+        :type indices: list
+        :param batch_size: Batch size. This size is also used to generate
+                           a random number for batch shuffle.
+        :type batch_size: int
+        :param clipped: Whether to clip the heading (small shift) and trailing
+                        (incomplete batch) instances.
+        :type clipped: bool
+        :return: Batch-shuffled indices.
+        :rtype: list
+        """
+        rng = np.random.RandomState(self.epoch)
+        # the indices already follow duration order because the dataset
+        # manifest is sorted by duration when it is loaded
+        shift_len = rng.randint(0, batch_size - 1)
+        batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
+        rng.shuffle(batch_indices)
+        batch_indices = [item for batch in batch_indices for item in batch]
+        if not clipped:
+            res_len = len(indices) - shift_len - len(batch_indices)
+            # guard res_len == 0, where indices[-0:] would be the whole list
+            if res_len > 0:
+                batch_indices.extend(indices[-res_len:])
+            batch_indices.extend(indices[0:shift_len])
+        return batch_indices
+
+    def __iter__(self):
+        num_samples = len(self.dataset)
+        indices = np.arange(num_samples).tolist()
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # sort (by duration) or batch-wise shuffle the manifest
+        if self.shuffle:
+            if self.epoch == 0 and self._sortagrad:
+                pass
+            else:
+                if self._shuffle_method == "batch_shuffle":
+                    indices = self._batch_shuffle(
+                        indices, self.batch_size, clipped=False)
+                elif self._shuffle_method == "instance_shuffle":
+                    np.random.RandomState(self.epoch).shuffle(indices)
+                else:
+                    raise ValueError("Unknown shuffle method %s." %
+                                     self._shuffle_method)
+        assert len(indices) == self.total_size
+        self.epoch += 1
+
+        # subsample
+        def _get_indices_by_batch_size(indices):
+            subsampled_indices = []
+            last_batch_size = self.total_size % (self.batch_size * self.nranks)
+            assert last_batch_size % self.nranks == 0
+            last_local_batch_size = last_batch_size // self.nranks
+
+            for i in range(self.local_rank * self.batch_size,
+                           len(indices) - last_batch_size,
+                           self.batch_size * self.nranks):
+                subsampled_indices.extend(indices[i:i + self.batch_size])
+
+            indices = indices[len(indices) - last_batch_size:]
+            subsampled_indices.extend(
+                indices[self.local_rank * last_local_batch_size:(
+                    self.local_rank + 1) * last_local_batch_size])
+            return subsampled_indices
+
+        if self.nranks > 1:
+            indices = _get_indices_by_batch_size(indices)
+
+        assert len(indices) == self.num_samples
+        _sample_iter = iter(indices)
+
+        batch_indices = []
+        for idx in _sample_iter:
+            batch_indices.append(idx)
+            if len(batch_indices) == self.batch_size:
+                yield batch_indices
+                batch_indices = []
+        if not self.drop_last and len(batch_indices) > 0:
+            yield batch_indices
+
+    def __len__(self):
+        num_samples = self.num_samples
+        num_samples += int(not self.drop_last) * (self.batch_size - 1)
+        return num_samples // self.batch_size
+
+    def set_epoch(self, epoch):
+        """
+        Sets the epoch number. When :attr:`shuffle=True`, this number is used
+        as the seed for the random ordering. By default users may not set it;
+        all replicas (workers) then use a different random ordering for each
+        epoch. If the same number is set at each epoch, this sampler will
+        yield the same ordering at all epochs.
+
+        Arguments:
+            epoch (int): Epoch number.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+
+                from paddle.io import Dataset, DistributedBatchSampler
+
+                # init with dataset
+                class RandomDataset(Dataset):
+                    def __init__(self, num_samples):
+                        self.num_samples = num_samples
+
+                    def __getitem__(self, idx):
+                        image = np.random.random([784]).astype('float32')
+                        label = np.random.randint(0, 9, (1, )).astype('int64')
+                        return image, label
+
+                    def __len__(self):
+                        return self.num_samples
+
+                dataset = RandomDataset(100)
+                sampler = DistributedBatchSampler(dataset, batch_size=64)
+
+                for epoch in range(10):
+                    sampler.set_epoch(epoch)
+        """
+        self.epoch = epoch
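+
+
+# Example usage of ``create_dataloader`` (defined below); file names are
+# illustrative:
+#
+#     loader = create_dataloader("manifest.dev", "vocab.txt", "mean_std.npz",
+#                                batch_size=32, is_training=True,
+#                                shuffle_method="batch_shuffle")
+#     for padded_audios, texts, audio_lens, text_lens in loader:
+#         ...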
+def create_dataloader(manifest_path,
+                      vocab_filepath,
+                      mean_std_filepath,
+                      augmentation_config='{}',
+                      max_duration=float('inf'),
+                      min_duration=0.0,
+                      stride_ms=10.0,
+                      window_ms=20.0,
+                      max_freq=None,
+                      specgram_type='linear',
+                      use_dB_normalization=True,
+                      random_seed=0,
+                      keep_transcription_text=False,
+                      is_training=False,
+                      batch_size=1,
+                      num_workers=0,
+                      sortagrad=False,
+                      shuffle_method=None,
+                      dist=False):
+
+    dataset = DeepSpeech2Dataset(
+        manifest_path,
+        vocab_filepath,
+        mean_std_filepath,
+        augmentation_config=augmentation_config,
+        max_duration=max_duration,
+        min_duration=min_duration,
+        stride_ms=stride_ms,
+        window_ms=window_ms,
+        max_freq=max_freq,
+        specgram_type=specgram_type,
+        use_dB_normalization=use_dB_normalization,
+        random_seed=random_seed,
+        keep_transcription_text=keep_transcription_text)
+
+    if dist:
+        batch_sampler = DeepSpeech2DistributedBatchSampler(
+            dataset,
+            batch_size,
+            num_replicas=None,
+            rank=None,
+            shuffle=is_training,
+            drop_last=is_training,
+            sortagrad=sortagrad,
+            shuffle_method=shuffle_method)
+    else:
+        batch_sampler = DeepSpeech2BatchSampler(
+            dataset,
+            shuffle=is_training,
+            batch_size=batch_size,
+            drop_last=is_training,
+            sortagrad=sortagrad,
+            shuffle_method=shuffle_method)
+
+    def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):
+        """
+        Pad audio features with zeros to make them have the same shape (or
+        a user-defined shape) within one batch.
+
+        If ``padding_to`` is -1, the maximum shape in the batch will be used
+        as the target shape for padding. Otherwise, ``padding_to`` will be the
+        target shape (only refers to the second axis).
+
+        If ``flatten`` is True, features will be flattened into a 1-D array.
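+
+        Returns a tuple ``(padded_audios, texts, audio_lens, text_lens)``
+        with shapes ``[B, D, Tmax]``, ``[B, Lmax]``, ``[B]``, ``[B]`` and
+        dtypes float32, int32, int64, int64 respectively.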
+ """ + new_batch = [] + # get target shape + max_length = max([audio.shape[1] for audio, text in batch]) + if padding_to != -1: + if padding_to < max_length: + raise ValueError("If padding_to is not -1, it should be larger " + "than any instance's shape in the batch") + max_length = padding_to + max_text_length = max([len(text) for audio, text in batch]) + # padding + padded_audios = [] + audio_lens = [] + texts, text_lens = [], [] + for audio, text in batch: + padded_audio = np.zeros([audio.shape[0], max_length]) + padded_audio[:, :audio.shape[1]] = audio + if flatten: + padded_audio = padded_audio.flatten() + padded_audios.append(padded_audio) + audio_lens.append(audio.shape[1]) + padded_text = np.zeros([max_text_length]) + padded_text[:len(text)] = text + texts.append(padded_text) + text_lens.append(len(text)) + + padded_audios = np.array(padded_audios).astype('float32') + audio_lens = np.array(audio_lens).astype('int64') + texts = np.array(texts).astype('int32') + text_lens = np.array(text_lens).astype('int64') + return padded_audios, texts, audio_lens, text_lens + + loader = DataLoader( + dataset, + batch_sampler=batch_sampler, + collate_fn=partial(padding_batch, is_training=is_training), + num_workers=num_workers, ) + return loader diff --git a/infer.py b/infer.py index ffcb48eb6..11a4ad7ab 100644 --- a/infer.py +++ b/infer.py @@ -18,6 +18,7 @@ import argparse import functools import paddle.fluid as fluid from data_utils.data import DataGenerator +from data_utils.dataset import create_dataloader from model_utils.model import DeepSpeech2Model from model_utils.model_check import check_cuda, check_version from utils.error_rate import wer, cer @@ -80,75 +81,114 @@ def infer(): # check if paddlepaddle version is satisfied check_version() - if args.use_gpu: - place = fluid.CUDAPlace(0) - else: - place = fluid.CPUPlace() - - data_generator = DataGenerator( - vocab_filepath=args.vocab_path, - mean_std_filepath=args.mean_std_path, - augmentation_config='{}', - specgram_type=args.specgram_type, - keep_transcription_text=True, - place = place, - is_training = False) - batch_reader = data_generator.batch_reader_creator( - manifest_path=args.infer_manifest, - batch_size=args.num_samples, - sortagrad=False, - shuffle_method=None) - infer_data = next(batch_reader()) - - ds2_model = DeepSpeech2Model( - vocab_size=data_generator.vocab_size, + # data_generator = DataGenerator( + # vocab_filepath=args.vocab_path, + # mean_std_filepath=args.mean_std_path, + # augmentation_config='{}', + # specgram_type=args.specgram_type, + # keep_transcription_text=True, + # place = place, + # is_training = False) + # batch_reader = data_generator.batch_reader_creator( + # manifest_path=args.infer_manifest, + # batch_size=args.num_samples, + # sortagrad=False, + # shuffle_method=None) + + batch_reader = create_dataloader( + manifest_path=args.infer_manifest, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, + augmentation_config='{}', + max_duration=float('inf'), + min_duration=0.0, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + specgram_type=args.specgram_type, + use_dB_normalization=True, + random_seed=0, + keep_transcription_text=False, + is_training=False, + batch_size=args.num_samples, + sortagrad=False, + shuffle_method=None) + + #for audio, text, audio_len, text_len in batch_reader: + # print(audio.shape) + # print(text.shape) + # print(audio_len) + # print(text_len) + # break + + reader = batch_reader() + infer_data = reader.next() + print(infer_data) + + from 
model_utils.network2 import DeepSpeech2 + feat_dim=161 + model = DeepSpeech2( + feat_size=feat_dim, + dict_size=batch_reader.dataset.vocab_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, - rnn_layer_size=args.rnn_layer_size, + #rnn_size=1024, use_gru=args.use_gru, share_rnn_weights=args.share_rnn_weights, - place=place, - init_from_pretrained_model=args.model_path) - - # decoders only accept string encoded in utf-8 - vocab_list = [chars for chars in data_generator.vocab_list] - - if args.decoding_method == "ctc_greedy": - ds2_model.logger.info("start inference ...") - probs_split = ds2_model.infer_batch_probs( - infer_data=infer_data, - feeding_dict=data_generator.feeding) - - result_transcripts = ds2_model.decode_batch_greedy( - probs_split=probs_split, - vocab_list=vocab_list) - else: - ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, - vocab_list) - ds2_model.logger.info("start inference ...") - probs_split= ds2_model.infer_batch_probs( - infer_data=infer_data, - feeding_dict=data_generator.feeding) - - result_transcripts= ds2_model.decode_batch_beam_search( - probs_split=probs_split, - beam_alpha=args.alpha, - beam_beta=args.beta, - beam_size=args.beam_size, - cutoff_prob=args.cutoff_prob, - cutoff_top_n=args.cutoff_top_n, - vocab_list=vocab_list, - num_processes=args.num_proc_bsearch) - - error_rate_func = cer if args.error_rate_type == 'cer' else wer - target_transcripts = infer_data[1] - for target, result in zip(target_transcripts, result_transcripts): - print("\nTarget Transcription: %s\nOutput Transcription: %s" % - (target, result)) - print("Current error rate [%s] = %f" % - (args.error_rate_type, error_rate_func(target, result))) - - ds2_model.logger.info("finish inference") + ) + + output = model(*infer_data) + print(output) + + # ds2_model = DeepSpeech2Model( + # vocab_size=data_generator.vocab_size, + # num_conv_layers=args.num_conv_layers, + # num_rnn_layers=args.num_rnn_layers, + # rnn_layer_size=args.rnn_layer_size, + # use_gru=args.use_gru, + # share_rnn_weights=args.share_rnn_weights, + # place=place, + # init_from_pretrained_model=args.model_path) + + # # decoders only accept string encoded in utf-8 + # vocab_list = [chars for chars in data_generator.vocab_list] + + # if args.decoding_method == "ctc_greedy": + # ds2_model.logger.info("start inference ...") + # probs_split = ds2_model.infer_batch_probs( + # infer_data=infer_data, + # feeding_dict=data_generator.feeding) + + # result_transcripts = ds2_model.decode_batch_greedy( + # probs_split=probs_split, + # vocab_list=vocab_list) + # else: + # ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, + # vocab_list) + # ds2_model.logger.info("start inference ...") + # probs_split= ds2_model.infer_batch_probs( + # infer_data=infer_data, + # feeding_dict=data_generator.feeding) + + # result_transcripts= ds2_model.decode_batch_beam_search( + # probs_split=probs_split, + # beam_alpha=args.alpha, + # beam_beta=args.beta, + # beam_size=args.beam_size, + # cutoff_prob=args.cutoff_prob, + # cutoff_top_n=args.cutoff_top_n, + # vocab_list=vocab_list, + # num_processes=args.num_proc_bsearch) + + # error_rate_func = cer if args.error_rate_type == 'cer' else wer + # target_transcripts = infer_data[1] + # for target, result in zip(target_transcripts, result_transcripts): + # print("\nTarget Transcription: %s\nOutput Transcription: %s" % + # (target, result)) + # print("Current error rate [%s] = %f" % + # (args.error_rate_type, error_rate_func(target, result))) + + # 
ds2_model.logger.info("finish inference") def main(): print_arguments(args) diff --git a/model_utils/network2.py b/model_utils/network2.py index 8cbbbf818..bab97a3cc 100644 --- a/model_utils/network2.py +++ b/model_utils/network2.py @@ -497,8 +497,6 @@ class DeepSpeech2(nn.Layer): share_rnn_weights=share_rnn_weights) self.fc = nn.Linear(rnn_size * 2, dict_size + 1) - self.loss = nn.CTCLoss(blank=dict_size, reduction='none') - def predict(self, audio, audio_len): # [B, D, T] -> [B, C=1, D, T] audio = audio.unsqueeze(1) @@ -534,14 +532,24 @@ class DeepSpeech2(nn.Layer): text_len: shape [B] """ logits, probs = self.predict(audio, audio_len) - # warp-ctc do softmax on activations - # warp-ctc need activation with shape [T, B, V + 1] - logits = logits.transpose([1, 0, 2]) print(logits.shape) print(text.shape) print(audio_len.shape) print(text_len.shape) + return logits + + +class DeepSpeechLoss(nn.Layer): + def __init__(self, vocab_size): + super().__init__() + self.loss = nn.CTCLoss(blank=vocab_size, reduction='none') + + def forward(self, logits, text, audio_len, text_len): + # warp-ctc do softmax on activations + # warp-ctc need activation with shape [T, B, V + 1] + logits = logits.transpose([1, 0, 2]) + ctc_loss = self.loss(logits, text, audio_len, text_len) ctc_loss /= text_len # norm_by_times ctc_loss = ctc_loss.sum() - return probs, ctc_loss + return ctc_loss diff --git a/requirements.txt b/requirements.txt index 8c57208a6..af2993b6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ scipy==1.2.1 -resampy==0.1.5 +resampy==0.2.2 SoundFile==0.9.0.post1 python_speech_features diff --git a/setup.sh b/setup.sh index 3827dc1b3..21d9c19ec 100644 --- a/setup.sh +++ b/setup.sh @@ -6,6 +6,10 @@ else SUDO='sudo' fi +if [ -e /etc/lsb-release ];then + ${SUDO} apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev +fi + # install python dependencies if [ -f "requirements.txt" ]; then pip3 install -r requirements.txt @@ -18,9 +22,6 @@ fi # install package libsndfile python3 -c "import soundfile" if [ $? != 0 ]; then - if [ -e /etc/lsb-release ];then - ${SUDO} apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev - fi echo "Install package libsndfile into default system path." wget "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz" if [ $? != 0 ]; then